In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import RobustScaler

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords 
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize 

In [2]:
# Load the grape-synonym table (first CSV column becomes the index) and
# preview its first rows.
syns_raw = pd.read_csv("../raw_data/wine_synonyms_csv.csv", index_col=0)
syns_raw.head()

Unnamed: 0_level_0,NAME_ORIGINAL,NAME_SYNONYMS,NAME_DIRTY,NAME_ALL,TYPE,Blend
ID_original_Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
w003,Abouriou,,,Abouriou,Red,single_var
w004,Agiorgitiko,St. George,,"Agiorgitiko, St. George",Red,single_var
w005,Aglianico,,,Aglianico,Red,single_var
w006,Aidani,,,Aidani,White,single_var
w007,Airen,,,Airen,White,single_var


In [None]:
# List the distinct values found in the TYPE column of the synonym table.
# NOTE(review): np.unique sorts its input, so a missing (NaN) TYPE mixed with
# strings would presumably fail to compare — confirm the column has no NaN.
np.unique(syns_raw['TYPE'])

In [3]:
def clean_rawdata(path="../raw_data/wine_data_csv.csv"):
    '''Read the raw wine data and return a cleaned Pandas DataFrame.

    Cleaning steps:
      1. drop fully duplicated rows;
      2. drop the columns that are not used downstream
         (designation, region_1, region_2, taster_name, taster_twitter_handle);
      3. drop the rows whose ``type`` is ``'delete'``.

    Nothing is written to disk here; the backup csv is saved by
    resolve_synonyms().

    Parameters
    ----------
    path : str, optional
        Location of the raw csv file. Its first column is used as the index.

    Returns
    -------
    pd.DataFrame
        A new, independent DataFrame holding the cleaned data.
    '''
    columns_to_drop = ['designation', 'region_2', 'taster_name',
                       'taster_twitter_handle', 'region_1']

    raw = pd.read_csv(path, index_col=0)
    deduplicated = raw.drop_duplicates().drop(columns=columns_to_drop)

    # Filter with a boolean mask instead of dropping by index label: dropping
    # by label would also remove valid rows that happen to share their index
    # label with a 'delete' row.
    keep = deduplicated['type'] != 'delete'

    # .copy() so that callers can add columns without a SettingWithCopyWarning.
    return deduplicated.loc[keep].copy()

def _build_synonym_index(all_grape_names):
    '''Map every grape name to the main name of its synonym group.

    ``all_grape_names`` is an iterable of lists of names; each list holds names
    that are synonyms of each other. A list that shares no name with the groups
    seen so far opens a new group whose main name is its first element.
    Otherwise its names join the earliest-created group that already contains
    one of them.
    '''
    group_rank = {}    # main name -> creation order of its group
    name_to_main = {}  # grape name -> main name of the earliest group holding it
    for row in all_grape_names:
        if not row:
            continue
        matched = {name_to_main[grape] for grape in row if grape in name_to_main}
        if matched:
            # Several groups may match; the earliest-created one wins.
            main = min(matched, key=group_rank.__getitem__)
        else:
            main = row[0]
            group_rank[main] = len(group_rank)
        # Every name of the row now belongs to `main`. No name of the row can
        # already belong to an earlier group, otherwise that group would have
        # been selected above.
        for grape in row:
            name_to_main[grape] = main
    return name_to_main


def resolve_synonyms(df: pd.DataFrame,
                     synonyms_path: str = "../raw_data/wine_synonyms_csv.csv",
                     output_path="../raw_data/preprocessed_data.csv") -> pd.DataFrame:
    '''Standardise grape variety names using the synonyms table.

    Each value of ``df['variety']`` is replaced by the main name of its synonym
    group and stored in a ``variety_adj`` column (None when the variety is not
    in the synonyms table). The user does not know this happens in the
    background.

    Parameters
    ----------
    df : pd.DataFrame
        Cleaned wine data. Must contain the columns 'description', 'variety',
        'type', 'province' and 'region'. It is NOT modified.
    synonyms_path : str, optional
        csv file with a ``NAME_ALL`` column listing, per row, all synonyms of a
        grape separated by ', '. Rows with a missing ``NAME_ALL`` are ignored.
    output_path : str or None, optional
        Where the result is saved as csv (without index). Pass None to skip
        saving.

    Returns
    -------
    pd.DataFrame
        New DataFrame with the columns
        ['description', 'variety_adj', 'type', 'province', 'region'].
    '''
    # Load the synonyms table: one row per grape, NAME_ALL holds all its names.
    syns_raw = pd.read_csv(synonyms_path, index_col=0)
    all_grape_names = syns_raw['NAME_ALL'].dropna().str.split(', ')

    # The synonyms file has rows that were not unified properly, so the rows
    # are consolidated into a single name -> main name lookup.
    name_to_main = _build_synonym_index(all_grape_names)

    # assign() returns a new frame, so the caller's DataFrame stays untouched.
    resolved = df.assign(
        variety_adj=df['variety'].apply(lambda grape: name_to_main.get(grape))
    )

    # Keep only the columns needed downstream (this also removes 'variety',
    # 'country', 'points', 'price', 'title' and 'winery'). The copy avoids a
    # SettingWithCopyWarning when columns are added to the result later.
    resolved = resolved[['description', 'variety_adj', 'type', 'province', 'region']].copy()

    if output_path is not None:
        resolved.to_csv(output_path, index=False)

    return resolved

In [4]:
# Load the raw wine data and apply the basic cleaning steps.
df = clean_rawdata()

In [5]:
# Inspect which columns are present after cleaning.
df.columns

Index(['country', 'description', 'points', 'price', 'province', 'title',
       'variety', 'winery', 'variety_adj', 'type', 'region'],
      dtype='object')

In [6]:
# Standardise the grape variety names. This cell rebinds `df` to a frame that
# no longer has a 'variety' column, so re-run the clean_rawdata() cell before
# running this one again.
df = resolve_synonyms(df)

In [7]:
# Preview the first three rows after synonym resolution.
df.head(3)

Unnamed: 0_level_0,description,variety_adj,type,province,region
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,"Aromas include tropical fruit, broom, brimston...",White Blend,white,Sicily & Sardinia,Etna
1,"This is ripe and fruity, a wine that is smooth...",Portuguese Red,red,Douro,Douro
2,"Tart and snappy, the flavors of lime flesh and...",Pinot Gris,white,Oregon,Willamette Valley


In [8]:
#from nltk.corpus import stopwords 
#import string
#from nltk.stem.wordnet import WordNetLemmatizer
#from nltk import word_tokenize 

def clean(text):
    '''Normalise a free-text description into a space-separated string of lemmas.

    Steps: replace every punctuation character by a space, strip, lower-case,
    tokenize with ``word_tokenize``, keep purely alphabetic tokens, remove
    English stop words, lemmatize the remaining tokens with
    ``WordNetLemmatizer`` and join them back with single spaces.

    Parameters
    ----------
    text : str
        Raw description. Any non-string value (e.g. NaN for a missing
        description) yields an empty string.

    Returns
    -------
    str
        The cleaned text.
    '''
    if not isinstance(text, str):
        return ''

    # The stop-word set, the lemmatizer and the translation table never change
    # between calls, so they are built once and cached on the function instead
    # of being rebuilt for every row passed through .apply().
    resources = getattr(clean, '_resources', None)
    if resources is None:
        resources = (
            frozenset(stopwords.words('english')),
            WordNetLemmatizer(),
            str.maketrans(string.punctuation, ' ' * len(string.punctuation)),
        )
        clean._resources = resources
    stop_words, lemmatizer, punctuation_to_space = resources

    # Punctuation -> spaces, trim, lower-case.
    lowercased = text.translate(punctuation_to_space).strip().lower()

    # Keep alphabetic tokens only (drops numbers) and remove stop words.
    kept = [
        word for word in word_tokenize(lowercased)
        if word.isalpha() and word not in stop_words
    ]

    return ' '.join(lemmatizer.lemmatize(word) for word in kept)

# Apply to all texts
df['description_clean'] = df.description.apply(clean)

df.head(3)

Unnamed: 0_level_0,description,variety_adj,type,province,region,description_clean
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,"Aromas include tropical fruit, broom, brimston...",White Blend,white,Sicily & Sardinia,Etna,aroma include tropical fruit broom brimstone d...
1,"This is ripe and fruity, a wine that is smooth...",Portuguese Red,red,Douro,Douro,ripe fruity wine smooth still structured firm ...
2,"Tart and snappy, the flavors of lime flesh and...",Pinot Gris,white,Oregon,Willamette Valley,tart snappy flavor lime flesh rind dominate gr...


In [9]:
# Continue with the first 200 rows only.
# NOTE(review): presumably a prototyping shortcut to keep the vectorisation and
# LDA fit fast — confirm before using the results on the full data set.
df = df.head(200)

In [17]:
# Tf-idf  -  term frequency - inverse document frequency
#     The intuition of Tf-idf is to give a high weight to any term which appears frequently
#     in a single document, but not in too many documents of the corpus.
vectorizer_tfidf = TfidfVectorizer()
vectorized_descr = vectorizer_tfidf.fit_transform(df['description_clean'])
# Dense DataFrame with one column per token, so the token names stay attached to the weights.
vectorized_descr = pd.DataFrame(vectorized_descr.toarray(), columns = vectorizer_tfidf.get_feature_names_out())

# LDA is fitted with a random initialisation: fix the seed so that the topics
# (and every output below) are reproducible across kernel restarts.
lda_model_tfidf = LatentDirichletAllocation(n_components=2, random_state=42)
lda_model_tfidf.fit(vectorized_descr)
Descr_Grouping_tfidf = lda_model_tfidf.transform(vectorized_descr)

# TODO: check the maximum size of the tokenized vector

In [18]:
# Output of lda_model_tfidf.transform: one row per description, one column per
# topic (2 topics were requested).
Descr_Grouping_tfidf

array([[0.88502254, 0.11497746],
       [0.14618399, 0.85381601],
       [0.88538316, 0.11461684],
       [0.88841028, 0.11158972],
       [0.11123014, 0.88876986],
       [0.88169606, 0.11830394],
       [0.8481962 , 0.1518038 ],
       [0.66045902, 0.33954098],
       [0.87540892, 0.12459108],
       [0.88020438, 0.11979562],
       [0.8498797 , 0.1501203 ],
       [0.86213902, 0.13786098],
       [0.84924117, 0.15075883],
       [0.14393323, 0.85606677],
       [0.8978647 , 0.1021353 ],
       [0.89717623, 0.10282377],
       [0.1522849 , 0.8477151 ],
       [0.11228147, 0.88771853],
       [0.87583391, 0.12416609],
       [0.86448881, 0.13551119],
       [0.88306881, 0.11693119],
       [0.14799428, 0.85200572],
       [0.8892001 , 0.1107999 ],
       [0.13852165, 0.86147835],
       [0.13650681, 0.86349319],
       [0.90074278, 0.09925722],
       [0.89323884, 0.10676116],
       [0.87807355, 0.12192645],
       [0.87442753, 0.12557247],
       [0.50019317, 0.49980683],
       [0.

In [19]:
# Put the fitted LDA components in a DataFrame: one row per topic, one column
# per token of the TF-IDF vocabulary.
Token_Weights_tfidf = pd.DataFrame(lda_model_tfidf.components_, columns = vectorizer_tfidf.get_feature_names_out())
Token_Weights_tfidf
# lda_model_tfidf.components_ holds the same values, but without the tokens as column names

Unnamed: 0,abound,abrupt,acacia,acadia,accent,accented,accentuated,accessibility,accessible,account,...,yet,yield,young,yountville,youthfully,zealand,zest,zesty,zinfandel,zippy
0,0.782382,0.507345,0.747354,0.508811,1.014745,1.152081,0.745182,0.742695,0.510806,0.539247,...,1.869589,0.504904,0.678797,0.772896,0.717765,0.775395,0.983665,1.549937,0.516993,0.745182
1,0.505744,0.775947,0.510502,0.714494,1.178096,0.557804,0.506828,0.513213,0.791719,0.695758,...,0.507588,0.732232,1.651587,0.729822,0.509313,0.664383,0.508226,0.507912,0.899051,0.506828


In [21]:
def print_topics(lda_model, vectorizer, top_words):
    '''Print, for every topic of a fitted model, its highest-weighted words.

    Parameters
    ----------
    lda_model : object exposing ``components_`` (one row of weights per topic)
    vectorizer : object exposing ``get_feature_names_out()`` (one name per
        column of ``components_``)
    top_words : int
        Number of words to show for each topic.

    Returns
    -------
    None. The words and their weights (rounded to 3 decimals) are printed.
    '''
    vocabulary = vectorizer.get_feature_names_out()
    weights_by_topic = pd.DataFrame(lda_model.components_, columns=vocabulary)

    separator = "-" * 10
    for topic in range(len(weights_by_topic)):
        print(separator)
        print(f"For topic {topic}, here are the the top {top_words} words with weights:")

        # Rank the words of this topic from the heaviest to the lightest.
        ranked = weights_by_topic.iloc[topic].sort_values(ascending=False)
        strongest = ranked.head(top_words)

        print(strongest.round(3))

In [15]:
# Show the 20 highest-weighted words of each LDA topic.
print_topics(lda_model_tfidf, vectorizer_tfidf, 20)

----------
For topic 0, here are the the top 20 words with weights:
lime         1.469
blanc        1.306
bottling     1.299
brisk        1.287
pair         1.277
rind         1.272
tart         1.168
tropical     1.125
citrus       1.098
green        1.050
add          1.046
new          1.038
cranberry    1.033
clean        1.012
overall      1.002
good         0.982
fermented    0.970
steel        0.963
stainless    0.963
riesling     0.959
Name: 0, dtype: float64
----------
For topic 1, here are the the top 20 words with weights:
wine       10.271
flavor      9.147
fruit       8.879
aroma       8.130
palate      7.818
ripe        6.913
acidity     6.607
black       6.333
drink       6.263
finish      6.242
tannin      6.120
note        5.910
cherry      5.779
red         5.592
oak         5.493
berry       5.386
spice       5.216
soft        4.914
offer       4.710
plum        4.700
Name: 1, dtype: float64
