In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import RobustScaler

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords 
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize 

In [2]:
syns_raw = pd.read_csv("../raw_data/wine_synonyms_csv.csv", index_col=0)
syns_raw.head()

Unnamed: 0_level_0,NAME_ORIGINAL,NAME_SYNONYMS,NAME_DIRTY,NAME_ALL,TYPE,Blend
ID_original_Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
w003,Abouriou,,,Abouriou,Red,single_var
w004,Agiorgitiko,St. George,,"Agiorgitiko, St. George",Red,single_var
w005,Aglianico,,,Aglianico,Red,single_var
w006,Aidani,,,Aidani,White,single_var
w007,Airen,,,Airen,White,single_var


In [3]:
np.unique(syns_raw['TYPE'])

array(['Cider', 'Fortified', 'Red', 'Rose', 'Sparkling', 'Sparkling Red',
       'Varies', 'White'], dtype=object)

In [3]:
def clean_rawdata():
    '''Once called, reads the raw data and convert it in a Pandas Dataframe.
        Carries out the cleaning tasks and returns a clean Dataframe, also saves
        a csv file locally'''


    #Clean duplicates and deals with unmatched grape varieties
    df = pd.read_csv("../raw_data/wine_data_csv.csv", index_col=0)
    df.drop_duplicates(inplace=True)
    columns_to_drop=['designation','region_2','taster_name','taster_twitter_handle','region_1']
    df.drop(columns=columns_to_drop,inplace=True)
    #df.dropna(inplace=True)
    df.drop(df[df['type']=='delete'].index,inplace=True)
    # df.to_csv("./raw_data/preprocessed_data.csv", index=False) #backup file moved to after resolve_synonyms()
    return df

def resolve_synonyms(df:pd.DataFrame)->pd.DataFrame:
    '''Reads the original data and implement standardised names for grape
    varieties. The user does not know this happens in the background,
    recommendations are still provided in the original variety.'''
    # Loading wine synonyms input data from file created with unique grape for one or several wines
    syns_raw = pd.read_csv("../raw_data/wine_synonyms_csv.csv", index_col=0)
    # Extracting the single column from the synonyms files that has all synonyns for each row
    all_grape_names = syns_raw.NAME_ALL.str.split(', ')
    # The synonyms file has multiple rows (synonyms) that hasn't been unified (synonymised) properly.
    # Here a table of synonyms is created to consolidate all synonyms.
    syns = {}     # for each item: key will be main synonym, values will include all synonyms including the main one (used for key)
    for row in all_grape_names:
        flat_dict = [num for elem in list(syns.values()) for num in elem]
        # checking if synonyms in each row are already present is the dictionary being created
        # if not, it creates a key and values
        if any(grape in flat_dict for grape in row) == False:
            syns[row[0]] = row
        # if yes, adds the new synonyns that don't exist yet in the list
        else:
            res = next((sub for sub in syns if any(grape in syns[sub] for grape in row) == True), None)
            syns[res].extend([item for item in row if item not in syns[res]])
    del flat_dict
    # Using the unified synonym table, it populates the column for grape vaiety in the main data table
    # with the main synonyms for each grape variety
    #df['variety_adj'] = df['variety'].apply(lambda grape: next((key for key, value in syns.items() if grape in value), None))
    #df.drop(columns=['variety'], inplace=True)
    def match_grape(grape):
        for key, value in syns.items():
            if grape in value:
                return key
    df['variety_adj'] = df['variety'].apply(match_grape)

    # ESTA 2 SGTEs LINEAS LA PUSE YO
    df.drop(columns=['country', 'points', 'price', 'title', 'variety', 'winery', ], inplace=True)
    df = df[['description', 'variety_adj', 'type', 'province', 'region']]

    
    df.to_csv("../raw_data/preprocessed_data.csv", index=False)

    return df

In [4]:
df = clean_rawdata()

In [None]:
df.columns

In [5]:
df = resolve_synonyms(df)

In [7]:
df.head(3)

Unnamed: 0_level_0,description,variety_adj,type,province,region
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,"Aromas include tropical fruit, broom, brimston...",White Blend,white,Sicily & Sardinia,Etna
1,"This is ripe and fruity, a wine that is smooth...",Portuguese Red,red,Douro,Douro
2,"Tart and snappy, the flavors of lime flesh and...",Pinot Gris,white,Oregon,Willamette Valley


In [6]:
#from nltk.corpus import stopwords 
#import string
#from nltk.stem.wordnet import WordNetLemmatizer
#from nltk import word_tokenize 

def clean (text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ') # Remove Punctuation
    text = text.strip() ## remove whitespaces
    lowercased = text.lower() # Lower Case
    tokenized = word_tokenize(lowercased) # Tokenize
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    stop_words = set(stopwords.words('english')) # Make stopword list
    without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words
    lemma=WordNetLemmatizer() # Initiate Lemmatizer
    lemmatized = [lemma.lemmatize(word) for word in without_stopwords] # Lemmatize
    cleaned = ' '.join(lemmatized) # Join back to a string
    return cleaned

# Apply to all texts
df['description_clean'] = df.description.apply(clean)

df.head(3)

Unnamed: 0_level_0,description,variety_adj,type,province,region,description_clean
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,"Aromas include tropical fruit, broom, brimston...",White Blend,white,Sicily & Sardinia,Etna,aroma include tropical fruit broom brimstone d...
1,"This is ripe and fruity, a wine that is smooth...",Portuguese Red,red,Douro,Douro,ripe fruity wine smooth still structured firm ...
2,"Tart and snappy, the flavors of lime flesh and...",Pinot Gris,white,Oregon,Willamette Valley,tart snappy flavor lime flesh rind dominate gr...


In [17]:
df = df.head(200)
df

Unnamed: 0_level_0,description,variety_adj,type,province,region,description_clean
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,"Aromas include tropical fruit, broom, brimston...",White Blend,white,Sicily & Sardinia,Etna,aroma include tropical fruit broom brimstone d...
1,"This is ripe and fruity, a wine that is smooth...",Portuguese Red,red,Douro,Douro,ripe fruity wine smooth still structured firm ...
2,"Tart and snappy, the flavors of lime flesh and...",Pinot Gris,white,Oregon,Willamette Valley,tart snappy flavor lime flesh rind dominate gr...
3,"Pineapple rind, lemon pith and orange blossom ...",Riesling,white,Michigan,Lake Michigan Shore,pineapple rind lemon pith orange blossom start...
4,"Much like the regular bottling from 2012, this...",Pinot Noir,red,Oregon,Willamette Valley,much like regular bottling come across rather ...
...,...,...,...,...,...,...
195,"This is a mature, jammy wine with thick layers...",Prugnolo Gentile,red,Tuscany,Vino Nobile di Montepulciano,mature jammy wine thick layer plum prune raisi...
196,Pancole is a pretty expression of Tuscany's po...,Vernaccia,white,Tuscany,Vernaccia di San Gimignano,pancole pretty expression tuscany popular vern...
197,"This is a concentrated, fairly full and lush C...",Chenin Blanc,white,Western Cape,Western Cape,concentrated fairly full lush chenin rich toas...
198,"This is a soft, well-crafted wine from the eas...",Cabernet Sauvignon,red,California,Napa Valley,soft well crafted wine eastern side st helena ...


In [16]:
vectorizer_tfidf = TfidfVectorizer()
vect_tfidf = vectorizer_tfidf.fit_transform(df['description_clean'])
vect_tfidf

<200x1449 sparse matrix of type '<class 'numpy.float64'>'
	with 4867 stored elements in Compressed Sparse Row format>

In [18]:
vectorized_documents = pd.DataFrame(
    vect_tfidf.toarray(), 
    columns = vectorizer_tfidf.get_feature_names_out()
)
vectorized_documents

Unnamed: 0,abound,abrupt,acacia,acadia,accent,accented,accentuated,accessibility,accessible,account,...,yet,yield,young,yountville,youthfully,zealand,zest,zesty,zinfandel,zippy
0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
196,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
197,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
198,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
lda_model_tfidf = LatentDirichletAllocation(n_components=2)
Descr_Grouping_tfidf = lda_model_tfidf.fit(vectorized_documents)
Descr_Grouping_tfidf

In [22]:
Descr_Grouping_tfidf = lda_model_tfidf.transform(vectorized_documents)
Descr_Grouping_tfidf

array([[0.88584573, 0.11415427],
       [0.8907165 , 0.1092835 ],
       [0.88183544, 0.11816456],
       [0.89738909, 0.10261091],
       [0.20349258, 0.79650742],
       [0.90336104, 0.09663896],
       [0.89735841, 0.10264159],
       [0.87269069, 0.12730931],
       [0.88561711, 0.11438289],
       [0.88981353, 0.11018647],
       [0.89441545, 0.10558455],
       [0.89073981, 0.10926019],
       [0.8910464 , 0.1089536 ],
       [0.90148758, 0.09851242],
       [0.90177943, 0.09822057],
       [0.89373328, 0.10626672],
       [0.90131821, 0.09868179],
       [0.89382887, 0.10617113],
       [0.89089744, 0.10910256],
       [0.90804559, 0.09195441],
       [0.90381148, 0.09618852],
       [0.89398617, 0.10601383],
       [0.89051423, 0.10948577],
       [0.89258243, 0.10741757],
       [0.89064631, 0.10935369],
       [0.89793679, 0.10206321],
       [0.90213106, 0.09786894],
       [0.90536467, 0.09463533],
       [0.89529121, 0.10470879],
       [0.37304694, 0.62695306],
       [0.

In [23]:
Token_Weights_tfidf = pd.DataFrame(lda_model_tfidf.components_, columns = vectorizer_tfidf.get_feature_names_out())
Token_Weights_tfidf

Unnamed: 0,abound,abrupt,acacia,acadia,accent,accented,accentuated,accessibility,accessible,account,...,yet,yield,young,yountville,youthfully,zealand,zest,zesty,zinfandel,zippy
0,0.778332,0.774697,0.748393,0.713709,1.680057,1.198934,0.742066,0.745971,0.792971,0.726401,...,1.81768,0.728205,1.808904,0.989956,0.716622,0.558536,0.981358,1.479796,0.728504,0.742066
1,0.509794,0.508595,0.509462,0.509596,0.512784,0.510951,0.509945,0.509937,0.509554,0.508604,...,0.559498,0.508932,0.52148,0.512763,0.510456,0.881243,0.510533,0.578053,0.68754,0.509945


In [24]:
def print_topics(lda_model, vectorizer, top_words):
    # 1. TOPIC MIXTURE OF WORDS FOR EACH TOPIC
    topic_mixture = pd.DataFrame(
        lda_model.components_,
        columns = vectorizer.get_feature_names_out()
    )
    
    # 2. FINDING THE TOP WORDS FOR EACH TOPIC
    ## Number of topics
    n_components = topic_mixture.shape[0]

    ## Top words for each topic
    for topic in range(n_components):
        print("-"*10)
        print(f"For topic {topic}, here are the the top {top_words} words with weights:")

        topic_df = topic_mixture.iloc[topic]\
            .sort_values(ascending = False).head(top_words)
        
        print(round(topic_df,3))

In [26]:
print_topics(lda_model_tfidf, vectorizer_tfidf, 20)

----------
For topic 0, here are the the top 20 words with weights:
wine       10.377
flavor      9.241
fruit       8.898
aroma       8.191
palate      7.870
ripe        6.913
acidity     6.789
black       6.328
drink       6.271
finish      6.172
tannin      6.091
note        5.975
cherry      5.761
red         5.570
oak         5.460
berry       5.385
spice       5.202
soft        4.906
plum        4.747
offer       4.734
Name: 0, dtype: float64
----------
For topic 1, here are the the top 20 words with weights:
bottling      1.225
vineyard      1.136
tomato        1.023
new           0.984
leaf          0.974
known         0.969
blanc         0.948
keeping       0.928
salt          0.919
designates    0.902
defines       0.898
spiced        0.897
region        0.884
gooseberry    0.881
zealand       0.881
many          0.876
like          0.873
herbal        0.852
slate         0.846
short         0.831
Name: 1, dtype: float64


In [8]:
#from sklearn.decomposition import LatentDirichletAllocation
#from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.feature_extraction.text import TfidfVectorizer

# vectorizer_cnt = CountVectorizer()
# vect_cnt = vectorizer_cnt.fit_transform(df['description_clean'])
# lda_model_cnt = LatentDirichletAllocation(n_components=2)
# lda_model_cnt.fit
# Descr_Grouping_cnt = lda_model_cnt.fit_transform(vect_cnt)
# Token_Weights_cnt = pd.DataFrame(lda_model_cnt.components_, columns = vect_cnt.get_feature_names_out())

vectorizer_tfidf = TfidfVectorizer()
vect_tfidf = vectorizer_tfidf.fit_transform(df['description_clean'])
vect_tfidf = pd.DataFrame(vect_tfidf.toarray(), columns = vectorizer_tfidf.get_feature_names_out())
lda_model_tfidf = LatentDirichletAllocation(n_components=2, max_iter = 100)
lda_model_tfidf.fit(vect_tfidf)
Descr_Grouping_tfidf = lda_model_tfidf.transform(vect_tfidf)
#Token_Weights_tfidf = pd.DataFrame(lda_model_tfidf.components_, columns = vect_tfidf.get_feature_names_out())

# Tf-idf  -  term frequency - inverse document frequency
#     The intuition of Tf-idf is to give a high weight to any term which appears frequently
#     in a single document, but not in too many documents of the corpus.

# Check the maximun size of the tokenized vector

In [12]:
lda_model_tfidf.get_feature_names_out()

array(['latentdirichletallocation0', 'latentdirichletallocation1'],
      dtype=object)

In [13]:
Token_Weights_tfidf = pd.DataFrame(lda_model_tfidf.components_, columns = lda_model_tfidf.get_feature_names_out())

ValueError: Shape of passed values is (2, 1449), indices imply (2, 2)

In [14]:
lda_model_tfidf.components_

array([[0.77691175, 0.77385502, 0.74763894, ..., 1.54500946, 0.90046577,
        0.74076908],
       [0.51121406, 0.50943706, 0.51021641, ..., 0.51283976, 0.51557812,
        0.51124165]])

In [15]:
lda_model_tfidf.print_topics()

AttributeError: 'LatentDirichletAllocation' object has no attribute 'print_topics'

In [10]:
Descr_Grouping_cnt

array([[0.39642286, 0.60357714],
       [0.05814122, 0.94185878],
       [0.15372074, 0.84627926],
       ...,
       [0.03755896, 0.96244104],
       [0.03211926, 0.96788074],
       [0.04734801, 0.95265199]])

In [8]:
vectorizer_tfidf = TfidfVectorizer()
vect_tfidf = vectorizer_tfidf.fit_transform(df['description_clean'])
vect_tfidf = pd.DataFrame(vect_tfidf.toarray(), columns = vectorizer_tfidf.get_feature_names_out())
lda_model_tfidf = LatentDirichletAllocation(n_components=2, max_iter = 100)
lda_model_tfidf.fit(vect_tfidf)
Descr_Grouping_tfidf = lda_model_tfidf.transform(vect_tfidf)
Token_Weights_tfidf = pd.DataFrame(lda_model_tfidf.components_, columns = vect_tfidf.get_feature_names_out())

MemoryError: Unable to allocate 24.4 GiB for an array with shape (119986, 27282) and data type float64

In [None]:
#from sklearn.decomposition import LatentDirichletAllocation
#from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer_cnt = CountVectorizer()
vect_cnt = vectorizer_cnt.fit_transform(df['description_clean'])
lda_model_cnt = LatentDirichletAllocation(n_components=2)
Descr_Grouping_cnt = lda_model_cnt.fit_transform(vect_cnt)
Token_Weights_cnt = pd.DataFrame(lda_model_cnt.components_, columns = vect_cnt.get_feature_names_out())

vectorizer_tfidf = TfidfVectorizer()
vect_tfidf = vectorizer_tfidf.fit_transform(df['description_clean'])
lda_model_tfidf = LatentDirichletAllocation(n_components=2)
Descr_Grouping_tfidf = lda_model_tfidf.fit_transform(vect_tfidf)
Token_Weights_tfidf = pd.DataFrame(lda_model_tfidf.components_, columns = vect_tfidf.get_feature_names_out())
#vect_tfidf = pd.DataFrame(vect_tfidf.toarray(), columns = vectorizer.get_feature_names_out())

# Tf-idf  -  term frequency - inverse document frequency
#     The intuition of Tf-idf is to give a high weight to any term which appears frequently
#     in a single document, but not in too many documents of the corpus.

# Check the maximun size of the tokenized vector

In [None]:
Token_Weights_cnt = pd.DataFrame(lda_model_cnt.components_, columns = vect_cnt.get_feature_names_out())

In [None]:
vect_cnt