In [1]:
# IMPORTING THE NECESSARY PACKAGES AND FUNCTIONS:

# generic:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import time 

# more specific:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer # Normalize samples individually to unit norm.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler # used to compute stacked percentages barplots
import pickle # to save models, for instance LDA outputs

# NLP:
import spacy # version 2.2.1 
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import word2vec
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD # LSA
from sklearn.decomposition import NMF # NMF
from sklearn.decomposition import LatentDirichletAllocation # LDA
import pyLDAvis # used to visualize and plot ouptut of LDA
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()


In [2]:
winedata = pd.read_csv('../data/winedata_processed_and_tokenized.csv')
print(winedata.shape)
winedata.head()

(106873, 15)


Unnamed: 0,index,country,description,designation,points,price,province,region_1,region_2,taster_name,title,variety,winery,tokenized_descriptions,token_descr_as_string
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,"['aromas', 'include', 'tropical', 'fruit', 'br...",aromas include tropical fruit broom brimstone ...
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,"['ripe', 'fruity', 'smooth', 'structured', 'fi...",ripe fruity smooth structured firm tannins fil...
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,"['tart', 'snappy', 'lime', 'flesh', 'rind', 'd...",tart snappy lime flesh rind dominate green pin...
3,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,"['like', 'regular', 'bottling', 'comes', 'roug...",like regular bottling comes rough tannic rusti...
4,7,France,This dry and restrained wine offers spice in p...,,87,24.0,Alsace,Alsace,,Roger Voss,Trimbach 2012 Gewurztraminer (Alsace),Gewürztraminer,Trimbach,"['dry', 'restrained', 'offers', 'spice', 'prof...",dry restrained offers spice profusion balanced...


In [4]:
def print_wines(wine_indexes):
    """
        a generic function to print the most important feature of a or several wine(s).
        from their index(es): ! wine_indexes must be a list
    """
    # print most important characteristic of wine(s),
    # from their index(es).
    # ! wine_indexes must be a list
    for index_ in wine_indexes:
        current_wine = winedata.loc[index_, :]
        print(current_wine[['title', 'country', 'province', 'region_1', 'variety', 'taster_name']])
        print('desciption: ', current_wine.description)
        print('tokens: ', current_wine.tokenized_descriptions)
        print('------------------------------------------------------\n')

In [3]:
# vectorization using BoW :
time0 = time.time()

# note: we only keep 2000 features:
bow_vectorizer = CountVectorizer(ngram_range=(1,1), # 1-gram
                               max_df=0.95, # ignore t that have a df higher than max_df (corpus-specific stopwords)
                               min_df=10, # ignore terms that have a doc freq lower than threshold.
                               max_features=2000, # max number of features
                              )
# Applying the vectorizer on the "clean" descriptions:
wine_bow = bow_vectorizer.fit_transform(winedata.token_descr_as_string)

# list of features
terms = bow_vectorizer.get_feature_names()

# store the features in a dataframe:
bow_features = pd.DataFrame(wine_bow.toarray(), columns=terms)

print('Done! Vectorization took', time.time()-time0, 'seconds.')

Done! Vectorization took 2.8347349166870117 seconds.


In [13]:
domain_specific_stopwords = ['$', ' ', '’s', 'wine', 'winemaker', 'winemaking', 'winery',
                             'château', 'village', 'domaine',
                             'côte', 'saint', 'village', 'parcel', 'parcels',
                             'I', 'flavors', 'vineyard', 'vintage',
                             'now-2015', 'now-2018', 'now-2025']
# Other potential words to consider removing:
# douro, nacional, widely, muscat, willamette, bordeaux, pommard, rioja, barbaresco, chianti...

# a list of all wine varieties. this is not perfect as some varieties are compound words. But it's a start.
variety_stopwords = winedata.variety.unique().tolist()
variety_stopwords = [str(variety).lower() for variety in variety_stopwords] # convert to lowercase
# we add some notable varieties to the list:
variety_stopwords = variety_stopwords + ['pinot', 'gris', 'noir', 'grigio', 'cabernet', 
                                         'cabernets', 'sauvignon', 'sirah', 'tempranillo', 
                                         'chenin', 'sangiovese', 'grüner', 'veltliner', 'corvina', 
                                         'rondinella', 'molinara', 'franc', 'blanc', 'blend',
                                         'franc', 'mourvèdre']

# a list of all wine provinces. 
province_stopwords = winedata.province.unique().tolist()
province_stopwords = [str(province).lower() for province in province_stopwords] # convert to lowercase
# we add some notable province-related words to the list:
province_stopwords = province_stopwords + ['france', 'sicily', 'sardinia', 'mendoza', 'spain', 
                                           'australia', 'italy', 'loire', 'beaujolais']

# Standard model used for tokenization.
# When using another model, we will have to load it and redefine "nlp"

# We do not need word vectors here, so we can upload the small English model from spaCy:
# We do not need PoS tags, dependencies or named entities, so we disable them (run faster!):
nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'ner'])

def tokenize_and_clean(description):
    """
        Our basic tokenizer function. It takes as input:
            - a pd.Series of the descriptions
            - a nlp model
        After tokenizing, it cleans the data too,
        and returns a list of tokens. 
    """
    
#     # eventually clean the description with regular expressions here:
#     pattern = "[\[].*?[\]]" # an example of an regex
#     description = re.sub(pattern, "", description) # remove this pattern from description.

    # Tokenize:
    mytokens = nlp(description)

    # Removing stopwords, punctuation, numbers and convert to lower_case
    mytokens = [token.lower_ for token in mytokens if not token.is_punct and not token.is_stop and not token.like_num and not token.is_digit]
    
    # remove domain-specific stopwords:
    mytokens = [token for token in mytokens if token not in domain_specific_stopwords]
            
    # remove wine variety occurrences in the descriptions:
    mytokens = [token for token in mytokens if token not in variety_stopwords]
    
    # remove wine province occurrences in the descriptions:
    mytokens = [token for token in mytokens if token not in province_stopwords]
                
    # Return preprocessed list of tokens
    return mytokens

def concatenate_list_of_words_in_one_string(list_of_words):
    """
        build one string with a list of words, with a space in the middle
    """
    return " ".join(list_of_words)

In [17]:
# targetwine1 = winedata.iloc[14,:] # this wine is the input to the recommender system
# target1 = tokenize_and_clean(targetwine1.description) # tokenize and clean
# target1 = concatenate_list_of_words_in_one_string(target1) # need 1 string as input to BoW vectorizer
# target1 = bow_vectorizer.fit_transform([target1])
# target1

ValueError: max_df corresponds to < documents than min_df

In [24]:
wine_vectors = bow_features.copy()
target1 = wine_vectors.iloc[14,:]
print(wine_vectors.shape)
wine_vectors.drop(14, inplace=True)
print(wine_vectors.shape)
target1

(106873, 2000)
(106872, 2000)


10       0
2015     0
2016     0
2017     0
2018     0
        ..
zest     0
zesty    0
zin      0
zingy    0
zippy    0
Name: 14, Length: 2000, dtype: int64

## Similarity of target with full vectorized dataset

In [29]:
cos_sim = cosine_similarity(target1.values.reshape(-1, 1), bow_features)

AttributeError: 'Series' object has no attribute 'to_array'

In [39]:
target1.values.reshape(-1, 1)

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])