In [9]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
import string
import re
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.model_selection import train_test_split

from keras import models
from keras import layers
from keras import optimizers



Using TensorFlow backend.


In [2]:
# Load the wine reviews; the first CSV column is used as the row index.
# The path uses a forward slash: inside a normal string literal '\w' is an
# invalid escape sequence, and a backslash separator only resolves on Windows.
data = pd.read_csv('data/winemag-data-130k-v2.csv', index_col=0)
# Keep only rows that have both the review text and its label. Reassigning
# instead of using inplace=True keeps the statement free of hidden mutation.
data = data.dropna(subset=['description', 'variety'])

X = data['description']                  # review text (model input)
y = data['variety'].astype('category')   # grape variety (target)



In [3]:
# Number of distinct varieties present in the target column.
n_labels = y.nunique()
n_labels

707

In [4]:
# Encode the variety labels as one-hot rows.
from sklearn.preprocessing import LabelBinarizer

# The fitted binarizer is kept in `lb` so it stays available after encoding.
lb = LabelBinarizer().fit(y)
y = lb.transform(y)




In [5]:

# English stop words followed by every ASCII punctuation character, so that
# one list covers both kinds of token we want to discard.
stopwords_list = stopwords.words('english') + list(string.punctuation)


# Token regex: letters, an optional hyphen, an optional apostrophe, then
# letters again -- so a token is at least two letters long.
pattern = "([A-Za-z]+-?'?[A-Za-z]+)"

In [6]:
# Hold out half of the data as the test set, then set aside 20% of the
# remaining training half for validation. Both splits use the same seed.
split_seed = 1988
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=split_seed)
(X_train_final, X_train_val,
 y_train_final, y_train_val) = train_test_split(
    X_train, y_train, test_size=0.2, random_state=split_seed)

In [None]:
n_features = 2000
# TF-IDF features over unigrams and bigrams, capped at n_features terms.
# `input` is passed by keyword: the vectorizer constructors are keyword-only
# in current scikit-learn, so a positional 'content' raises a TypeError.
tfid_vectorizer = TfidfVectorizer(
    input='content',
    stop_words=stopwords_list,
    token_pattern=pattern,
    ngram_range=(1, 2),
    max_features=n_features  # what is the ideal number here?
)
# The vocabulary and idf weights are learned from all of X_train, which
# includes the rows that were split off as X_train_val.
tfid_vectors = tfid_vectorizer.fit_transform(X_train)
# NOTE(review): these two names are overwritten with their vectorized form,
# so this cell expects them to still hold raw text. Re-run the split cell
# before running this cell again or before switching to another vectorizer.
X_train_final = tfid_vectorizer.transform(X_train_final)
X_train_val = tfid_vectorizer.transform(X_train_val)


In [7]:
n_features = 2000
# Raw term counts over unigrams and bigrams, capped at n_features terms.
# `input` is passed by keyword: the vectorizer constructors are keyword-only
# in current scikit-learn, so a positional 'content' raises a TypeError.
count_vectorizer = CountVectorizer(
    input='content',
    stop_words=stopwords_list,
    token_pattern=pattern,
    ngram_range=(1, 2),
    max_features=n_features  # what is the ideal number here?
)

# fit() returns the vectorizer itself, so count_vector is the same object as
# count_vectorizer. The vocabulary is learned from all of X_train.
count_vector = count_vectorizer.fit(X_train)
# NOTE(review): these two names are overwritten with their vectorized form,
# so this cell expects them to still hold raw text. Re-run the split cell
# before running this cell again or before switching to another vectorizer.
X_train_final = count_vectorizer.transform(X_train_final)
X_train_val = count_vectorizer.transform(X_train_val)

In [None]:

# network architecture
model = models.Sequential()

# add layers
# Dense layers are fully connected; each hidden layer is followed by dropout.
# input_shape is only needed on the first layer.
model.add(layers.Dense(100, activation='relu', input_shape=(n_features,)))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(100, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(500, activation='relu'))
model.add(layers.Dropout(0.5))
# one softmax unit per variety
model.add(layers.Dense(n_labels, activation='softmax'))

# compile the model
# Categorical cross-entropy is the loss that pairs with a softmax output and
# one-hot labels; mean squared error is a regression loss and gives such a
# classifier very weak gradients.
model.compile(optimizer='Adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# train the model
# batch size can be tuned. The model will forward and backwards propagate once per batch
history = model.fit(X_train_final,
                    y_train_final,
                    epochs=300,
                    batch_size=10000,
                    validation_data=(X_train_val, y_train_val))


# useful attributes

#history.history #retrieves further information regarding how the model training progressed from epoch to epoch

# evaluation (on the training split; the validation metrics for every epoch
# are recorded in `history`)
model.evaluate(X_train_final, y_train_final)

In [10]:

# network architecture
model_2 = models.Sequential()

# add layers
# Dense layers are fully connected; each hidden layer is followed by dropout.
# input_shape is only needed on the first layer.
model_2.add(layers.Dense(200, activation='relu', input_shape=(n_features,)))
model_2.add(layers.Dropout(0.5))
model_2.add(layers.Dense(100, activation='relu'))
model_2.add(layers.Dropout(0.5))
model_2.add(layers.Dense(100, activation='relu'))
model_2.add(layers.Dropout(0.3))
# one softmax unit per variety
model_2.add(layers.Dense(n_labels, activation='softmax'))

# compile the model
# Categorical cross-entropy is the loss that pairs with a softmax output and
# one-hot labels; mean squared error is a regression loss and gives such a
# classifier very weak gradients.
model_2.compile(optimizer='Adam',
                loss='categorical_crossentropy',
                metrics=['accuracy'])

# train the model
# batch size can be tuned. The model will forward and backwards propagate once per batch
history_2 = model_2.fit(X_train_final,
                        y_train_final,
                        epochs=300,
                        batch_size=10000,
                        validation_data=(X_train_val, y_train_val))

Train on 51988 samples, validate on 12997 samples
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300


Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78/300
Epoch 79/300
Epoch 80/300
Epoch 81/300
Epoch 82/300
Epoch 83/300
Epoch 84/300
Epoch 85/300
Epoch 86/300
Epoch 87/300
Epoch 88/300
Epoch 89/300
Epoch 90/300
Epoch 91/300
Epoch 92/300
Epoch 93/300
Epoch 94/300
Epoch 95/300
Epoch 96/300
Epoch 97/300
Epoch 98/300
Epoch 99/300
Epoch 100/300
Epoch 101/300
Epoch 102/300
Epoch 103/300
Epoch 104/300
Epoch 105/300
Epoch 106/300
Epoch 107/300
Epoch 108/300
Epoch 109/300
Epoch 110/300
Epoch 111/300
Epoch 112/300
Epoch 113/300
Epoch 114/300
Epoch 115/300
Epoch 116/300
Epoch 117/300
Epoch 118/300
Epoch 119/300
Epoch 120/300
Epoch 121/300
Epoch 122/300
Epoch 123/300
Epoch 124/300
Epoch 125/300
Epoch 126/300
Epoch 127/300
Epoch 128/300
Epoch 129/300
Epoch 130/300
Epoch 131/300
Epoch 132/300
Epoch

Epoch 171/300
Epoch 172/300
Epoch 173/300
Epoch 174/300
Epoch 175/300
Epoch 176/300
Epoch 177/300
Epoch 178/300
Epoch 179/300
Epoch 180/300
Epoch 181/300
Epoch 182/300
Epoch 183/300
Epoch 184/300
Epoch 185/300
Epoch 186/300
Epoch 187/300
Epoch 188/300
Epoch 189/300
Epoch 190/300
Epoch 191/300
Epoch 192/300
Epoch 193/300
Epoch 194/300
Epoch 195/300
Epoch 196/300
Epoch 197/300
Epoch 198/300
Epoch 199/300
Epoch 200/300
Epoch 201/300
Epoch 202/300
Epoch 203/300
Epoch 204/300
Epoch 205/300
Epoch 206/300
Epoch 207/300
Epoch 208/300
Epoch 209/300
Epoch 210/300
Epoch 211/300
Epoch 212/300
Epoch 213/300
Epoch 214/300
Epoch 215/300
Epoch 216/300
Epoch 217/300
Epoch 218/300
Epoch 219/300
Epoch 220/300
Epoch 221/300
Epoch 222/300
Epoch 223/300
Epoch 224/300
Epoch 225/300


Epoch 226/300
Epoch 227/300
Epoch 228/300
Epoch 229/300
Epoch 230/300
Epoch 231/300
Epoch 232/300
Epoch 233/300
Epoch 234/300
Epoch 235/300
Epoch 236/300
Epoch 237/300
Epoch 238/300
Epoch 239/300
Epoch 240/300
Epoch 241/300
Epoch 242/300
Epoch 243/300
Epoch 244/300
Epoch 245/300
Epoch 246/300
Epoch 247/300
Epoch 248/300
Epoch 249/300
Epoch 250/300
Epoch 251/300
Epoch 252/300
Epoch 253/300
Epoch 254/300
Epoch 255/300
Epoch 256/300
Epoch 257/300
Epoch 258/300
Epoch 259/300
Epoch 260/300
Epoch 261/300
Epoch 262/300
Epoch 263/300
Epoch 264/300
Epoch 265/300
Epoch 266/300
Epoch 267/300
Epoch 268/300
Epoch 269/300
Epoch 270/300
Epoch 271/300
Epoch 272/300
Epoch 273/300
Epoch 274/300
Epoch 275/300
Epoch 276/300
Epoch 277/300
Epoch 278/300
Epoch 279/300
Epoch 280/300
Epoch 281/300
Epoch 282/300
Epoch 283/300
Epoch 284/300
Epoch 285/300
Epoch 286/300
Epoch 287/300
Epoch 288/300
Epoch 289/300
Epoch 290/300
Epoch 291/300
Epoch 292/300
Epoch 293/300
Epoch 294/300
Epoch 295/300
Epoch 296/300
Epoch 

In [None]:
from sklearn.preprocessing import StandardScaler

n_features = 2000
# TF-IDF features over unigrams and bigrams, capped at n_features terms.
# `input` is passed by keyword: the vectorizer constructors are keyword-only
# in current scikit-learn, so a positional 'content' raises a TypeError.
tfid_vectorizer = TfidfVectorizer(
    input='content',
    stop_words=stopwords_list,
    token_pattern=pattern,
    ngram_range=(1, 2),
    max_features=n_features  # what is the ideal number here?
)
tfid_vectors = tfid_vectorizer.fit_transform(X_train)
# NOTE(review): these two names are overwritten with their vectorized form,
# so this cell expects them to still hold raw text. Re-run the split cell
# before running this cell if another vectorizer cell has already run.
X_train_final = tfid_vectorizer.transform(X_train_final)
X_train_val = tfid_vectorizer.transform(X_train_val)


# with_mean=False: the TF-IDF output is a sparse matrix and StandardScaler
# refuses to centre sparse input, so only the variance is scaled.
ss = StandardScaler(with_mean=False)
tfid_vectors_scaled = ss.fit_transform(tfid_vectors)
X_train_final_scaled = ss.transform(X_train_final)
# Scale the validation split under the name that fit() reads below.
X_train_val_scaled = ss.transform(X_train_val)

# network architecture
model_3 = models.Sequential()

# add layers
# Dense layers are fully connected; each hidden layer is followed by dropout.
# input_shape is only needed on the first layer.
model_3.add(layers.Dense(200, activation='relu', input_shape=(n_features,)))
model_3.add(layers.Dropout(0.5))
model_3.add(layers.Dense(100, activation='relu'))
model_3.add(layers.Dropout(0.5))
model_3.add(layers.Dense(100, activation='relu'))
model_3.add(layers.Dropout(0.3))
# one softmax unit per variety
model_3.add(layers.Dense(n_labels, activation='softmax'))

# compile the model
# Categorical cross-entropy is the loss that pairs with a softmax output and
# one-hot labels; mean squared error is a regression loss and gives such a
# classifier very weak gradients.
model_3.compile(optimizer='Adam',
                loss='categorical_crossentropy',
                metrics=['accuracy'])

# train the model
# batch size can be tuned. The model will forward and backwards propagate once per batch
history_3 = model_3.fit(X_train_final_scaled,
                        y_train_final,
                        epochs=300,
                        batch_size=10000,
                        validation_data=(X_train_val_scaled, y_train_val))

In [None]:
# Check the distribution of the TF-IDF weights and of the absolute term frequencies; it might help in deciding n_features.

# try with simple frequency

In [None]:

# plot the frequency of the wine varieties
# plot scatter points for the word frequencies, with a colormap or dot size to show the scale of the TF-IDF weight (uniqueness),
    # for the most common wines
# plot the clusters for the optimized k-nearest groups


# how well does a deep neural network predict based only on relative word frequencies?
# how well can a deep network predict the rating? (based on sentiment, words?)

# then try to connect sentiment analysis
    # compare the sentiment analysis with the rating given by the critic
    
# compare to a recommendation system
    # if you believe you have a similar taste to that of a certain critic, these are other wines you may like:...

# other possible insights:
# correlation between critics opinion and price

In [None]:
# Merge all reviews of each variety into a single "text" and run TF-IDF on it. Then try to predict the variety from the score alone.

In [None]:


# Tokenize every review with the shared regex pattern.
texts_regex = [nltk.regexp_tokenize(text, pattern) for text in X]

# if using regex, no need to use word_tokenize ******
#texts_tokens = [word_tokenize(' '.join(text)) for text in texts_regex]

# Lowercase BEFORE lemmatizing and stop-word filtering: stopwords_list is
# all lowercase, so testing the original casing lets capitalised stop words
# ("The", "This", ...) slip through.
lowercased_texts = [[w.lower() for w in text] for text in texts_regex]

# lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokenized_texts = [
    [lemmatizer.lemmatize(w) for w in text]
    for text in lowercased_texts
]

# Set membership is O(1); the list would be scanned once per token.
stopword_set = set(stopwords_list)

# Drop a token when either its surface form or its lemma is a stop word, so
# the result does not depend on what the lemmatizer does to a stop word.
final_tokenized_texts = [
    [lemma for w, lemma in zip(text, lemmas)
     if w not in stopword_set and lemma not in stopword_set]
    for text, lemmas in zip(lowercased_texts, lemmatized_tokenized_texts)
]


In [None]:
# full vocabulary: every distinct token that occurs in any document
full_vocab = list(
    {token for tokens in final_tokenized_texts for token in tokens}
)

In [None]:
# One FreqDist per document, in the same order as final_tokenized_texts.
freq_dist_tokens = [FreqDist(tokens) for tokens in final_tokenized_texts]

# get the 200 most common words 
#most_common = freqdist.most_common(200)

In [None]:
# vectorization

def count_vectorize(text, vocab=None):
    """Count how often each vocabulary word occurs in a tokenized text.

    Parameters
    ----------
    text : iterable of str
        The tokens of one document.
    vocab : iterable of str, optional
        Words to count. When omitted (or empty) the vocabulary is the set of
        distinct tokens in ``text``.

    Returns
    -------
    dict
        Maps every vocabulary word to its count in ``text``; vocabulary
        words that never occur map to 0.
    """
    if vocab:
        unique_words = vocab
    else:
        unique_words = set(text)

    text_dict = dict.fromkeys(unique_words, 0)

    for word in text:
        # Tokens outside the vocabulary are skipped; indexing them directly
        # raised KeyError whenever a fixed vocab did not cover the text.
        if word in text_dict:
            text_dict[word] += 1

    return text_dict

In [None]:
#count_vectors = [count_vectorize(text, vocab=full_vocab) for text in final_tokenized_texts]

#TfidfVectorizer(input='content',analyzer='word', vocabulary=) # add tokenizer maybe?


# TF-IDF features over unigrams and bigrams, capped at 2000 terms.
# `input` is passed by keyword: the vectorizer constructors are keyword-only
# in current scikit-learn, so a positional 'content' raises a TypeError.
tfid_vectorizer = TfidfVectorizer(
    input='content',
    stop_words=stopwords_list,
    token_pattern=pattern,
    ngram_range=(1, 2),
    max_features=2000  # what is the ideal number here?
)
tfid_vectors = tfid_vectorizer.fit_transform(X_train)


In [None]:
# Dimensions of the TF-IDF matrix: one row per document in X_train, one
# column per term kept by the vectorizer.
tfid_vectors.shape

In [None]:
# network architecture: a small baseline with a single hidden layer
model = models.Sequential()

# add layers
# Dense layers are fully connected. input_shape must be passed as a keyword
# (the previous `input_shape(2000,)` was a call to an undefined name), and
# the output layer needs one softmax unit per variety, not a fixed 10.
model.add(layers.Dense(20, activation='relu', input_shape=(n_features,)))
model.add(layers.Dense(n_labels, activation='softmax'))

# compile the model
# Categorical cross-entropy is the loss that pairs with a softmax output and
# one-hot labels; mean squared error is a regression loss.
model.compile(optimizer='SGD',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# train the model
# batch size can be tuned. The model will forward and backwards propagate once per batch
# NOTE(review): the original referenced x_train / x_val / y_val, which are
# not defined anywhere in this notebook. The vectorized train/validation
# split used by the other model cells is assumed here -- confirm.
history = model.fit(X_train_final,
                    y_train_final,
                    epochs=20,
                    batch_size=512,
                    validation_data=(X_train_val, y_train_val))


# useful attributes

history.history #retrieves further information regarding how the model training progressed from epoch to epoch

# evaluation (on the training split; X_train_labels was undefined, so the
# labels that belong to X_train_final are used)
model.evaluate(X_train_final, y_train_final)