# Libraries

In [1]:
import pandas as pd
import numpy as np
import string
import re
from wordcloud import WordCloud


from keras import models, layers, optimizers

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.decomposition import PCA

import nltk
from nltk.collocations import * 
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline



Using TensorFlow backend.


# Data extraction and preprocessing

In [2]:
# Load the wine reviews, using the first CSV column as the row index, and keep
# only rows that have both a description (model input) and a variety (label).
# The path uses a forward slash: the original 'data\w...' relied on the invalid
# escape sequence '\w' and only resolved to a file inside data/ on Windows.
data = (
    pd.read_csv('data/winemag-data-130k-v2.csv', index_col=0)
    .dropna(subset=['description', 'variety'])
)

X = data['description']
y = data['variety'].astype('category')

In [3]:
# Unique varieties, in order of first appearance.
# They are read from `data` rather than from `y`, because `y` is overwritten
# below with the one-hot matrix; reading `y.unique()` would make this cell fail
# when it is run a second time.
varieties = data['variety'].astype('category')
wine_varieties = list(varieties.unique())
n_labels = len(wine_varieties)

# One-hot encode the variety labels; `lb` is kept so predictions can be mapped
# back to variety names with lb.inverse_transform / lb.classes_.
lb = LabelBinarizer()
y = lb.fit_transform(varieties)

In [4]:
# Hold out 30% of the reviews for testing; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1988,
)

# EDA

In [5]:
# Tokenise every description: words of two or more letters that may contain
# one internal hyphen and/or apostrophe (e.g. "full-bodied", "don't").
pattern = "([A-Za-z]+-?'?[A-Za-z]+)"
texts_regex = [nltk.regexp_tokenize(text, pattern) for text in X]

# Stop words: English stop words, punctuation and the wine variety names.
stopwords_list = stopwords.words('english')
stopwords_list += list(string.punctuation)

# Variety names are mixed-case and often multi-word ("Pinot Noir"), while the
# tokens they are compared against are lowercase single words. Adding the raw
# names therefore never filtered anything, so each name is broken into
# lowercase tokens with the same pattern used for the descriptions.
# NOTE: this also removes generic words that occur inside variety names.
variety_tokens = sorted({
    token.lower()
    for variety in wine_varieties
    for token in nltk.regexp_tokenize(variety, pattern)
})
stopwords_list += variety_tokens

# Set for constant-time membership tests; `stopwords_list` stays a list
# because the vectorizers further down receive it as `stop_words`.
stopword_set = frozenset(stopwords_list)

# Lemmatisation. Tokens are lowercased first so that capitalised words
# (e.g. at the start of a sentence) are reduced like any other word.
lemmatizer = WordNetLemmatizer()
lemmatized_tokenized_texts = [
    [lemmatizer.lemmatize(word.lower()) for word in text]
    for text in texts_regex
]

# Drop stop words. Both the raw word and its lemma are checked: the lemmatizer
# rewrites some stop words (e.g. "was" becomes "wa"), which would otherwise
# slip past a lemma-only check.
final_tokenized_texts = [
    [
        lemma
        for word, lemma in zip(text, lemmas)
        if word.lower() not in stopword_set and lemma not in stopword_set
    ]
    for text, lemmas in zip(texts_regex, lemmatized_tokenized_texts)
]


# Vectorization

In [None]:
# TF-IDF vectorization of unigrams and bigrams.
# `input` is passed by keyword: the vectorizer parameters are keyword-only in
# current scikit-learn, so the positional form raises a TypeError there.
tfid_vectorizer = TfidfVectorizer(
    input='content',
    token_pattern=pattern,
    ngram_range=(1, 2),
    min_df=50,  # ignore terms that appear in fewer than 50 training documents
    stop_words=stopwords_list,
    max_features=None,  # TODO: decide whether to cap the vocabulary size
)

# Fit on the training split only, then reuse the fitted vocabulary and IDF
# weights for the test split.
X_train_tfid = tfid_vectorizer.fit_transform(X_train)
X_test_tfid = tfid_vectorizer.transform(X_test)

# Width of the feature matrix; used as the input size of the network.
n_features = X_train_tfid.shape[-1]

# NOTE: despite the name, `vocabulary_` maps each term to its column index in
# the feature matrix, not to a count.
token_counts = tfid_vectorizer.vocabulary_


In [None]:
# Bag-of-words (raw count) vectorization of unigrams and bigrams.
# `input` is passed by keyword: the vectorizer parameters are keyword-only in
# current scikit-learn, so the positional form raises a TypeError there.
count_vectorizer = CountVectorizer(
    input='content',
    token_pattern=pattern,
    ngram_range=(1, 2),
    stop_words=stopwords_list,
    min_df=50,  # ignore terms that appear in fewer than 50 training documents
    max_features=None,  # TODO: decide whether to cap the vocabulary size
)

# Fit on the training split only, then reuse the fitted vocabulary for the
# test split.
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

# NOTE: despite the name, `vocabulary_` maps each term to its column index in
# the count matrix, not to a count.
token_counts = count_vectorizer.vocabulary_


# Visualization

In [None]:
# Display the encoded label matrix produced by the LabelBinarizer.
y

In [None]:
# wordclouds
    # overall
    # by variety (TODO)

# Overall word cloud. The original cell read an undefined name (`BoW`), so it
# failed on a fresh kernel; the cloud is now built from the cleaned tokens
# produced in the EDA cell.
word_frequencies = FreqDist(
    word for text in final_tokenized_texts for word in text
)

wordcloud = WordCloud(width=800, height=800,
                      background_color='white',
                      min_font_size=10)
# The tokens are already lowercased, lemmatised and stop-word filtered, so the
# frequencies are passed in directly instead of re-parsing one joined string.
wordcloud.generate_from_frequencies(word_frequencies)

# plot the WordCloud image
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

In [None]:
# chart: absolute word count
# `token_counts` is the vectorizer vocabulary (term -> column index), a dict
# with no .min(); the real totals are the column sums of the count matrix.
_vocabulary = count_vectorizer.vocabulary_
word_counts = pd.Series(
    np.asarray(X_train_count.sum(axis=0)).ravel(),
    # terms ordered by their column index so they line up with the sums
    index=sorted(_vocabulary, key=_vocabulary.get),
)
word_counts.min()




In [None]:
# chart: number of documents per word

# chart: groupings per TF-IDF

# chart: wine variety groupings per TF-IDF


In [None]:
# Distribution of document frequency per term (number of training documents
# each vocabulary term appears in).
# The original plotted `vocabulary_.values()`, which are column indices: each
# index occurs exactly once, so that histogram carried no information.
x = np.asarray((X_train_count > 0).sum(axis=0)).ravel()

fig, ax = plt.subplots()
# Log-scaled counts keep the less populated bins visible.
ax.hist(x, bins=50, log=True)
ax.set(
    title='Document frequency of vocabulary terms',
    xlabel='Number of training documents containing the term',
    ylabel='Number of terms (log scale)',
)
plt.show()

# Classification

In [None]:
# Network architecture: a stack of fully connected (Dense) layers with dropout
# after each hidden layer. The first layer declares the input width
# (n_features); the last layer emits one softmax probability per wine variety.
model_1 = models.Sequential([
    layers.Dense(400, activation='relu', input_shape=(n_features,)),
    layers.Dropout(0.5),
    layers.Dense(200, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(200, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(n_labels, activation='softmax'),
])

# Compile for multi-class classification on one-hot labels.
model_1.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy'],
)

# Train the model. The batch size can be tuned: the network runs one forward
# and one backward pass per batch. 10% of the training data is held out for
# validation.
history_1 = model_1.fit(
    X_train_tfid,
    y_train,
    batch_size=5000,
    epochs=50,
    validation_split=0.1,
)

In [None]:
# Training vs. validation accuracy per epoch.
# Older Keras releases record the metric as 'acc'/'val_acc', newer ones as
# 'accuracy'/'val_accuracy'; use whichever key this run produced instead of
# failing with a KeyError.
training_log = history_1.history
acc_key = 'acc' if 'acc' in training_log else 'accuracy'

plt.figure()
plt.plot(training_log['val_' + acc_key], label='Validation Accuracy')
plt.plot(training_log[acc_key], label='Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# List the metric names recorded during training.
history_1.history.keys()