# Libraries

In [6]:
import pandas as pd
import numpy as np
import string
import re
from wordcloud import WordCloud


from keras import models, layers, optimizers

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.decomposition import PCA

import nltk
from nltk.collocations import * 
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline



# Data extraction and preprocessing

In [7]:
# Load the wine-review CSV; its first column is used as the row index.
# The path uses a forward slash: the previous 'data\w...' literal contained the
# invalid escape sequence '\w' (flagged by recent Python versions) and only
# resolved on Windows, whereas 'data/...' works on every platform.
data = pd.read_csv('data/winemag-data-130k-v2.csv', index_col=0)

# Rows missing either the review text or the target label cannot be used.
# Reassigning (rather than inplace=True) keeps the cell safe to re-run.
data = data.dropna(subset=['description', 'variety'])

X = data['description']                 # model input: free-text review
y = data['variety'].astype('category')  # classification target: variety

In [8]:
# Distinct varieties found in the target column, and how many there are.
wine_varieties = list(y.unique())
n_labels = len(wine_varieties)

# One-hot encode the target: fit the binarizer on the labels, then replace `y`
# with the resulting indicator matrix.
lb = LabelBinarizer().fit(y)
y = lb.transform(y)

In [9]:
# Hold out 30% of the reviews for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1988,
)

# EDA

In [10]:
# Tokenize: keep words of two or more letters, optionally containing one
# internal hyphen and/or apostrophe (e.g. "full-bodied", "it's").
# The regex already yields word tokens, so nltk.word_tokenize is not needed.
pattern = "([A-Za-z]+-?'?[A-Za-z]+)"
texts_regex = [nltk.regexp_tokenize(text, pattern) for text in X]

# Stop words: English stop words, punctuation, and the words that make up the
# wine variety names.
stopwords_list = stopwords.words('english')
stopwords_list += list(string.punctuation)

# Variety names are split with the same token pattern and lowercased before
# being added. Added verbatim (e.g. "Pinot Noir") they could never match:
# every comparison below, and in a lowercasing vectorizer, is made against
# single lowercase tokens.
# NOTE(review): this also removes generic words that occur in variety names
# (e.g. "red", "blend"); confirm that is acceptable for the analysis.
variety_tokens = {
    token.lower()
    for variety in wine_varieties
    for token in nltk.regexp_tokenize(variety, pattern)
}
stopwords_list += sorted(variety_tokens)

# Lemmatization. Tokens are lowercased first because WordNet lookups are done
# on lowercase forms; capitalised words would otherwise be returned unchanged.
lemmatizer = WordNetLemmatizer()
lemmatized_tokenized_texts = [
    [lemmatizer.lemmatize(w.lower()) for w in text]
    for text in texts_regex
]

# TODO: bigram collocation scoring (nltk BigramCollocationFinder) is not
# implemented yet.

# Drop stop words. A set gives constant-time membership tests; stopwords_list
# itself stays a list for any consumer that expects one.
stopword_set = set(stopwords_list)
final_tokenized_texts = [
    [w for w in text if w not in stopword_set]
    for text in lemmatized_tokenized_texts
]


# Vectorization

In [11]:
# TF-IDF vectorization
#
# Unigrams and bigrams are extracted with the shared token pattern; terms that
# appear in fewer than 50 training documents are discarded.
# `input` is passed by keyword: scikit-learn's vectorizer constructors are
# keyword-only in current releases, so a positional 'content' raises TypeError.
tfid_vectorizer = TfidfVectorizer(
    input='content',
    token_pattern=pattern,
    ngram_range=(1, 2),
    min_df=50,
    stop_words=stopwords_list,
    max_features=None,  # TODO: decide whether to cap the vocabulary size
)

# Fit the vocabulary and IDF weights on the training split only, then apply
# the fitted transform to the held-out split.
X_train_tfid = tfid_vectorizer.fit_transform(X_train)
X_test_tfid = tfid_vectorizer.transform(X_test)

# Number of columns in the TF-IDF matrix (one per vocabulary term).
n_features = X_train_tfid.shape[-1]

# NOTE: despite the name, vocabulary_ maps each term to its column index in
# the matrix; it does not hold occurrence counts.
token_counts = tfid_vectorizer.vocabulary_


  'stop_words.' % sorted(inconsistent))


In [12]:
# Bag-of-words (raw count) vectorization
#
# Same tokenization settings as the TF-IDF vectorizer: unigrams and bigrams,
# shared token pattern and stop words, terms kept only if they appear in at
# least 50 training documents.
# `input` is passed by keyword: scikit-learn's vectorizer constructors are
# keyword-only in current releases, so a positional 'content' raises TypeError.
count_vectorizer = CountVectorizer(
    input='content',
    token_pattern=pattern,
    ngram_range=(1, 2),
    stop_words=stopwords_list,
    min_df=50,
    max_features=None,  # TODO: decide whether to cap the vocabulary size
)

# Fit the vocabulary on the training split only, then apply it to the test split.
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

# NOTE: despite the name, vocabulary_ maps each term to its column index in
# the matrix; it does not hold occurrence counts.
token_counts = count_vectorizer.vocabulary_


  'stop_words.' % sorted(inconsistent))


# Visualization

In [None]:
# Word cloud over all reviews (per-variety clouds are still TODO).
#
# The text fed to the cloud is built from the cleaned, stop-word-filtered
# tokens produced during preprocessing. The previously referenced `BoW` name
# was never defined, so this cell raised NameError on a fresh kernel.
# NOTE(review): confirm the cleaned tokens are the intended input.
corpus_text = ' '.join(
    token
    for doc_tokens in final_tokenized_texts
    for token in doc_tokens
)

wordcloud = WordCloud(width=800, height=800,
                      background_color='white',
                      min_font_size=10)
wordcloud.generate(corpus_text)

# Plot the WordCloud image without axes or padding.
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

In [None]:
# chart: absolute word count
#
# `token_counts` is the vectorizer's vocabulary_ dict (term -> column index):
# it has no .min() method and holds no counts. The total number of occurrences
# of each term is the column sum of the training bag-of-words matrix.
term_totals = pd.Series(
    np.asarray(X_train_count.sum(axis=0)).ravel(),
    # Order the terms by column index so each label lines up with its column sum.
    index=sorted(count_vectorizer.vocabulary_, key=count_vectorizer.vocabulary_.get),
)

# Smallest total occurrence count among the retained terms.
term_totals.min()




In [None]:
# chart: number of documents per word

# chart: groupings per TF-IDF

# chart: wine variety groupings per TF-IDF


In [None]:
# Distribution of total occurrences per term in the training bag-of-words.
#
# The vocabulary_ values plotted previously are column indices
# (0 .. n_terms - 1); their distribution is uniform by construction and says
# nothing about word usage. Column sums of the count matrix give the actual
# per-term frequencies.
x = np.asarray(X_train_count.sum(axis=0)).ravel()
ax = sns.distplot(x)
ax.set(xlabel='Total occurrences of term', ylabel='Density')
plt.show()

# Classification

In [13]:
# Network architecture: a fully connected classifier over the TF-IDF features.
# Three ReLU hidden layers (700 -> 500 -> 300 units), each followed by 30%
# dropout, feed a softmax output with one unit per wine variety.
model_1 = models.Sequential([
    # Only the first layer needs the input width declared explicitly.
    layers.Dense(700, activation='relu', input_shape=(n_features,)),
    layers.Dropout(0.3),
    layers.Dense(500, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(300, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(n_labels, activation='softmax'),
])

# Compile: Adam optimizer, categorical cross-entropy loss, accuracy metric.
model_1.compile(
    optimizer='Adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train. The batch size is tunable: one forward/backward pass runs per batch.
# 10% of the training data is held out for validation.
history_1 = model_1.fit(
    X_train_tfid,
    y_train,
    epochs=50,
    batch_size=5000,
    validation_split=0.1,
)

Train on 81881 samples, validate on 9098 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Training vs. validation accuracy per epoch.
#
# Keras records the accuracy metric as 'acc' in older releases and 'accuracy'
# in newer ones; use whichever key this run produced instead of hard-coding
# one and hitting a KeyError on the other.
training_log = history_1.history
acc_key = 'acc' if 'acc' in training_log else 'accuracy'

plt.figure()
plt.plot(training_log['val_' + acc_key], label='Validation Accuracy')
plt.plot(training_log[acc_key], label='Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Show the names of the metrics recorded during training.
history_1.history.keys()