In [1]:
import re

import numpy as np
import pandas as pd
import unidecode
from keras.models import Sequential
from keras.utils import to_categorical
#unaccented_string = unidecode.unidecode(accented_string)

In [133]:
# Read the raw wine-review CSV and discard columns that are not used later:
# the first column of the file (presumably a saved row index - confirm) and
# two named metadata fields.
raw_reviews_path = "data/winemag-data-130k-v2.csv"
data = pd.read_csv(raw_reviews_path, encoding='utf-8')
unused_columns = [data.columns[0], 'designation', 'taster_twitter_handle']
data = data.drop(unused_columns, axis=1)

In [134]:
def func(x):
    """Return ``x`` transliterated to ASCII and lower-cased when it is text.

    Args:
        x: A single cell value. Strings are passed through
            ``unidecode.unidecode`` and then lower-cased.

    Returns:
        The normalised string, or ``x`` unchanged when it is not a string
        (for example a numeric cell or a float NaN marking a missing value).
    """
    try:
        return unidecode.unidecode(x).lower()
    except (AttributeError, TypeError):
        # Only the failures produced by non-string input are treated as
        # "leave the value alone"; anything else (interrupts, a missing
        # import, ...) is allowed to surface instead of being hidden.
        return x
# Apply the ASCII/lower-case normalisation to every cell of every column;
# `func` leaves non-string cells untouched.
for col in data.columns:
    data[col] = data[col].apply(func)

# Clean the review text: hyphens become spaces first, so hyphenated words
# split into separate tokens, then every character that is not an ASCII letter
# or a space is removed.
# `lambda x:` is valid on both Python 2 and 3, whereas the parenthesised form
# `lambda(x):` is a SyntaxError on Python 3.
# NOTE(review): assumes every description is a string - a missing (NaN)
# description would make re.sub raise TypeError; confirm against the data.
data['description'] = data['description'].apply(
    lambda x: re.sub("[^a-zA-Z ]", "", re.sub("-", " ", x)))

In [135]:
def label_top_values(frame, column, top_n=30, other_label='other'):
    """Label ``frame[column]``, keeping only its most frequent values.

    Args:
        frame: DataFrame that contains ``column``.
        column: Name of the column to label.
        top_n: How many of the most frequent distinct values keep their own
            label.
        other_label: Label assigned to every remaining row.

    Returns:
        A Series aligned with ``frame`` holding the original value where it is
        one of the ``top_n`` most frequent values, and ``other_label``
        elsewhere. Missing values are not counted by ``value_counts`` and so
        are labelled ``other_label``.
    """
    top_values = frame[column].value_counts().index[:top_n]
    # Vectorised membership test instead of a Python-level check per row.
    return frame[column].where(frame[column].isin(top_values), other_label)


# Classification targets: the 30 most frequent varieties / provinces keep their
# own class, everything else is pooled into 'other'.
data['y_variety'] = label_top_values(data, 'variety')
data['y_province'] = label_top_values(data, 'province')

In [136]:
# Remove exact duplicate rows before splitting.
data = data.drop_duplicates()

# Training split: 100000 rows drawn without replacement with a fixed seed.
trainRaw = data.sample(n=100000, random_state=1)

# Rows not selected for training form the hold-out pool ...
held_out_mask = ~data.index.isin(trainRaw.index)
test_val = data.loc[held_out_mask]

# ... which is divided in half: one half for testing, the rest for validation.
testRaw = test_val.sample(frac=0.5, random_state=1)
valRaw = test_val.loc[~test_val.index.isin(testRaw.index)]

In [110]:
# Write the training and test splits to disk, one JSON record per row.
for split_path, split_frame in (("train.json", trainRaw), ("test.json", testRaw)):
    split_frame.to_json(split_path, orient='records')

# NOTE(review): this recomputes valRaw exactly as the split cell already does;
# the validation split itself is not written to disk here - confirm whether an
# export of valRaw was intended instead.
valRaw = test_val.loc[~test_val.index.isin(testRaw.index)]

In [137]:
# Embedding vectors, one row per vocabulary entry (the file name suggests
# GloVe vectors - confirm provenance).
vectors = np.load("data/GloVe_wine_5k.npy")

# The vocabulary is a Python dict saved inside a 0-d object array, so it must
# be unpickled: NumPy >= 1.16.3 refuses to do that unless allow_pickle=True is
# passed explicitly.
# SECURITY: unpickling can execute arbitrary code - only load this file from a
# trusted source.
words = np.load('data/5k_vocab_dict.npy', allow_pickle=True).item()

# Dimensionality of a single embedding vector.
EMBEDDING_DIM = len(vectors[0])

In [85]:
# Map each vocabulary word to its embedding vector; `words` gives the row of
# `vectors` that belongs to each word.
embedding_dict = {token: vectors[row] for token, row in words.items()}

In [138]:
# Plain Python lists of the model inputs (review text) for the training and
# validation splits, plus the raw variety labels of the training rows.
texts = trainRaw['description'].tolist()
texts_val = valRaw['description'].tolist()
labels = trainRaw['y_variety'].tolist()

In [122]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Fixed length of every integer sequence fed to the models.
MAX_SEQUENCE_LENGTH = 136

# The vocabulary is fitted on the training text only; num_words is set to the
# number of available embedding vectors.
tokenizer = Tokenizer(num_words=len(vectors))
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index

# Convert both splits to lists of word ids using the fitted vocabulary.
sequences, sequences_val = [
    tokenizer.texts_to_sequences(corpus) for corpus in (texts, texts_val)
]

# Bring every sequence to MAX_SEQUENCE_LENGTH.
# NOTE: from here on `data` is the padded training matrix, no longer the
# reviews DataFrame.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
data_val = pad_sequences(sequences_val, maxlen=MAX_SEQUENCE_LENGTH)

In [73]:
# One row per tokenizer index (plus one extra row, since the matrix has
# len(word_index) + 1 rows), initialised to zeros.
vocab_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocab_rows, EMBEDDING_DIM))
for token, row in word_index.items():
    pretrained = embedding_dict.get(token)
    if pretrained is None:
        # No embedding available for this token: its row stays all-zeros.
        continue
    embedding_matrix[row] = pretrained
        

In [123]:
from sklearn import preprocessing
# Fit the label encoder on the training varieties only.
le = preprocessing.LabelEncoder()
le.fit(trainRaw['y_variety'].tolist())

# Integer class ids for the training and validation rows.
labels, labels_val = [
    le.transform(split['y_variety']) for split in (trainRaw, valRaw)
]

# Lookup table: class name -> integer id assigned by the encoder.
keys = list(le.classes_)
vals = le.transform(keys)
labels_index = {name: code for name, code in zip(keys, vals)}

In [127]:
from keras.layers import *
from keras.models import Model

# Embedding layer initialised from the prepared matrix and kept frozen
# (trainable=False), so only the layers stacked on top of it are trained.
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

# 1-D convolutional classifier over the embedded word sequence.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 3, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 3, activation='relu')(x)
# With MAX_SEQUENCE_LENGTH = 136 only 2 time steps remain at this point, so a
# pool of size 2 spans the whole remaining sequence (acts as global max
# pooling). A different sequence length would change that.
x = MaxPooling1D(2)(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(len(labels_index), activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

# One-hot targets. num_classes is passed explicitly so that both splits always
# have exactly as many columns as the softmax layer has units; without it the
# width is inferred from the largest label present in each array, which breaks
# if a split happens not to contain the highest class id.
encoding = to_categorical(labels, num_classes=len(labels_index))
encoding_val = to_categorical(labels_val, num_classes=len(labels_index))
res = model.fit(data, encoding, validation_data=(data_val, encoding_val),
          epochs=10, batch_size=128)

Train on 100000 samples, validate on 14985 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [130]:
from keras.utils import to_categorical
# Number of target classes, taken from the fitted label encoder instead of a
# hard-coded 31, so the output layer always matches the labels.
num_classes = len(labels_index)

# One-hot targets for both splits, built here so this cell does not depend on
# the CNN cell having been run; the explicit num_classes keeps their width
# equal to the softmax layer.
encoding = to_categorical(labels, num_classes=num_classes)
encoding_val = to_categorical(labels_val, num_classes=num_classes)

# LSTM classifier on top of the frozen embedding layer.
# NOTE: this rebinds `model`, replacing the CNN model defined earlier.
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(100))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()
results = model.fit(data, encoding, epochs=6, validation_data=(data_val, encoding_val), batch_size=64)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_17 (Embedding)     (None, 136, 200)          5855600   
_________________________________________________________________
lstm_6 (LSTM)                (None, 100)               120400    
_________________________________________________________________
dense_22 (Dense)             (None, 31)                3131      
Total params: 5,979,131
Trainable params: 123,531
Non-trainable params: 5,855,600
_________________________________________________________________
None
Train on 100000 samples, validate on 14985 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [142]:
# Display the (rows, columns) of the validation split.
# NOTE(review): the recorded output shows 9994 rows, while the training logs
# above report 14985 validation samples - presumably the cells were executed
# out of order or on different data; confirm with Restart & Run All.
valRaw.shape

(9994, 13)