In [1]:
import pandas as pd
import numpy as np


import logging

#import gensim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfTransformer

import os
from keras.layers import Activation, Conv1D, Dense, Embedding, Flatten, Input, MaxPooling1D
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.metrics import categorical_accuracy, categorical_crossentropy
from keras.utils import to_categorical

Using TensorFlow backend.


In [7]:
from keras.models import load_model

In [2]:
from sklearn.metrics import classification_report

In [3]:
%matplotlib inline

In [4]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 395772404296247133
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 4945621811
locality {
  bus_id: 1
  links {
  }
}
incarnation: 6484935577701039209
physical_device_desc: "device: 0, name: GeForce GTX 1060 with Max-Q Design, pci bus id: 0000:01:00.0, compute capability: 6.1"
]


Ref: https://www.kaggle.com/carlosaguayo/deep-learning-for-text-classification#Loading-GloVe-embeddings
http://nlp.stanford.edu/projects/glove/

In [23]:
vocab_size = 50000
max_sentence_len = 300

In [24]:
data = pd.read_csv("data_level5.tsv", sep="\t")

In [25]:
y_lab = "level5_clean"

In [26]:
data = data.dropna()

In [27]:
len(data["Review"][0].split(" "))

90

In [28]:
len(data[data["Review"].apply(lambda txt: len(txt.split(" "))>max_sentence_len)])

3283

In [11]:
len(data)

517137

3282 of 500k reviews have more than 300 words. Dropping them so we won't train on partial strings

In [29]:
data = data[data["Review"].apply(lambda txt: len(txt.split(" "))<=max_sentence_len)]

In [30]:

texts = data.Review # Extract text
target = data[y_lab] # Extract target

In [31]:
%%time
tokenizer = Tokenizer(num_words=vocab_size) # Setup tokenizer
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts) # Generate sequences

Wall time: 32.5 s


In [32]:
len(tokenizer.word_index)

78967

In [33]:
len(sequences)

513854

In [34]:
len(data)

513854

In [35]:
inv_index = {v:k for k,v in tokenizer.word_index.items()}

for w in sequences[0]:
    x = inv_index.get(w)
    print(x, end=" ")

i bought 3 products from this new mongongo line shampoo moisture seal masque and moisture styling gel it went on smoothly but when it dried my 2 4 c curly hair frizzed it s crispy and dry i wish i could take all of them back the other products in the old line work very well curl enhancing smoothie curling gel souffle and the deep treatment masque work good i was hoping for even more improvement with the new mongongo line not so beware you will have thick dry frizz 

In [19]:
avg_len = sum(map(len, sequences))/len(sequences)
std_len = np.sqrt(sum(map(lambda x:(len(x) - avg_len) **2 , sequences))/len(sequences))

avg_len, std_len

(59.88358366384226, 45.26998350757918)

In [36]:
data = pad_sequences(sequences, maxlen=max_sentence_len)

In [21]:
data.shape

(513854, 300)

In [63]:
target.value_counts(normalize=True)

function/performance    0.718801
appearance              0.154945
packaging/labeling      0.069181
assembly/preparation    0.033654
aroma/flavor            0.022339
others                  0.001080
Name: level5_clean, dtype: float64

In [37]:
from sklearn.preprocessing import LabelBinarizer

In [38]:
encoder = LabelBinarizer()

In [39]:
labels = encoder.fit_transform(target)

In [40]:
print('Shape of data:', data.shape)
print('Shape of labels:', labels.shape)

Shape of data: (513854, 300)
Shape of labels: (513854, 6)


In [41]:
glove_dir = 'glove-global-vectors-for-word-representation' # This is the folder with the dataset

embeddings_index = {} # We create a dictionary of word -> embedding

with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), "rb") as f:
    for n,line in enumerate(f):
        values = line.split()
        word = values[0].decode("utf-8") # The first value is the word, the rest are the values of the embedding
        embedding = np.asarray(values[1:], dtype='float32') # Load embedding
        embeddings_index[word] = embedding # Add embedding to our embedding dictionary

print('Found {:,} word vectors in GloVe.'.format(len(embeddings_index)))

Found 400,000 word vectors in GloVe.


In [47]:
#embeddings_index2 = {}
#for key_ in embeddings_index:
#    embeddings_index2[key_.decode("utf-8")] = embeddings_index[key_]
#embeddings_index = embeddings_index2
#del embeddings_index2

In [42]:
embedding_dim = len(embeddings_index["hello"])

In [43]:
embedding_dim, vocab_size

(100, 50000)

In [44]:
#embedding_dim = 100 # We use 100 dimensional glove vectors

word_index = tokenizer.word_index
nb_words = min(vocab_size, len(word_index)) # How many words are there actually

embedding_matrix = np.zeros((nb_words, embedding_dim))

# The vectors need to be in the same position as their index. 
# Meaning a word with token 1 needs to be in the second row (rows start with zero) and so on

# Loop over all words in the word index
for word, i in word_index.items():
    # If we are above the amount of words we want to use we do nothing
    if i >= vocab_size: 
        continue
    # Get the embedding vector for the word
    embedding_vector = embeddings_index.get(word)
    # If there is an embedding vector, put it in the embedding matrix
    if embedding_vector is not None: 
        embedding_matrix[i] = embedding_vector

In [55]:
model = Sequential()
model.add(Embedding(vocab_size, 
                    embedding_dim, 
                    input_length=max_sentence_len, 
                    weights = [embedding_matrix], 
                    trainable = False)) 
model.add(Conv1D(128, 3, activation='relu'))
model.add(MaxPooling1D(3))
model.add(Conv1D(128, 3, activation='relu'))
model.add(MaxPooling1D(3))
model.add(Conv1D(128, 3, activation='relu'))
model.add(MaxPooling1D(3))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(6, activation='softmax'))
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 100)          5000000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 298, 128)          38528     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 99, 128)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 97, 128)           49280     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 32, 128)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 30, 128)           49280     
_________________________________________________________________
max_

In [None]:
#model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[categorical_accuracy])

#model.fit(data, labels, validation_split=0.2, epochs=2)

Train on 411083 samples, validate on 102771 samples
Epoch 1/2
Epoch 2/2

In [52]:
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size = 0.2, random_state=42)

In [53]:
X_train.shape

(411083, 300)

In [57]:
%%time
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=[categorical_accuracy])
model.fit(X_train, y_train, validation_split=0.2, epochs=10)

Instructions for updating:
Use tf.cast instead.
Train on 328866 samples, validate on 82217 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Wall time: 12min 59s


In [60]:
encoder.classes_

array(['appearance', 'aroma/flavor', 'assembly/preparation',
       'function/performance', 'others', 'packaging/labeling'],
      dtype='<U20')

In [61]:
encoder.transform(encoder.classes_)

array([[1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 1]])

In [64]:
encoder.inverse_transform(model.predict(X_test[:5]))

array(['function/performance', 'function/performance',
       'packaging/labeling', 'function/performance', 'aroma/flavor'],
      dtype='<U20')

In [65]:
encoder.inverse_transform(y_test[:5])

array(['function/performance', 'function/performance',
       'packaging/labeling', 'function/performance', 'aroma/flavor'],
      dtype='<U20')

In [66]:
%%time
y_pred_classes = encoder.inverse_transform(model.predict(X_test))

Wall time: 7.18 s


In [67]:
%%time
y_test_classes =  encoder.inverse_transform(y_test)

Wall time: 3.99 ms


In [69]:
accuracy_score(y_pred_classes, y_test_classes)

0.7854745015617246

In [71]:
print(classification_report(y_pred_classes, y_test_classes))

                      precision    recall  f1-score   support

          appearance       0.45      0.55      0.49     12902
        aroma/flavor       0.46      0.63      0.54      1730
assembly/preparation       0.50      0.58      0.54      2963
function/performance       0.91      0.85      0.88     79347
              others       0.01      0.33      0.02         3
  packaging/labeling       0.46      0.56      0.50      5826

           micro avg       0.79      0.79      0.79    102771
           macro avg       0.46      0.58      0.49    102771
        weighted avg       0.81      0.79      0.80    102771



In [72]:
model.save("cnn_model1.h5")

# Loading a model

In [8]:
model = load_model("cnn_model1.h5")

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [9]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 100)          5000000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 298, 128)          38528     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 99, 128)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 97, 128)           49280     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 32, 128)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 30, 128)           49280     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 10, 128)           0         
__________

In [14]:
a = model.get_layer("embedding_1")

In [21]:
len(a.get_weights()[0])

50000

In [15]:
a.get_weights()

[array([[ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
          0.      ],
        [-0.038194, -0.24487 ,  0.72812 , ..., -0.1459  ,  0.8278  ,
          0.27062 ],
        [-0.046539,  0.61966 ,  0.56647 , ..., -0.37616 , -0.032502,
          0.8062  ],
        ...,
        [ 0.49531 ,  0.11368 ,  0.096595, ..., -0.10373 , -1.9915  ,
         -0.069738],
        [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
          0.      ],
        [-0.78288 ,  0.28184 , -0.71599 , ...,  0.455   ,  0.037052,
         -0.23365 ]], dtype=float32)]

### Trying with trainable embeddings

In [68]:
Embedding?

[1;31mInit signature:[0m
[0mEmbedding[0m[1;33m([0m[1;33m
[0m    [1;33m[[0m[1;34m'input_dim'[0m[1;33m,[0m [1;34m'output_dim'[0m[1;33m,[0m [1;34m"embeddings_initializer='uniform'"[0m[1;33m,[0m [1;34m'embeddings_regularizer=None'[0m[1;33m,[0m [1;34m'activity_regularizer=None'[0m[1;33m,[0m [1;34m'embeddings_constraint=None'[0m[1;33m,[0m [1;34m'mask_zero=False'[0m[1;33m,[0m [1;34m'input_length=None'[0m[1;33m,[0m [1;34m'**kwargs'[0m[1;33m][0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m     
Turns positive integers (indexes) into dense vectors of fixed size.
eg. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]

This layer can only be used as the first layer in a model.

# Example

```python
  model = Sequential()
  model.add(Embedding(1000, 64, input_length=10))
  # the model will take as input an integer matrix of size (batch, input_length).
  # the largest integer (i.e. word index) in the input should be
  # no l

In [59]:
model = Sequential()
model.add(Embedding(vocab_size, 
                    embedding_dim, 
                    input_length=max_sentence_len, 
                    weights = [embedding_matrix], 
                    trainable = False)) 
model.add(Conv1D(128, 3, activation='relu'))
model.add(MaxPooling1D(3))
model.add(Conv1D(128, 3, activation='relu'))
model.add(MaxPooling1D(3))
model.add(Conv1D(128, 3, activation='relu'))
model.add(MaxPooling1D(3))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(rate=0.3)) ## Adding Dropout
model.add(Dense(6, activation='softmax'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 300, 100)          5000000   
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 298, 128)          38528     
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 99, 128)           0         
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 97, 128)           49280     
_________________________________________________________________
max_pooling1d_11 (MaxPooling (None, 32, 128)           0         
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 30, 128)           49280     
_________________________________________________________________
max_pooling1d_12 (MaxPooling (None, 10, 128)           0         
__________

In [55]:
from keras.layers import Dropout

In [46]:
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size = 0.2, random_state=42)

In [47]:
X_train.shape

(411083, 300)

In [60]:
%%time
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=[categorical_accuracy])
model.fit(X_train, y_train, validation_split=0.2, epochs=10)

Train on 328866 samples, validate on 82217 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Wall time: 13min 3s


In [61]:
8

8

In [64]:
%%time
y_pred_classes = encoder.inverse_transform(model.predict(X_test))

Wall time: 8.49 s


In [65]:
%%time
y_test_classes =  encoder.inverse_transform(y_test)

Wall time: 4.04 ms


In [66]:
accuracy_score(y_pred_classes, y_test_classes)

0.7874108454719717

In [67]:
print(classification_report(y_pred_classes, y_test_classes))

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


                      precision    recall  f1-score   support

          appearance       0.39      0.60      0.47     10245
        aroma/flavor       0.62      0.57      0.59      2551
assembly/preparation       0.53      0.57      0.55      3239
function/performance       0.92      0.84      0.88     81178
              others       0.00      0.00      0.00         0
  packaging/labeling       0.45      0.57      0.50      5558

           micro avg       0.79      0.79      0.79    102771
           macro avg       0.48      0.53      0.50    102771
        weighted avg       0.83      0.79      0.80    102771



  'recall', 'true', average, warn_for)


Trainable embeddings resulted in worse accuracy than original. In both cases, there is a lot of overfitting. The training loss drop doesn't translate to drop in validation dataset. 

In [53]:
model.save("cnn_model_trainable_embeddings.h5")