In [1]:
import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

/kaggle/input/hashes.txt
/kaggle/input/database.sqlite
/kaggle/input/Reviews.csv


In [43]:
from sys import getsizeof

In [2]:
import numpy as np # 
import pandas as pd #
import matplotlib.pyplot as plt
import seaborn as sns
import spacy

In [20]:
from sklearn.model_selection import train_test_split

In [32]:
from keras.preprocessing import sequence    # A helper module to handle padding input
from keras.models import Sequential         # The base keras Neural Network model
from keras.layers import Dense, Dropout, Activation   # The layer objects we will pile into the model
from keras.layers import Conv1D, GlobalMaxPooling1D

Using TensorFlow backend.


In [3]:
# Load the large English model with the listed pipeline components switched off.
# NOTE(review): the later cells in this notebook only read token vectors, so
# "tagger" and "ner" could probably be disabled as well to speed up nlp.pipe — confirm.
nlp = spacy.load(
    'en_core_web_lg',
    disable=[
        "parser",
        "entity_ruler",
        "sentencizer",
        "merge_noun_chunks",
        "merge_entities",
        "merge_subtokens",
    ],
)

In [4]:
# Load the raw reviews table (the Reviews.csv listed under /kaggle/input above),
# addressed relative to the notebook's working directory.
df_rev = pd.read_csv('../input/Reviews.csv')

In [None]:
# Preview the first rows of the raw reviews table.
df_rev.head()

In [None]:
# Bar chart of how many reviews carry each score value.
score_counts = df_rev['Score'].value_counts()
score_counts.plot.bar(figsize=(10, 5))

In [None]:
# Number of negative reviews (score below 3) available in the full data set.
int((df_rev['Score'] < 3).sum())

### Creating Small Sample:

In [5]:
# Build a small working sample: the first 10,000 positive reviews (score above 3)
# followed by the first 8,000 negative reviews (score below 3).
# Reviews with a score of exactly 3 are left out.
df_rv = pd.concat([
    df_rev.loc[df_rev['Score'] > 3, ['Score', 'Text']].head(10000),
    df_rev.loc[df_rev['Score'] < 3, ['Score', 'Text']].head(8000),
])

In [7]:
# Sanity check: 10,000 positive + 8,000 negative reviews are expected.
len(df_rv)

18000

In [9]:
%%time
# Run every review text through the spaCy pipeline (batched via nlp.pipe) and
# keep the resulting Doc objects in a new column next to the original text.
df_rv['nlp_docs'] = list(nlp.pipe(df_rv.Text))

CPU times: user 2min 15s, sys: 50.3 s, total: 3min 5s
Wall time: 2min 49s


In [10]:
# Preview the sample together with the parsed documents column.
df_rv.head()

Unnamed: 0,Score,Text,nlp_docs
0,5,I have bought several of the Vitality canned d...,"(I, have, bought, several, of, the, Vitality, ..."
2,4,This is a confection that has been around a fe...,"(This, is, a, confection, that, has, been, aro..."
4,5,Great taffy at a great price. There was a wid...,"(Great, taffy, at, a, great, price, ., , Ther..."
5,4,I got a wild hair for taffy and ordered this f...,"(I, got, a, wild, hair, for, taffy, and, order..."
6,5,This saltwater taffy had great flavors and was...,"(This, saltwater, taffy, had, great, flavors, ..."


In [None]:
# We will pick 100 vectors as a pad/truncate limit

In [12]:
def pad_or_truncate(txt_mat, nsize=100, emb_dim=300):
    """
    Pad or truncate the set of GloVe vectors of one review to a fixed length.

    Parameters
    ----------
    txt_mat : np.ndarray
        Token vectors of a single review, one row per token
        (shape ``(n_tokens, dim)``). An empty array, as produced by
        ``np.array([])`` for a review without tokens, is accepted.
    nsize : int, default 100
        Number of rows in the returned matrix.
    emb_dim : int, default 300
        Vector width used only when ``txt_mat`` is empty and its width
        cannot be read from the array itself.

    Returns
    -------
    np.ndarray
        Matrix with ``nsize`` rows: the first ``nsize`` rows of ``txt_mat``,
        followed by all-zero rows when the review has fewer than ``nsize``
        tokens.
    """
    txt_mat = np.asarray(txt_mat)
    if txt_mat.ndim == 2:
        # Pad with the width the review actually has instead of assuming 300.
        emb_dim = txt_mat.shape[1]
    elif txt_mat.size == 0:
        # A token-less review gives a 1-D array of shape (0,), which cannot be
        # concatenated with 2-D padding; return pure padding instead.
        return np.zeros([nsize, emb_dim])
    # get a length of given review
    nvect = txt_mat.shape[0]
    # Compare against nsize (not a fixed 100) so any requested length works.
    if nvect < nsize:
        return np.concatenate([txt_mat, np.zeros([nsize - nvect, emb_dim])])
    return txt_mat[:nsize]

In [13]:
%%time
X_vectors = [pad_or_truncate(np.array([tok.vector for tok in doc])) 
             for doc in df_rv.nlp_docs]

CPU times: user 10.8 s, sys: 3.4 s, total: 14.2 s
Wall time: 14.2 s


In [14]:
# One matrix per sampled review is expected.
len(X_vectors)

18000

In [16]:
# Spot-check one review: its matrix should have the padded/truncated shape.
X_vectors[10].shape

(100, 300)

In [18]:
# Binary target: 1 for a negative review (score below 3), 0 otherwise.
is_negative = df_rv['Score'].lt(3)
y_labels = is_negative.astype('int')

In [21]:
# Number of positive-class labels, i.e. negative reviews in the sample.
y_labels.sum()

8000

### Train/Test Split

In [23]:
# Hold out 33% of the sample; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_vectors,
    y_labels,
    test_size=0.33,
    random_state=2019,
)

In [26]:
# Combine the per-review matrices into a single 3-D array shaped
# (number of reviews, pad/trunc limit = 100, embedding dim = 300).
X_train = np.asarray(X_train).reshape((len(X_train), 100, 300))

In [29]:
# Same 3-D layout for the held-out reviews: (number of reviews, 100, 300).
X_test = np.asarray(X_test).reshape((len(X_test), 100, 300))

In [30]:
# Confirm both splits have the (reviews, 100, 300) layout.
X_train.shape, X_test.shape

((12060, 100, 300), (5940, 100, 300))

In [28]:
# The labels come out of the split as a pandas object rather than a NumPy array.
type(y_train)

pandas.core.series.Series

### Model Building

In [38]:
# --- input geometry ---
pad_trunc_limit = 100   # token vectors kept per review
emb_dim = 300           # width of every token vector

# --- convolution layer ---
n_filters = 250         # number of filters to train
kernel_size = 3         # the width of the filters

# --- dense head and training schedule ---
hidden_dim = 250
batch_size = 32
epochs = 10

In [39]:
# Start from an empty sequential model; the layers are added in the next cell.
model = Sequential()

In [40]:
# 1-D convolution over the token axis. The filter count comes from the
# `n_filters` hyperparameter; the previous `filters` name is not defined
# anywhere in the notebook and fails on a fresh kernel.
model.add(Conv1D(n_filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1,
                 input_shape=(pad_trunc_limit, emb_dim)))
# keep only the strongest response of every filter across the whole review
model.add(GlobalMaxPooling1D())
# hidden layer
model.add(Dense(hidden_dim))
# adding dropout to avoid overfitting
model.add(Dropout(0.1))
model.add(Activation('relu'))
# project into a single unit output layer
model.add(Dense(1))
# sigmoid squashes the output into (0, 1) for the binary label
model.add(Activation('sigmoid'))

In [41]:
# Binary classification set-up: cross-entropy loss, Adam optimizer,
# accuracy reported during training.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [42]:
# Train the network and evaluate on the held-out split after every epoch.
# NOTE(review): the test split doubles as validation data here, so no
# untouched hold-out set remains for a final evaluation.
model.fit(
    x=X_train,
    y=y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_test, y_test),
)

Train on 12060 samples, validate on 5940 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f8510254198>

### Memory Usage

In [50]:
# Size in bytes of each Python object as reported by sys.getsizeof.
# NOTE(review): getsizeof does not follow references, so the figure for X_train
# (a few hundred bytes at most) cannot be the size of its data, which takes
# gigabytes once saved to disk. X_train.nbytes would report the size of the
# array data itself.
tuple(getsizeof(obj) for obj in (X_train, y_train, X_vectors))

(128, 192984, 158200)

In [51]:
# Element counts of the same three objects, for comparison with the byte sizes above.
X_train.shape, len(y_train), len(X_vectors)

((12060, 100, 300), 12060, 18000)

### Saving arrays into a file

In [52]:
# Write the training array uncompressed in NumPy's binary .npy format.
np.save('X_train.npy', X_train)

In [56]:
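# Disabled: np.savetxt only accepts 1-D or 2-D arrays, so the 3-D X_train cannot be written as text.
#np.savetxt('X_vectors.txt', np.array(X_train))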

In [57]:
# Write a compressed copy of the training array. The name given here already ends in
# '.npy', and the file appears on disk as 'X_train_z_compressed.npy.npz'.
np.savez_compressed('X_train_z_compressed.npy', X_train)

In [59]:
ls -lth

total 3.6G
-rw-r--r-- 1 root root 879M Aug  6 16:46 X_train_z_compressed.npy.npz
-rw-r--r-- 1 root root    0 Aug  6 16:41 X_vectors.txt
-rw-r--r-- 1 root root    0 Aug  6 16:40 X_train.txt
-rw-r--r-- 1 root root 2.7G Aug  6 16:38 X_train.npy
-rw-r--r-- 1 root root  199 Aug  6 15:18 __notebook_source__.ipynb
