In [2]:
import numpy as np # 
import pandas as pd #
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import sqlite3

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

/kaggle/input/hashes.txt
/kaggle/input/database.sqlite
/kaggle/input/Reviews.csv


In [3]:
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence    # A helper module to handle padding input
from keras.models import Sequential         # The base keras Neural Network model
from keras.layers import Dense, Dropout, Activation   # The layer objects we will pile into the model
from keras.layers import Conv1D, GlobalMaxPooling1D

Using TensorFlow backend.


In [4]:
# Pipeline component names that are switched off when the model is loaded.
# NOTE(review): which of these names actually exist in the loaded model is not
# visible here — compare against nlp.pipe_names.
disabled_components = [
    "parser",
    "entity_ruler",
    "sentencizer",
    "merge_noun_chunks",
    "merge_entities",
    "merge_subtokens",
]
nlp = spacy.load('en_core_web_lg', disable=disabled_components)

### Train/Test Split by Index Id

In [5]:
# Connection to the SQLite file queried by the cells below
# (the path is relative to the current working directory).
reviews_db_path = '../input/database.sqlite'
con = sqlite3.connect(reviews_db_path)

In [6]:
# Fetch only the Id and Score columns, leaving out reviews whose Score is exactly 3.
ids_scores_sql = "SELECT Id, Score FROM Reviews WHERE Score != 3;"
df_rev_ids_scores = pd.read_sql_query(ids_scores_sql, con)

In [7]:
# Peek at the first rows of the Id/Score frame.
df_rev_ids_scores.head()

Unnamed: 0,Id,Score
0,1,5
1,2,1
2,3,4
3,4,2
4,5,5


In [8]:
# Binary target: 1 when Score is below 3, otherwise 0.
is_low_score = df_rev_ids_scores['Score'].lt(3)
y_labels = is_low_score.astype('int')

In [9]:
# Stratified 80/20 split of the review Ids and their labels;
# the fixed random_state keeps the partition reproducible.
split_parts = train_test_split(
    df_rev_ids_scores['Id'],
    y_labels,
    test_size=0.2,
    random_state=2019,
    stratify=y_labels,
)
train_ids, test_ids, y_train, y_test = split_parts

In [10]:
# Share of positive labels (Score < 3) in each partition, followed by the partition sizes.
y_train.mean(), y_test.mean(), len(train_ids), len(test_ids)

(0.15602007364775075, 0.15601494822323442, 420651, 105163)

### Testing data extraction

In [11]:
# Show three full rows to see which columns the Reviews table provides.
pd.read_sql_query("SELECT * FROM Reviews LIMIT 3", con)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...


In [None]:
# String form of the first ten training Ids (exploration only).
str(train_ids[:10])

In [None]:
# Comma-separated Ids, ready to be placed inside a SQL IN (...) list.
first_ten_ids = train_ids[:10]
id_list_str = ','.join(map(str, first_ten_ids))

In [None]:
# Display the comma-separated Id string built in the previous cell.
id_list_str

In [None]:
# Fill the IN (...) placeholder of the template with the comma-separated Id list.
text_by_id_template = """SELECT Id, Text FROM Reviews WHERE Id IN (%s)"""
query = text_by_id_template % id_list_str

In [None]:
# Display the final SQL text to check the substitution.
query

In [None]:
# Run the Id-filtered query and keep the result as a DataFrame.
df_sample = pd.read_sql_query(sql=query, con=con)

In [None]:
# Display the rows returned for the sampled Ids.
df_sample

### Loading training samples in chunks (one chunk of reviews at a time)

In [12]:
# --- input shape ---
# every review is padded or truncated to this many token vectors,
# each vector having emb_dim components
pad_trunc_limit, emb_dim = 100, 300

# --- convolution ---
# how many filters are trained, and how many tokens each filter spans
n_filters, kernel_size = 250, 3

# --- dense head ---
hidden_dim = 250

# --- training ---
batch_size = 32
epochs = 10

In [13]:
# Empty Sequential container; the layers are appended in the next cell.
model = Sequential()

In [14]:
# Layer stack, listed in the order it is appended to the model.
cnn_layers = [
    # 1-D convolution over the (pad_trunc_limit, emb_dim) token-vector matrix
    Conv1D(filters=n_filters,
           kernel_size=kernel_size,
           padding='valid',
           activation='relu',
           strides=1,
           input_shape=(pad_trunc_limit, emb_dim)),
    # keep only the strongest response of every filter
    GlobalMaxPooling1D(),
    # hidden layer; dropout is applied before the relu activation
    Dense(hidden_dim),
    Dropout(0.1),
    Activation('relu'),
    # single sigmoid unit as output
    Dense(1),
    Activation('sigmoid'),
]
for layer in cnn_layers:
    model.add(layer)

In [15]:
# Compile settings: binary cross-entropy loss, Adam optimizer, accuracy as the reported metric.
compile_options = {
    'loss': 'binary_crossentropy',
    'optimizer': 'adam',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)

In [16]:
# Number of training Ids that the chunked loop has to cover.
len(train_ids)

420651

In [17]:
# Start offsets obtained when stepping through the training Ids 100000 at a time.
list(range(0, len(train_ids), 100000))

[0, 100000, 200000, 300000, 400000]

In [18]:
def pad_or_truncate(txt_mat, nsize=pad_trunc_limit, vec_dim=300):
    """
    Pad or truncate the word vectors of one review to exactly `nsize` rows.

    Parameters
    ----------
    txt_mat : numpy.ndarray
        Word vectors of one review, one row per token. An empty array
        (a review without tokens) is accepted.
    nsize : int
        Number of rows of the returned matrix.
    vec_dim : int
        Vector width, used only when `txt_mat` is empty and therefore
        carries no width of its own.

    Returns
    -------
    numpy.ndarray
        Matrix with `nsize` rows: the leading rows of `txt_mat`, followed
        by all-zero rows when the review has fewer than `nsize` tokens.
    """
    txt_mat = np.asarray(txt_mat)
    if txt_mat.size == 0:
        # np.array([]) built from a token-less review has shape (0,);
        # give it an explicit width so it can be concatenated with the padding
        txt_mat = np.zeros((0, vec_dim), dtype=txt_mat.dtype)

    # number of token vectors in the review, and the width of each vector
    nvect, width = txt_mat.shape

    # compare against nsize (not a literal 100) so a non-default target length
    # is padded/truncated correctly
    if nvect < nsize:
        padding = np.zeros([nsize - nvect, width])
        return np.concatenate([txt_mat, padding])
    return txt_mat[:nsize]

In [None]:

# Number of reviews fetched from SQLite, vectorised and fitted per iteration.
chunk_size = 10000

# NOTE(review): every chunk is fitted for `epochs` passes before the next chunk
# is loaded, so the model sees the chunks strictly one after another.
for sample_ix in range(0, len(train_ids), chunk_size):
    print("sample_ix:", sample_ix)

    # .iloc makes the positional slicing explicit: both Series still carry the
    # shuffled index left by train_test_split
    chunk_ids = train_ids.iloc[sample_ix:sample_ix + chunk_size]
    y_train_sample = y_train.iloc[sample_ix:sample_ix + chunk_size].values

    # the Ids are integers taken from the database itself, so inlining them is safe
    id_list_str = ','.join(str(int(item)) for item in chunk_ids)
    df_sample_text = pd.read_sql_query(
        "SELECT Id, Text FROM Reviews WHERE Id IN (%s)" % id_list_str, con)

    # SQLite returns the rows in its own order, not in the order of the IN list;
    # realign the texts to chunk_ids so that every text meets its own label
    # (a missing Id raises KeyError instead of silently shifting the labels)
    texts_in_label_order = df_sample_text.set_index('Id').loc[chunk_ids.values, 'Text']

    sample_docs = list(nlp.pipe(texts_in_label_order))
    X_train_sample = np.stack([pad_or_truncate(np.array([tok.vector for tok in doc]))
                               for doc in sample_docs])
    assert X_train_sample.shape == (len(y_train_sample), pad_trunc_limit, emb_dim)

    # training phase
    model.fit(X_train_sample, y_train_sample,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.1)

sample_ix: 0
Train on 9000 samples, validate on 1000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
sample_ix: 10000
Train on 9000 samples, validate on 1000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
sample_ix: 20000
Train on 9000 samples, validate on 1000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
sample_ix: 30000
Train on 9000 samples, validate on 1000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
sample_ix: 40000
Train on 9000 samples, validate on 1000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
sample_ix: 50000
Train on 9000 samples, validate on 1000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoc

Epoch 9/10
Epoch 10/10
sample_ix: 60000


In [None]:
# Rebuild the comma-separated Id string for the first ten training Ids
# (same computation as in the data-extraction section).
leading_ids = train_ids[:10]
id_list_str = ','.join(map(str, leading_ids))

In [None]:
# Execute the SQL held in `query` and keep the rows as a DataFrame.
df_sample_text = pd.read_sql_query(sql=query, con=con)

In [None]:
# NOTE(review): `df_rev` is not defined in any cell visible in this notebook —
# confirm where it comes from before running this cell.
# Runs the spaCy pipeline over the Text of every row whose Score is not 3 and
# keeps all resulting docs in a single list.
all_docs = list(nlp.pipe(df_rev[df_rev.Score!=3].Text))