In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/glove-vectors/glove_vectors
/kaggle/input/amazon-fine-food-reviews/hashes.txt
/kaggle/input/amazon-fine-food-reviews/Reviews.csv
/kaggle/input/amazon-fine-food-reviews/database.sqlite


In [2]:
data = pd.read_csv("/kaggle/input/amazon-fine-food-reviews/Reviews.csv")
print(data.shape)

(568454, 10)


In [3]:
data.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [4]:
data = data[data['Score']!=3]

In [5]:
data['target'] = data['Score'].apply(lambda x : 1 if x>3 else 0)

In [6]:
#Sorting data according to ProductId in ascending order
data=data.sort_values('ProductId', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
#Deduplication of entries
data=data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='first', inplace=False)
print(data.shape)

(364173, 11)


In [7]:
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,target
150523,150524,6641040,ACITT7DI6IDDL,shari zychinski,0,0,5,939340800,EVERY book is educational,this witty little book makes my son laugh at l...,1
150505,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,4,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc...",1
150506,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,4,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...,1
150507,150508,6641040,AZGXZ2UUK6X,"Catherine Hallberg ""(Kate)""",1,1,5,1076025600,a good swingy rhythm for reading aloud,This is a great little book to read aloud- it ...,1
150508,150509,6641040,A3CMRKGE0P909G,Teresa,3,4,5,1018396800,A great way to learn the months,This is a book of poetry about the months of t...,1


In [8]:
# https://stackoverflow.com/a/47091490/4084039
import re

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

stopwords= ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"]


from tqdm import tqdm
def preprocess_text(text_data):
    preprocessed_text = []
    for sentance in tqdm(text_data):
        sent = decontracted(sentance)
        sent = sent.replace('\\r', ' ')
        sent = sent.replace('\\n', ' ')
        sent = sent.replace('\\"', ' ')
        sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
        #sent = ' '.join(e for e in sent.split() if e.lower() not in stopwords)
        preprocessed_text.append(sent.lower().strip())
    return preprocessed_text

In [9]:
preprocessed_reviews = preprocess_text(data['Text'].values)
labels = data['target'].values

100%|██████████| 364173/364173 [00:24<00:00, 14858.32it/s]


In [10]:
pd.Series(labels).value_counts()

1    307063
0     57110
dtype: int64

In [11]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from scipy import sparse
x_train,x_test, y_train, y_test = train_test_split(preprocessed_reviews, labels, test_size=0.2, random_state=42, stratify=labels)
x_train,x_cv, y_train, y_cv = train_test_split(x_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

MAX_SEQUENCE_LENGTH = 4000
MAX_NUM_WORDS = 20000
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')

tokenizer.fit_on_texts(x_train)

encoded_docs_train = tokenizer.texts_to_sequences(x_train)
padded_text_train=pad_sequences(encoded_docs_train,maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post").astype('int16')
encoded_docs_test = tokenizer.texts_to_sequences(x_test)
padded_text_test=pad_sequences(encoded_docs_test,maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post").astype('int16')
encoded_docs_cv = tokenizer.texts_to_sequences(x_cv)
padded_text_cv=pad_sequences(encoded_docs_cv,maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post").astype('int16')
word_index = tokenizer.word_index

In [12]:
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [13]:
print(padded_text_test[0])

[9 6 4 ... 0 0 0]


In [14]:
print(padded_text_test.shape)

(72835, 4000)


In [15]:
with open('/kaggle/input/glove-vectors/glove_vectors','rb') as f:
    model = pickle.load(f)
    glove_words = set(model.keys())

In [16]:
!keras --version

/bin/sh: 1: keras: not found


In [17]:
from keras.layers import Conv1D, Input, Dense, Flatten, MaxPooling1D, Embedding, Dropout, LSTM, TimeDistributed
from keras.initializers import Constant
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
EMBEDDING_DIM = 300
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in (word_index.items()):
    if i >= MAX_NUM_WORDS:
        continue
    embedding_vector = model.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

print(embedding_matrix.shape)

embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

(20000, 300)


In [18]:
from sklearn.metrics import roc_auc_score, f1_score
import numpy as np
class Callback(object):
    
    """Abstract base class used to build new callbacks.
      Attributes:
          params: dict. Training parameters
              (eg. verbosity, batch size, number of epochs...).
          model: instance of `keras.models.Model`.
              Reference of the model being trained.
          validation_data: Deprecated. Do not use.
      The `logs` dictionary that callback methods
      take as argument will contain keys for quantities relevant to
      the current batch or epoch.
      Currently, the `.fit()` method of the `Model` class
      will include the following quantities in the `logs` that
      it passes to its callbacks:
          on_epoch_end: logs include `acc` and `loss`, and
          optionally include `val_loss`
          (if validation is enabled in `fit`), and `val_acc`
          (if validation and accuracy monitoring are enabled).
          on_batch_begin: logs include `size`,
          the number of samples in the current batch.
          on_batch_end: logs include `loss`, and optionally `acc`
            (if accuracy monitoring is enabled).
      """

    def __init__(self, training_data, validation_data):
        print(self)
        self.x_tr=training_data[0]
        self.y_tr=training_data[1]
        self.x_val=validation_data[0]
        self.y_val=validation_data[1]
        self.stagnant_count=0
        #self.validation_data = None
        self.model = None
        self.cnt=0
        self.val_acc=-1
        # Whether this Callback should only run on the chief worker in a
        # Multi-Worker setting.
        # TODO(omalleyt): Make this attr public once solution is stable.
        self._chief_worker_only = None

    def set_params(self, params):
        self.params = params

    def set_model(self, model):
        self.model = model

    def on_batch_begin(self, batch, logs=None):
        """A backwards compatibility alias for `on_train_batch_begin`."""

    def on_batch_end(self, batch, logs=None):
        """A backwards compatibility alias for `on_train_batch_end`."""

    def on_epoch_begin(self, epoch, logs=None):

        """Called at the start of an epoch.
        Subclasses should override for any actions to run. This function should only
        be called during TRAIN mode.
        Arguments:
            epoch: integer, index of epoch.
            logs: dict. Currently no data is passed to this argument for this method
              but that may change in the future.
        """
        #print(self.model.get_weights())
        #self.model.optimizer._hyper['learning_rate']=0.1
        print('**'*50)
        self.cnt+=1
        if(self.cnt%3==0):
          self.model.optimizer._hyper['learning_rate']=self.model.optimizer._hyper['learning_rate']*0.95
          #K.set_value(self.model.optimizer.learning_rate,K.get_value(self.model.optimizer.learning_rate)*0.95)
        #print('haha')
        #print(K.get_value(self.model.optimizer.lr))

    def on_epoch_end(self, epoch, logs={}):

        """Called at the end of an epoch.
        Subclasses should override for any actions to run. This function should only
        be called during TRAIN mode.
        Arguments:
            epoch: integer, index of epoch.
            logs: dict, metric results for this training epoch, and for the
              validation epoch if validation is performed. Validation result keys
              are prefixed with `val_`.
        """
        loss = logs.get('loss')
        if loss is not None:
          if np.isnan(loss) or np.isinf(loss):
            print('Batch %d: Invalid loss, terminating training' % (batch))
            self.model.stop_training = True
            return
        y_pred = self.model.predict_classes(self.x_tr)
        #print(self.y_tr)
        #print(y_pred[:5,])
        roc = roc_auc_score(self.y_tr, y_pred)
        y_pred_val = self.model.predict_classes(self.x_val)
        roc_val = roc_auc_score(self.y_val, y_pred_val)
        print('AUC score of training data '+str(roc))
        print('AUC score of validation data '+str(roc_val))
        print('F1-Score on training data '+str(f1_score(self.y_tr, y_pred)))
        print('F1-Score on validation data '+str(f1_score(self.y_val, y_pred_val)))
        #print('we are at an end, My friend')
        if(('nan' in self.model.get_weights()) | (logs['loss']=='nan')| (logs['acc']=='nan')| (logs['val_loss']=='nan')| (logs['val_acc']=='nan')):
          self.model.stop_training = True
        
        if(logs['val_acc']<self.val_acc):
          self.model.optimizer._hyper['learning_rate']=self.model.optimizer._hyper['learning_rate']*0.9
        if(logs['val_acc']==self.val_acc):
          self.stagnant_count += 1
          if(self.stagnant_count==2):
            self.model.stop_training = True
        else:
          self.stagnant_count=0
        self.val_acc=logs['val_acc']
        print('**'*50)

    def on_train_batch_begin(self, batch, logs=None):
        """Called at the beginning of a training batch in `fit` methods.
        Subclasses should override for any actions to run.
        Arguments:
            batch: integer, index of batch within the current epoch.
            logs: dict. Has keys `batch` and `size` representing the current batch
              number and the size of the batch.
        """
        # For backwards compatibility.
        self.on_batch_begin(batch, logs=logs)

    def on_train_batch_end(self, batch, logs=None):
        """Called at the end of a training batch in `fit` methods.
        Subclasses should override for any actions to run.
        Arguments:
            batch: integer, index of batch within the current epoch.
            logs: dict. Metric results for this batch.
        """
        # For backwards compatibility.
        self.on_batch_end(batch, logs=logs)

    def on_test_batch_begin(self, batch, logs=None):
        """Called at the beginning of a batch in `evaluate` methods.
        Also called at the beginning of a validation batch in the `fit`
        methods, if validation data is provided.
        Subclasses should override for any actions to run.
        Arguments:
            batch: integer, index of batch within the current epoch.
            logs: dict. Has keys `batch` and `size` representing the current batch
                  number and the size of the batch.
        """

    def on_test_batch_end(self, batch, logs=None):
        """Called at the end of a batch in `evaluate` methods.
        Also called at the end of a validation batch in the `fit`
        methods, if validation data is provided.
        Subclasses should override for any actions to run.
        Arguments:
            batch: integer, index of batch within the current epoch.
            logs: dict. Metric results for this batch.
        """

    def on_predict_batch_begin(self, batch, logs=None):
        """Called at the beginning of a batch in `predict` methods.
        Subclasses should override for any actions to run.
        Arguments:
            batch: integer, index of batch within the current epoch.
            logs: dict. Has keys `batch` and `size` representing the current batch
                  number and the size of the batch.
        """

    def on_predict_batch_end(self, batch, logs=None):
        """Called at the end of a batch in `predict` methods.
        Subclasses should override for any actions to run.
        Arguments:
            batch: integer, index of batch within the current epoch.
            logs: dict. Metric results for this batch.
        """

    def on_train_begin(self, logs=None):
        """Called at the beginning of training.
        Subclasses should override for any actions to run.
        Arguments:
            logs: dict. Currently no data is passed to this argument for this method
                  but that may change in the future.
        """

    def on_train_end(self, logs=None):
        """Called at the end of training.
        Subclasses should override for any actions to run.
        Arguments:
            logs: dict. Currently no data is passed to this argument for this method
                  but that may change in the future.
        """

    def on_test_begin(self, logs=None):
        """Called at the beginning of evaluation or validation.
        Subclasses should override for any actions to run.
        Arguments:
            logs: dict. Currently no data is passed to this argument for this method
              but that may change in the future.
        """

    def on_test_end(self, logs=None):
        """Called at the end of evaluation or validation.
        Subclasses should override for any actions to run.
        Arguments:
            logs: dict. Currently no data is passed to this argument for this method
              but that may change in the future.
        """

    def on_predict_begin(self, logs=None):
        """Called at the beginning of prediction.
        Subclasses should override for any actions to run.
        Arguments:
            logs: dict. Currently no data is passed to this argument for this method
              but that may change in the future.
        """

    def on_predict_end(self, logs=None):
        """Called at the end of prediction.
        Subclasses should override for any actions to run.
        Arguments:
            logs: dict. Currently no data is passed to this argument for this method
              but that may change in the future.
    """

In [19]:
from keras.layers.merge import concatenate
from keras.models import Model
primary_input = Input(shape=(MAX_SEQUENCE_LENGTH,))
embedding = embedding_layer(primary_input)
print(embedding.shape)
lstm  = LSTM(100, activation="tanh", return_sequences=True)(embedding)
print(lstm.shape)
td = TimeDistributed(Dense(32, activation='relu'))(lstm)
flat = Flatten()(td)
dense1 = Dense(64, activation='relu')(flat)
dropout = Dropout(0.2)(dense1)
output = Dense(1, activation='sigmoid')(dropout)
print(output.shape)

(None, 4000, 300)
(None, 4000, 100)
(None, 1)


In [20]:
model = Model(inputs=primary_input, outputs=output)
print(model.summary())

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 4000)]            0         
_________________________________________________________________
embedding (Embedding)        (None, 4000, 300)         6000000   
_________________________________________________________________
lstm (LSTM)                  (None, 4000, 100)         160400    
_________________________________________________________________
time_distributed (TimeDistri (None, 4000, 32)          3232      
_________________________________________________________________
flatten (Flatten)            (None, 128000)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8192064   
_________________________________________________________________
dropout (Dropout)            (None, 64)               

In [21]:
import keras
callback = Callback(training_data=(padded_text_train, y_train),validation_data=(padded_text_cv, y_cv))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', keras.metrics.AUC()])
model.fit(padded_text_train, y_train,
          batch_size=100,
          epochs=5, 
          validation_data = (padded_text_cv, y_cv),
          verbose =1
         )

<__main__.Callback object at 0x7f7ecb060510>
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f7ecb06fc90>

In [22]:
model.save('model.h5')