In [52]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import bz2
import os

# Any results you write to the current directory are saved as output.

In [53]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## **FastText File Reading** ##

In [54]:
trainfile = bz2.BZ2File('/content/drive/MyDrive/DLA/train.ft.txt.bz2','r')
lines = trainfile.readlines()

In [55]:
lines[1]

b"__label__2 The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny.\n"

In [56]:
docSentimentList=[]
def getDocumentSentimentList(docs,splitStr='__label__'):
    for i in range(len(docs)):
        #print('Processing doc ',i,' of ',len(docs))
        text=str(lines[i])
        #print(text)
        splitText=text.split(splitStr)
        secHalf=splitText[1]
        text=secHalf[2:len(secHalf)-1]
        sentiment=secHalf[0]
        #print('First half:',secHalf[0],'\nsecond half:',secHalf[2:len(secHalf)-1])
        docSentimentList.append([text,sentiment])
    print('Done!!')
    return docSentimentList

In [57]:
docSentimentList=getDocumentSentimentList(lines[:1000000],splitStr='__label__')

Done!!


In [58]:
train_df = pd.DataFrame(docSentimentList,columns=['Text','Sentiment'])
train_df.head()

Unnamed: 0,Text,Sentiment
0,Stuning even for the non-gamer: This sound tra...,2
1,The best soundtrack ever to anything.: I'm rea...,2
2,Amazing!: This soundtrack is my favorite music...,2
3,Excellent Soundtrack: I truly like this soundt...,2
4,"Remember, Pull Your Jaw Off The Floor After He...",2


## **Text Preprocessing**##

In [59]:
train_df['Sentiment'][train_df['Sentiment']=='1'] = 0
train_df['Sentiment'][train_df['Sentiment']=='2'] = 1

In [60]:
train_df['Sentiment'].value_counts()

1    505678
0    494322
Name: Sentiment, dtype: int64

In [61]:
train_df['word_count'] = train_df['Text'].str.lower().str.split().apply(len)
train_df.head()

Unnamed: 0,Text,Sentiment,word_count
0,Stuning even for the non-gamer: This sound tra...,1,80
1,The best soundtrack ever to anything.: I'm rea...,1,97
2,Amazing!: This soundtrack is my favorite music...,1,129
3,Excellent Soundtrack: I truly like this soundt...,1,118
4,"Remember, Pull Your Jaw Off The Floor After He...",1,87


In [62]:
import string 
def remove_punc(s):
    table = str.maketrans({key: None for key in string.punctuation})
    return s.translate(table)

In [63]:
train_df['Text'] = train_df['Text'].apply(remove_punc)
train_df.shape

(1000000, 3)

In [64]:
train_df.head()

Unnamed: 0,Text,Sentiment,word_count
0,Stuning even for the nongamer This sound track...,1,80
1,The best soundtrack ever to anything Im readin...,1,97
2,Amazing This soundtrack is my favorite music o...,1,129
3,Excellent Soundtrack I truly like this soundtr...,1,118
4,Remember Pull Your Jaw Off The Floor After Hea...,1,87


In [65]:
len(train_df['word_count'][train_df['word_count']<=25])

63720

In [66]:
train_df1 = train_df[:][train_df['word_count']<=25]
train_df1.head()

Unnamed: 0,Text,Sentiment,word_count
44,autumn got this for my daughter in NC she is n...,1,22
58,Hunting the Hard Way Thia was a gift for my Hu...,1,25
77,Smells divine This is my second bottle of shee...,1,25
78,Very disappointed This perfume is just AWFUL S...,0,24
99,Caution These tracks are not the original vers...,0,22


In [67]:
train_df1.head()

Unnamed: 0,Text,Sentiment,word_count
44,autumn got this for my daughter in NC she is n...,1,22
58,Hunting the Hard Way Thia was a gift for my Hu...,1,25
77,Smells divine This is my second bottle of shee...,1,25
78,Very disappointed This perfume is just AWFUL S...,0,24
99,Caution These tracks are not the original vers...,0,22


In [68]:
train_df1['Sentiment'].value_counts()

1    39932
0    23788
Name: Sentiment, dtype: int64

In [69]:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
st_wd = text.ENGLISH_STOP_WORDS
c_vector = CountVectorizer(stop_words = st_wd,min_df=.0001,lowercase=1)
c_vector.fit(train_df1['Text'].values)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=1, max_df=1.0, max_features=None, min_df=0.0001,
                ngram_range=(1, 1), preprocessor=None,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}),
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [70]:
word_list = list(c_vector.vocabulary_.keys())
stop_words = list(c_vector.stop_words) 

In [71]:
len(stop_words),len(word_list)

(318, 8915)

In [72]:
def remove_words(raw_sen,stop_words):
    sen = [w for w in raw_sen if w not in stop_words]
    return sen

In [73]:
def reviewEdit(raw_sen_list,stop_words):
    sen_list = []
    for i in range(len(raw_sen_list)):
        raw_sen = raw_sen_list[i].split()
        sen_list.append(remove_words(raw_sen,stop_words))
    return sen_list

In [74]:
sen_list = reviewEdit(list(train_df1['Text']),stop_words)

In [75]:
from gensim.models import word2vec
wv_model = word2vec.Word2Vec(sen_list,size=100)

In [76]:
wv_model.wv.syn0.shape

  """Entry point for launching an IPython kernel.


(14350, 100)

In [77]:
wv_model.wv.most_similar("car")

[('pair', 0.938689649105072),
 ('bottle', 0.9342629909515381),
 ('lights', 0.9340288639068604),
 ('lunch', 0.9283933043479919),
 ('computer', 0.9252547025680542),
 ('cream', 0.9235467910766602),
 ('lotion', 0.9227983951568604),
 ('machine', 0.9220359325408936),
 ('socks', 0.9203299880027771),
 ('cards', 0.9177193641662598)]

In [78]:
def fun(sen_list,wv_model):
    word_set = set(wv_model.wv.index2word)
    X = np.zeros([len(sen_list),25,100])
    c = 0
    for sen in sen_list:
        nw=24
        for w in list(reversed(sen)):
            if w in word_set:
                X[c,nw] = wv_model[w]
                nw=nw-1
        c=c+1
    return X

In [79]:
X = fun(sen_list,wv_model)

  if __name__ == '__main__':


In [80]:
from sklearn.model_selection import train_test_split
y = train_df1['Sentiment'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
y_train = y_train.astype('bool')
y_test = y_test.astype('bool')

In [81]:
X_train.shape

(57348, 25, 100)

 ## **Keras NN Model** ##

In [82]:
import keras.backend as K
from keras.models import Sequential,Model
from keras.layers import Dense, Dropout, Activation,LSTM, SimpleRNN ,GRU , Bidirectional,Input ,Concatenate, Multiply,Lambda,Reshape
input_st  = Input(shape=(25,100))
lstm1 = Bidirectional(LSTM(200,input_shape=(25,100),activation='relu',return_sequences=True),merge_mode='mul')(input_st)
lstm2 = Bidirectional(LSTM(1,input_shape=(25,100),activation='relu',return_sequences=True),merge_mode='mul')(lstm1)
print(lstm1.shape,' ',lstm2.shape)
lstm2 = Reshape((-1,))(lstm2)
lstm2 = Activation('sigmoid')(lstm2)
lstm2 = Reshape((-1,1))(lstm2)
mult = Multiply()([lstm1,lstm2])

add = Lambda(lambda x: K.sum(x,axis=1))(mult)
dense = Dense(100,activation='relu')(add)
output = Dense(1,activation='sigmoid')(dense)

model = Model(inputs=input_st, outputs=output)
print(model.summary())

(None, 25, 200)   (None, 25, 1)
Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None, 25, 100)]    0                                            
__________________________________________________________________________________________________
bidirectional_9 (Bidirectional) (None, 25, 200)      481600      input_6[0][0]                    
__________________________________________________________________________________________________
bidirectional_10 (Bidirectional (None, 25, 1)        1616        bidirectional_9[0][0]            
__________________________________________________________________________________________________
reshape_10 (Reshape)            (None, 25)           0           bidirectional_10[0][0]           
____________________________________________________________

In [83]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
hist = model.fit(X_train,y_train,validation_split=0.1,
          epochs=10, batch_size=512)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [88]:
y_test = y_test.astype('bool')
model.evaluate(X_test, y_test, batch_size=64)



[0.2552695870399475, 0.8975204229354858]

In [89]:
model.save("/content/drive/MyDrive/DLA/BidirectionalLSTM")

INFO:tensorflow:Assets written to: /content/drive/MyDrive/DLA/BidirectionalLSTM/assets


In [90]:
del model

In [91]:
import keras.backend as K
from keras.models import Sequential,Model
from keras.layers import Dense, Dropout, Activation,LSTM, SimpleRNN ,GRU , Bidirectional,Input ,Concatenate, Multiply,Lambda,Reshape
input_st  = Input(shape=(25,100))
lstm1 = Bidirectional(GRU(200,input_shape=(25,100),activation='relu',return_sequences=True),merge_mode='mul')(input_st)
lstm2 = Bidirectional(GRU(1,input_shape=(25,100),activation='relu',return_sequences=True),merge_mode='mul')(lstm1)
print(lstm1.shape,' ',lstm2.shape)
lstm2 = Reshape((-1,))(lstm2)
lstm2 = Activation('sigmoid')(lstm2)
lstm2 = Reshape((-1,1))(lstm2)
mult = Multiply()([lstm1,lstm2])

add = Lambda(lambda x: K.sum(x,axis=1))(mult)
dense = Dense(100,activation='relu')(add)
output = Dense(1,activation='sigmoid')(dense)

model = Model(inputs=input_st, outputs=output)
print(model.summary())

(None, 25, 200)   (None, 25, 1)
Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, 25, 100)]    0                                            
__________________________________________________________________________________________________
bidirectional_11 (Bidirectional (None, 25, 200)      362400      input_7[0][0]                    
__________________________________________________________________________________________________
bidirectional_12 (Bidirectional (None, 25, 1)        1218        bidirectional_11[0][0]           
__________________________________________________________________________________________________
reshape_12 (Reshape)            (None, 25)           0           bidirectional_12[0][0]           
____________________________________________________________

In [92]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
hist = model.fit(X_train,y_train,validation_split=0.1,
          epochs=10, batch_size=512)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [93]:
model.evaluate(X_test, y_test, batch_size=64)



[0.26421162486076355, 0.896264910697937]

In [94]:
model.save("/content/drive/MyDrive/DLA/BidirectionalGRU")

INFO:tensorflow:Assets written to: /content/drive/MyDrive/DLA/BidirectionalGRU/assets
