In [1]:
#Data Manipulation
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#Storing/Loading
import pickle
import bz2

#regular expressions
import re
import keras
#Sklearn, Keras, Nltk
from matplotlib import pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding,CuDNNLSTM,Dropout,Dense,RepeatVector, Activation, Lambda
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Loading Data and reading lines

In [2]:
trainFile='train.ft.txt.bz2'
file=bz2.BZ2File(trainFile,'r')
lines=file.readlines()
print('Done!!!')

Done!!!


In [3]:
docSentimentList=[]

# Function for splitting the data into Text and Sentiment 

In [4]:
def getDocumentSentimentList(docs,splitStr='__label__'):
    t=0
    for i in range(len(docs)):
        if t==0:print('Processing doc ',i,' of ',len(docs))
        text=str(lines[i])
        if t==0:print(text)
        splitText=text.split(splitStr)
        secHalf=splitText[1]
        text=secHalf[2:len(secHalf)-1]
        sentiment=secHalf[0]
        if t==0: print('First half:',secHalf[0],'\nsecond half:',secHalf[2:len(secHalf)-1])
        docSentimentList.append([text,sentiment])
        t+=1
    print('Done!!')
    return docSentimentList

# Example Data
First Half shows sentiment = 1 or 2
Second Half shows the text review 

In [5]:
docSentimentList=getDocumentSentimentList(lines[:1000000],splitStr='__label__')

Processing doc  0  of  1000000
b'__label__2 Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^\n'
First half: 2 
second half: Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^\n
Done!!


In [6]:
docDF=pd.DataFrame(docSentimentList,columns=['TEXT','SENTIMENT'])

# Example more clearly

In [18]:
docDF

Unnamed: 0,TEXT,SENTIMENT
0,Stuning even for the non-gamer: This sound tra...,2
1,The best soundtrack ever to anything.: I'm rea...,2
2,Amazing!: This soundtrack is my favorite music...,2
3,Excellent Soundtrack: I truly like this soundt...,2
4,"Remember, Pull Your Jaw Off The Floor After He...",2
5,an absolute masterpiece: I am quite sure any o...,2
6,"Buyer beware: This is a self-published book, a...",1
7,Glorious story: I loved Whisper of the wicked ...,2
8,A FIVE STAR BOOK: I just finished reading Whis...,2
9,Whispers of the Wicked Saints: This was a easy...,2


In [19]:
docDF['SENTIMENT'].value_counts()

2    1161474
1    1134558
Name: SENTIMENT, dtype: int64

# Cleaning

In [None]:
for i in range(1000000):
    if 'www.' in review or 'http:' in review or 'https:' in review or '.com' in docDF['Sentiment'][i]:
        docDF['Sentiment'][i] = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", docDF['Sentiment'][i]a)

In [39]:
z=0
print(TextBlob(docDF.TEXT[z]).sentiment.polarity,'\t',docDF.SENTIMENT[z])

-0.02187500000000002 	 2


In [9]:
X=docDF['TEXT']
y=docDF['SENTIMENT']

In [10]:
y=y.astype('int32')
lb=LabelBinarizer(pos_label=1,neg_label=0)
y_binarized=lb.fit_transform(y)

In [11]:
y=to_categorical(num_classes=2,y=y_binarized)
y.shape

(1000000, 2)

# Separating the polarity, sentiment mismatch reviews

In [None]:
from textblob import TextBlob
#new_df=pd.DataFrame()


for i in range(1000000):
    polarity = TextBlob(docDF.TEXT[i]).sentiment.polarity
    sentiment = docDF.SENTIMENT[i]
    if  (polarity>0 and sentiment==1) or(polarity<0 and sentiment==2):
        docDF.SENTIMENT[i]=None 
        docDF.TEXT[i]=None
        
docDF.dropna(inplace=True)   
    

In [11]:
tok=Tokenizer(num_words=100000,lower=True,filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~ 
',)

In [12]:
tok.fit_on_texts(X)
print('Toekizing...done')
seqs=tok.texts_to_sequences(X)
print('Sequencing...done')
padded_seqs=pad_sequences(seqs,maxlen=100)
print('Padding sequences...done')

Toekizing...done
Sequencing...done
Padding sequences...done


In [14]:
padded_seqs.shape,y.shape

((1000000, 100), (1000000, 2))

In [27]:
def createLSTM():
    model=Sequential()
    model.add(Embedding(20000,100))
    model.add(CuDNNLSTM(96,return_sequences=True))
    model.add(Dropout(0.4))
    model.add(CuDNNLSTM(128))
    #model.add(Dense(500,activation='relu'))
    #model.add(Dropout(0.2))
    #model.add(Dense(128,activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(2,activation='sigmoid'))
    return model

In [17]:
def loadData():
    with open('X_train','rb') as f:
        X_train = pickle.load(f)
    with open('X_test','rb') as f:
        X_test = pickle.load(f)
    with open('y_train','rb') as f:
        y_train = pickle.load(f)
    with open('y_test','rb') as f:
        y_test = pickle.load(f)
    return X_train,X_test,y_train,y_test

In [28]:
X_train,X_test,y_train,y_test=loadData()
model2=createLSTM()
model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, None, 100)         2000000   
_________________________________________________________________
cu_dnnlstm_15 (CuDNNLSTM)    (None, None, 96)          76032     
_________________________________________________________________
dropout_18 (Dropout)         (None, None, 96)          0         
_________________________________________________________________
cu_dnnlstm_16 (CuDNNLSTM)    (None, 128)               115712    
_________________________________________________________________
dropout_19 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 2)                 258       
Total params: 2,192,002
Trainable params: 2,192,002
Non-trainable params: 0
_________________________________________________________________


In [17]:
X_train,X_test,y_train,y_test=train_test_split(padded_seqs,y,train_size=0.80,test_size=0.20,random_state=43)

In [20]:
'''
import pickle
with open('X_train','wb') as f:
    pickle.dump(X_train,f)
with open('X_test','wb') as f:
    pickle.dump(X_test,f)
with open('y_train','wb') as f:
    pickle.dump(y_train,f)
with open('y_test','wb') as f:
    pickle.dump(y_test,f)
'''

In [5]:
np.shape(X_train),np.shape(y_train)

((800000, 100), (800000, 2))

In [29]:
model2.compile(loss='binary_crossentropy',optimizer='adam',metrics=['acc'])
model2.fit(X_train,y_train,epochs=2,validation_data=[X_test,y_test])

Train on 800000 samples, validate on 200000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x2093d046f98>

In [31]:
model2.save('FInalModel')

In [14]:
model2 = keras.models.load_model('FInalModel')

In [20]:
idx=np.random.randint(len(X))
test=[X[idx]]
print(test)
print('RESULT:')
pred=model2.predict(pad_sequences(tok.texts_to_sequences(test),maxlen=100))
print(np.argmax(pred))
if np.argmax(pred)==0:
    print('NEG')
else:
    print('POS')

["Lucky Dog: I loved this book! Wendy Taylor Carlisle's poems are beautiful, true and deeply moving. Every word seems well-chosen, and nothing's extra. I love the sense the poems give of a real person behind the words--a smart, funny, heartful woman looking unblinkingly at herself and the world, and bravely telling what she sees. I love that the poems aren't afraid to speak in their own voice or to talk about things that matter. I admire Wendy Carlisle's use of form, which is so masterful it seems effortless. I'd recommend this book to both poetry lovers and people who've never read poetry before--the poems here are so inviting, and worth reading and re-reading.\\n"]
RESULT:
1
POS
