Installing tensorflow Version 2.0

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.python.keras import models, layers, optimizers
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
import bz2
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
import re

In [0]:
%matplotlib inline

In [0]:
!pip install -U -q kaggle
!mkdir -p ~/.kaggle

In [0]:
from google.colab import files
files.upload()

In [0]:
!cp kaggle.json ~/.kaggle/
!kaggle datasets list

ref                                                      title                                               size  lastUpdated          downloadCount  
-------------------------------------------------------  -------------------------------------------------  -----  -------------------  -------------  
rajeevw/ufcdata                                          UFC-Fight historical data from 1993 to 2019          3MB  2019-07-05 09:58:02           8400  
gustavomodelli/forest-fires-in-brazil                    Forest Fires in Brazil                              31KB  2019-08-24 16:09:16          14203  
chirin/africa-economic-banking-and-systemic-crisis-data  Africa Economic, Banking and Systemic Crisis Data   14KB  2019-07-21 02:00:17           4904  
ruslankl/european-union-lgbt-survey-2012                 EU LGBT Survey                                     610KB  2019-07-19 11:15:25           2005  
akhilv11/border-crossing-entry-data                      Border Crossing Entry Data     

**DOWNLOADING THE AMAZON CUSTOMER PRODUCT REVIEWS FOR SENTIMENTAL ANALYSIS**

In [0]:
!kaggle datasets download -d bittlingmayer/amazonreviews -w -f train.ft.txt.bz2 


Downloading train.ft.txt.bz2.zip to .
 98% 433M/443M [00:09<00:00, 42.2MB/s]
100% 443M/443M [00:09<00:00, 47.8MB/s]


In [0]:
!ls

kaggle.json  sample_data  train.ft.txt.bz2.zip


In [0]:
#UNZIPPING THE FILE
!unzip train.ft.txt.bz2.zip

Archive:  train.ft.txt.bz2.zip
  inflating: train.ft.txt.bz2        


In [0]:

def get_labels_and_texts(file):
    labels = []
    texts = []
    for line in bz2.BZ2File(file):
        x = line.decode("utf-8")
        labels.append(int(x[9]) - 1)
        texts.append(x[10:].strip())
    return np.array(labels), texts

In [0]:
Labels,Features = get_labels_and_texts('train.ft.txt.bz2')

In [0]:
print('Labels:',Labels)
print('Shape of Labels:',Labels.shape)
print('Total Number of Texts(CUSTOMER REVIEWS):',len(Features))


Labels: [1 1 1 ... 0 0 1]
Shape of Labels: (3600000,)
Total Number of Texts(CUSTOMER REVIEWS): 3600000


In [0]:
import re
NON_ALPHANUM = re.compile(r'[\W]')
NON_ASCII = re.compile(r'[^a-z0-1\s]')
def normalize_texts(texts):
    normalized_texts = []
    for text in texts:
        lower = text.lower()
        no_punctuation = NON_ALPHANUM.sub(r' ', lower)
        no_non_ascii = NON_ASCII.sub(r'', no_punctuation)
        normalized_texts.append(no_non_ascii)
    return normalized_texts

In [0]:
Features = normalize_texts(Features)


In [0]:
Features=Features[:20000]

In [0]:
Features[0]

'stuning even for the non gamer  this sound track was beautiful  it paints the senery in your mind so well i would recomend it even to people who hate vid  game music  i have played the game chrono cross but out of all of the games i have ever played it has the best music  it backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras  it would impress anyone who cares to listen    '

In [0]:
len(Features)

20000

In [0]:
Labels[0]

1

In [0]:
Labels=Labels[:20000]
len(Labels)

20000

**SPLITTING THE DATASET INTO TRAIN, VALIDATION AND TEST SETS**

In [0]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    Features, Labels, random_state=1, test_size=0.2)

In [0]:
x_train,x_val,y_train,y_val=train_test_split(x_train,y_train,test_size=0.25,random_state=1)

In [0]:
print('Length of Training Data set:',len(x_train))
print('Length of Validation Data set:',len(x_val))
print('Length of Testing Data set:',len(x_test))

Length of Training Data set: 12000
Length of Validation Data set: 4000
Length of Testing Data set: 4000


In [0]:
#SHOWS 60% Training Data and Remaining Data is For Validation and Testing

for i in [x_train,x_val,x_test]:
  print(round(len(i)/len(Labels),2))

0.6
0.2
0.2


Keras(A Deep Learning Framework) provides some tools for converting text to formats that are useful in deep learning models. 

In [0]:
MAX_FEATURES =4000 
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(x_train)
x_train = tokenizer.texts_to_sequences(x_train)
x_val = tokenizer.texts_to_sequences(x_val)
x_test = tokenizer.texts_to_sequences(x_test)

In [0]:
MAX_LENGTH = max(len(train_ex) for train_ex in x_train)
x_train = pad_sequences(x_train, maxlen=MAX_LENGTH)
x_val = pad_sequences(x_val, maxlen=MAX_LENGTH)
x_test = pad_sequences(x_test, maxlen=MAX_LENGTH)

In [0]:
#Training data After turning all the sentences into same Length
x_test

array([[   0,    0,    0, ...,   74,   93,  134],
       [   0,    0,    0, ...,    2,  265,    6],
       [   0,    0,    0, ...,  627,    3, 1066],
       ...,
       [   0,    0,    0, ...,   83,    8,  139],
       [   0,    0,    0, ...,  858,   78,  850],
       [   0,    0,    0, ...,   11,  376,   14]], dtype=int32)

CONVOLUTIONAL NEURAL NETWORK FOR SENTIMENTAL ANALYSIS

In [0]:
def build_rnn_model():
    sequences = layers.Input(shape=(MAX_LENGTH,))
    embedded = layers.Embedding(MAX_FEATURES, 64)(sequences)
    x = layers.GRU(128, return_sequences=True)(embedded)
    x = layers.GRU(128)(x)
    x = layers.Dense(32, activation='relu')(x)
    x = layers.Dense(100, activation='relu')(x)
    predictions = layers.Dense(1, activation='sigmoid')(x)
    model = models.Model(inputs=sequences, outputs=predictions)
    model.compile(
        optimizer='rmsprop',
        loss='binary_crossentropy',
        metrics=['binary_accuracy']
    )
    return model
    
rnn_model = build_rnn_model()

In [0]:
rnn_model.fit(
    x_train, 
    y_train, 
    batch_size=64,
    epochs=5,
    validation_data=(x_val, y_val), )

Train on 12000 samples, validate on 4000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f98a3fe1320>

EVALUATING ON THE TEST DATA

In [0]:
loss, accuracy = rnn_model.evaluate(x_test,y_test)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  0.35005152216553687
Accuracy:  0.88975
