# Connecting to Google Drive to load the Kaggle dataset
Dataset used: https://www.kaggle.com/kazanova/sentiment140

Guide to downloading the dataset from Kaggle directly into Google Drive and importing it: https://medium.com/analytics-vidhya/how-to-fetch-kaggle-datasets-into-google-colab-ea682569851a

In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import os
# Set KAGGLE_CONFIG_DIR to the Drive folder that contains kaggle.json.
# NOTE(review): presumably read by the Kaggle CLI to locate credentials - confirm.
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/My Drive/Kaggle"
# /content/gdrive/My Drive/Kaggle is the Google Drive path where kaggle.json is stored

In [None]:
# Change the working directory to the Drive folder that holds kaggle.json and the dataset CSV.
%cd /content/gdrive/My Drive/Kaggle
# %cd echoes the new working directory, so no separate pwd call is needed.

/content/gdrive/My Drive/Kaggle


In [None]:
# List the working directory to confirm the dataset CSV and kaggle.json are present.
!ls

best_model_state.bin  training.1600000.processed.noemoticon.csv
kaggle.json	      training_simple_nn


In [None]:
import pandas as pd
import tensorflow as tf
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import numpy as np

In [None]:
# Enable memory growth on the first GPU when one is present.
physical_devices = tf.config.experimental.list_physical_devices("GPU")
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
else:
    # Indexing the empty device list would raise IndexError on a CPU-only
    # runtime, so report the situation and carry on instead.
    print("No GPU detected; running on CPU.")

In [None]:
# Load the raw Sentiment140 CSV. The file is read without a header row
# (columns are numbered 0..5) and decoded as latin1.
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    header=None,
    encoding='latin1',
)
df.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
# Keep only the sentiment label (column 0) and the tweet text (column 5).
# The columns are selected by label rather than by position so the cell can be
# re-run on the already-trimmed frame without raising IndexError.
print('Initial Shape:', df.shape)
df = df.loc[:, [0, 5]]
df.shape

Initial Shape: (1600000, 6)


(1600000, 2)

In [None]:
# Work on a seeded random subset of 10,000 rows, drawn without replacement,
# so that the rest of the notebook runs quickly and reproducibly.
df = df.sample(
    n=10000,
    replace=False,
    random_state=78,
)
print(df.shape)
df.head()

(10000, 2)


Unnamed: 0,0,5
1238240,4,@kitty_k4t kletterwald in kassel bec tomorrow...
814974,4,"@JoshPyke, Radio 2 session was beautiful."
1089986,4,counting stars and sheep and trading thoughts ...
1510136,4,Sleepy bear requires a little fox For soothing...
1343137,4,"goin to thefuck bed, goodnight really good ni..."


In [None]:
# Name the two remaining columns and recode the positive class from 4 to 1,
# giving binary labels (0 = negative, 1 = positive).
df.columns = ['label', 'text']
# Vectorised replacement: avoids a Python-level iterrows() loop and the
# positional `row[0]` lookup on a label-indexed Series. Values other than 4
# are left untouched, exactly as before.
df['label'] = df['label'].replace(4, 1)

df.head()

Unnamed: 0,label,text
1238240,1,@kitty_k4t kletterwald in kassel bec tomorrow...
814974,1,"@JoshPyke, Radio 2 session was beautiful."
1089986,1,counting stars and sheep and trading thoughts ...
1510136,1,Sleepy bear requires a little fox For soothing...
1343137,1,"goin to thefuck bed, goodnight really good ni..."


In [None]:
# Quick sanity checks on the sampled data. A notebook cell only displays its
# last expression, so the earlier results are printed explicitly instead of
# being computed and silently discarded.
print(df['label'].value_counts())
print('Longest tweet (characters):', max(len(i) for i in df['text']))
print('Number of rows:', len(df))
df.iloc[6]['text']

'Home from FL. Amazing how vacation time passes so much more quickly than &quot;regular&quot; time.  Already making plans 4 next trip 2 Sanibel.'

## Text Preprocessing

1. Removing unneeded text such as URLs, hashtags, social-media entities, and stopwords.

2. Lexicon normalization (stemming): "write", "writing", "written", etc. are all variations of "write".

In [None]:
nltk.download('stopwords')
corpus = []

# Build the stop-word set and the stemmer once. Rebuilding the set for every
# word (and the stemmer for every row) is loop-invariant work that dominates
# the runtime of this cell.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()


def _clean_text(raw):
    """Return `raw` lower-cased, stripped of non-letters and stop words, and stemmed.

    Every character outside a-z/A-Z is replaced by a space, the text is
    lower-cased and split on whitespace, English stop words are dropped, the
    remaining words are Porter-stemmed and re-joined with single spaces.
    """
    words = re.sub('[^a-zA-Z]', ' ', raw).lower().split()
    return ' '.join(ps.stem(word) for word in words if word not in stop_words)


# Overwrite the text column with its cleaned form.
df['text'] = df['text'].apply(_clean_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Preview the first rows after text cleaning.
df.head()

Unnamed: 0,label,text
1238240,1,kitti k kletterwald kassel bec tomorrow good luck
814974,1,joshpyk radio session beauti
1089986,1,count star sheep trade thought dream forget ze...
1510136,1,sleepi bear requir littl fox sooth cuddl appli
1343137,1,goin thefuck bed goodnight realli good night c...


In [None]:
# Turn the cleaned tweets into integer sequences, pad them to a common length,
# one-hot encode the labels and split everything into train and test sets.
max_fatures = 30000  # vocabulary size cap; spelling kept because the model cell reads this name
# `num_words` is the Keras 2 name of the argument formerly called `nb_words`;
# the old spelling is only accepted through a deprecation shim.
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(df['text'].values)
x = tokenizer.texts_to_sequences(df['text'].values)
# With its default arguments pad_sequences pads every sequence to the length
# of the longest one.
x = pad_sequences(x)
# One column per class, matching the 2-unit softmax output of the model.
y = pd.get_dummies(df['label'])
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)



(7500, 25) (7500, 2)
(2500, 25) (2500, 2)


In [None]:
# Embedding -> LSTM -> 2-way softmax classifier.
embed_dim = 150
lstm_out = 200
model = Sequential()
# `Embedding` no longer supports a `dropout` argument in Keras 2 (it was dropped
# with a warning), so it is omitted here. Adding a SpatialDropout1D layer after
# the embedding would restore embedding dropout if that is wanted.
model.add(Embedding(max_fatures, embed_dim, input_length=x.shape[1]))
# Keras 2 argument names: dropout_W -> dropout, dropout_U -> recurrent_dropout.
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

  after removing the cwd from sys.path.
  """


Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 25, 150)           4500000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 200)               280800    
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 402       
Total params: 4,781,202
Trainable params: 4,781,202
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
checkpoint_path = "lstm/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights
# NOTE(review): this callback is not passed to model.fit() below, so no
# checkpoint is written during training. It is a tf.keras callback while the
# model is built from the standalone `keras` package - confirm the two are
# compatible in this environment before adding `callbacks=[cp_callback]`.
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)

# `batch_size` was never assigned anywhere in the notebook, so this cell (and
# the evaluation cell) raised NameError on a fresh kernel. 32 is the default
# Keras itself uses for fit()/evaluate().
batch_size = 32

h = model.fit(
    x_train, y_train,
    epochs=10,
    batch_size=batch_size
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Evaluate on the held-out split. The two returned values are unpacked as the
# loss ("score") and the accuracy metric the model was compiled with.
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print("score: %.2f" % score)
print("acc: %.2f" % acc)

score: 1.85
acc: 0.68


In [None]:
# Inspect the shapes of the test split and one one-hot encoded label row.
print(x_test.shape)
print(y_test.shape)
print(y_test.iloc[3])

(2500, 25)
(2500, 2)
0    1
1    0
Name: 287, dtype: uint8


In [None]:
# Per-class accuracy on the test split.
#
# Predictions are made in one batched call instead of one predict() call per
# sample, and no loop variable named `x` is used, so the padded-sequence array
# `x` built earlier in the notebook is no longer overwritten by this cell.
pred_labels = np.argmax(model.predict(x_test), axis=1)
# y_test is one-hot encoded; the position of the maximum is the class index.
true_labels = np.argmax(np.asarray(y_test), axis=1)

is_negative = true_labels == 0
is_correct = pred_labels == true_labels

neg_cnt = int(is_negative.sum())
pos_cnt = int((~is_negative).sum())
neg_correct = int((is_correct & is_negative).sum())
pos_correct = int((is_correct & ~is_negative).sum())

# Report NaN rather than raising ZeroDivisionError if a class is absent from the test split.
print("pos_acc", pos_correct / pos_cnt * 100 if pos_cnt else float("nan"), "%")
print("neg_acc", neg_correct / neg_cnt * 100 if neg_cnt else float("nan"), "%")

pos_acc 68.55753646677472 %
neg_acc 66.58767772511848 %


In [None]:
# Persist the trained model to lstm_model.h5 in the current working directory.
model.save('lstm_model.h5')

In [None]:
# List the working directory to confirm lstm_model.h5 was written.
!ls

best_model_state.bin  training.1600000.processed.noemoticon.csv
kaggle.json	      training_simple_nn
lstm_model.h5
