In [27]:
### Import required packages


import pandas as pd
import numpy as np

import re
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

import tensorflow as tf 

from keras.models import Sequential
from keras.layers import Dense, LSTM, Bidirectional, Dropout, Embedding
from keras.layers import Conv1D, MaxPooling1D
from keras.layers import Activation, Flatten
from keras.utils import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy, CategoricalCrossentropy

from tensorflow.keras.utils import to_categorical

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import warnings
warnings.filterwarnings("ignore")

In [28]:
## Loading Data

df = pd.read_csv("Twitter_Data.csv")
df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [29]:
df.category.unique()

array([-1.,  0.,  1., nan])

In [30]:
df.isna().sum()

clean_text    4
category      7
dtype: int64

In [31]:
df= df.dropna().reset_index(drop=True)
df.duplicated().sum()

0

In [32]:
## Count the categories 

df['category'].value_counts()

 1.0    72249
 0.0    55211
-1.0    35509
Name: category, dtype: int64

In [33]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['clean_text'])
df['clean_text'] = tokenizer.texts_to_sequences(df['clean_text'])
df['clean_text']


0         [42, 1, 307, 42492, 66, 1726, 42493, 1119, 40,...
1         [286, 16, 2, 1104, 3, 718, 16, 2, 809, 7, 38, ...
2         [24, 53, 41, 99, 38, 4, 1, 1003, 21, 508, 5, 5...
3         [383, 19, 496, 4730, 113, 43, 1221, 1, 53, 126...
4         [390, 22, 813, 79, 2, 173, 926, 127, 136, 124,...
                                ...                        
162964    [32, 79, 5172, 474, 570, 2429, 1, 6, 2715, 17,...
162965    [305, 363, 529, 13161, 10149, 24, 36, 1, 1195,...
162966        [53, 5, 1718, 234, 2583, 7526, 118, 191, 397]
162967    [63, 183, 558, 360, 192, 9, 1, 924, 558, 15, 6...
162968    [14, 5, 260, 686, 36, 26, 35329, 118, 4882, 10...
Name: clean_text, Length: 162969, dtype: object

In [34]:
## Split into train test set

X_train, X_test, y_train, y_test = train_test_split(df['clean_text'],df['category'], test_size=0.2, random_state=40)


In [35]:
print('X_train:',len(X_train))
print('y_train:',len(y_train))
print('X_test:',len(X_test))
print('y_test:',len(y_test))

X_train: 130375
y_train: 130375
X_test: 32594
y_test: 32594


In [36]:
X_train = pad_sequences( X_train, maxlen=100 ,dtype='float32')
X_test = pad_sequences( X_test, maxlen=100 ,dtype='float32')

In [37]:
len(tokenizer.index_word)

113678

In [38]:
from keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor="val_loss",patience=5,verbose=True)

In [39]:
X_train = np.asarray(X_train).astype(np.float32)
X_test = np.asarray(X_test).astype(np.float32)

y_train = np.asarray(y_train).astype('float32').reshape((-1,1))
y_test = np.asarray(y_test).astype('float32').reshape((-1,1))

In [40]:
y_train = to_categorical(y_train, 3)
y_test = to_categorical(y_test, 3)

In [41]:
model = Sequential()
model.add(Embedding(len(tokenizer.index_word)+1, input_length= 100 ,output_dim =100))
model.add(LSTM(100))
model.add(Dense(128, activation='relu'))
model.add(Dense(3, activation='sigmoid'))

model.compile("adam", loss= 'categorical_crossentropy' ,metrics=["accuracy"])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          11367900  
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 128)               12928     
                                                                 
 dense_1 (Dense)             (None, 3)                 387       
                                                                 
Total params: 11461615 (43.72 MB)
Trainable params: 11461615 (43.72 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [42]:
history = model.fit(X_train , y_train ,batch_size=256, epochs=4,
                    validation_data=(X_test , y_test),callbacks=[early_stop])

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [74]:
results_train = model.predict(X_train)
results_test = model.predict(X_test)



In [75]:
classes_train = np.argmax(results_train, axis = 1)
classes_test = np.argmax(results_test, axis = 1)


In [88]:
final_pred = np.concatenate((classes_train, classes_test), axis=0)

In [89]:
final_pred[final_pred == 2] = -1

In [90]:
np.unique(final_pred, return_counts=True)

(array([-1,  0,  1], dtype=int64), array([35561, 54362, 73046], dtype=int64))

In [102]:
pred = pd.DataFrame(final_pred, columns=['Predicted_Sentiment']) 
pred.head()

Unnamed: 0,Predicted_Sentiment
0,1
1,0
2,1
3,1
4,0
