# **Sentiment Analysis on Twitter Tweets**



# Importing Packages


In [97]:
import re
import pandas as pd
import matplotlib as plt
%matplotlib inline
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Conv1D,Dropout,MaxPooling1D,Bidirectional,Dense,BatchNormalization
from keras.metrics import Recall,Precision,BinaryAccuracy
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
import warnings
warnings.filterwarnings("ignore")

# Reading Dataset

In [0]:
df = pd.read_csv("/content/drive/My Drive/twitter.csv")

In [101]:
df['label'].value_counts(normalize = True)

0    0.929854
1    0.070146
Name: label, dtype: float64

In [0]:
stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer() 

# Data Cleaning

In [0]:
def preprocess(df):
    for i in range(len(df)):
        k = []
        df['tweet'][i] = df['tweet'][i].lower()
        df['tweet'][i] = re.sub('@[\w]*','',df['tweet'][i])
        df['tweet'][i] = re.sub('[^a-zA-Z ]','',df['tweet'][i])
        k = []
        for j in df['tweet'][i].split():
            if j not in stop:
                j = lemmatizer.lemmatize(j)
                k.append(j)
        df['tweet'][i] = ' '.join(k)
    return df

In [0]:
df = preprocess(df)

#Train Test Split

In [0]:
x = df['tweet']
y = df['label']
from sklearn.model_selection import train_test_split
train_x,test_x,train_y,test_y = train_test_split(x,y,test_size = 0.33,random_state = 100,shuffle = True,stratify = y) 

#Tokenizeing And Vectorizing Comments

In [0]:
vocab = []
for i in df['tweet']:
  for j in i.split():
    vocab.append(j)

In [0]:
vocab = sorted(set(vocab))
len_vocab = len(vocab)

In [0]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
t = Tokenizer(oov_token = '<OOV>',num_words = len_vocab,filters = '')
t.fit_on_texts(vocab)

In [109]:
f = []
max = 0
for i in train_x:
  f.append(i.split())
  if len(i.split()) > max:
    max = len(i.split()) 
max

23

In [0]:
seq = t.texts_to_sequences(f)
feature = pad_sequences(seq,padding = 'post',maxlen = 20,truncating = 'post')

#Over Sampling

In [0]:
import numpy as np
label = np.reshape(np.array(train_y),(train_y.shape[0],1))

In [0]:
from imblearn.over_sampling import RandomOverSampler
os = RandomOverSampler(ratio = 0.8,random_state = 100)
feature,label = os.fit_sample(feature,label.ravel())

In [114]:
label.shape,feature.shape

((35841,), (35841, 20))

In [0]:
label = np.reshape(label,(label.shape[0],1))

In [116]:
label.shape,feature.shape

((35841, 1), (35841, 20))

#Model With Cross Validation


In [119]:
# MLP for Pima Indians Dataset with 10-fold cross validation
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import StratifiedKFold
import numpy
# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)
kfold = StratifiedKFold(n_splits=6, shuffle=True, random_state=seed)
member = []
for train, test in kfold.split(feature, label):
  model = Sequential()
  model.add(Embedding(len_vocab,1000,input_length =20))
  
  
  model.add(Bidirectional(LSTM(250,return_sequences=True,dropout = 0.2,recurrent_dropout=0.1)))
  model.add(Bidirectional(LSTM(250,return_sequences=True,dropout = 0.2,recurrent_dropout=0.1)))
  model.add(Bidirectional(LSTM(250,return_sequences=True,dropout = 0.2,recurrent_dropout=0.1)))
  model.add(Bidirectional(LSTM(125,return_sequences=True))) 
  
  model.add(Conv1D(125,1,padding = 'same',strides = 1))
  model.add(MaxPooling1D(1))
  model.add(BatchNormalization())

  model.add(LSTM(50))

  model.add(Dense((200)))
  model.add(Dense(100,activation = 'relu'))
  model.add(Dropout(0.2))
  model.add(Dense(1,activation = 'sigmoid'))

  model.compile(loss='binary_crossentropy', optimizer='adam', metrics = [Recall(name = 'r'),Precision(name = 'p'),BinaryAccuracy(name = 'abs')])
  
  model.fit(feature[train], label[train], epochs=3, batch_size=500,shuffle = True)
  scores = model.evaluate(feature[test], label[test])
  member.append(model)
  print(scores)
  

	

	


Epoch 1/3
Epoch 2/3
Epoch 3/3
[0.0622819475626157, 0.9928436875343323, 0.9755736589431763, 0.985771656036377]
Epoch 1/3
Epoch 2/3
Epoch 3/3
[0.0340549220130453, 0.9969868063926697, 0.9821892380714417, 0.9906260371208191]
Epoch 1/3
Epoch 2/3
Epoch 3/3
[0.05045772663224941, 0.9992467164993286, 0.977523922920227, 0.9894543290138245]
Epoch 1/3
Epoch 2/3
Epoch 3/3
[0.03953534823484466, 0.9977392554283142, 0.9796522259712219, 0.9897873997688293]
Epoch 1/3
Epoch 2/3
Epoch 3/3
[0.0647636305200656, 0.9992467164993286, 0.9619289636611938, 0.9820860624313354]
Epoch 1/3
Epoch 2/3
Epoch 3/3
[0.05396246125527475, 0.9992467164993286, 0.9707281589508057, 0.9862715601921082]


In [120]:
member

[<keras.engine.sequential.Sequential at 0x7f5620f45c18>,
 <keras.engine.sequential.Sequential at 0x7f5620f10fd0>,
 <keras.engine.sequential.Sequential at 0x7f56060fecc0>,
 <keras.engine.sequential.Sequential at 0x7f56048fae48>,
 <keras.engine.sequential.Sequential at 0x7f5602df4e48>,
 <keras.engine.sequential.Sequential at 0x7f5602f6ca20>]

#Evaluation

In [121]:
test = []
max = 0
for i in test_x:
  test.append(i.split())
  if len(i.split()) > max:
    max = len(i.split()) 
max

21

In [123]:
seq = t.texts_to_sequences(test)
tes = pad_sequences(seq,padding = 'post',maxlen = 20,truncating = 'post')
test_y.shape,label.shape,tes.shape

((10548,), (35841, 1), (10548, 20))

In [0]:
test_y = np.array(test_y)
test_y = np.reshape(test_y,(10548,1))

In [0]:
pred = []
for i in member:
 pred.append(i.predict_classes(tes))

In [0]:
p = []
for i in pred:
  i = np.reshape(i,(i.shape[0],))
  p.append(i)
  

In [0]:
from scipy import stats
m = stats.mode(p)[0]
m = np.reshape(m,(10548,1))

In [129]:
m.shape,test_y.shape

((10548, 1), (10548, 1))

In [130]:
from sklearn.metrics import classification_report
print(classification_report(m,test_y))

              precision    recall  f1-score   support

           0       0.98      0.97      0.98      9954
           1       0.58      0.73      0.65       594

    accuracy                           0.96     10548
   macro avg       0.78      0.85      0.81     10548
weighted avg       0.96      0.96      0.96     10548

