In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
import tensorflow as tf
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense , Input , LSTM , Embedding, Dropout , Activation, Flatten,Bidirectional


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Read the Data from the Given excel file.

In [2]:
df=pd.read_csv('Twitter_Data.csv')
df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [3]:
df.shape

(32004, 2)

Change our dependent variable to categorical. (0 to “Neutral,”-1 to “Negative”, 1 to “Positive”)

In [4]:
df['category'].loc[df['category']==-1.0]="negative"
df['category'].loc[df['category']==0.0]="neutral"
df['category'].loc[df['category']==1.0]="positive"
df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,negative
1,talk all the nonsense and continue all the dra...,neutral
2,what did just say vote for modi welcome bjp t...,positive
3,asking his supporters prefix chowkidar their n...,positive
4,answer who among these the most powerful world...,positive


Do Missing value analysisand drop all null/missing values

In [5]:
df.isnull().sum()

clean_text    1
category      1
dtype: int64

In [6]:
df=df.dropna()

In [7]:
df.isnull().sum()

clean_text    0
category      0
dtype: int64

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32002 entries, 0 to 32002
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   clean_text  32002 non-null  object
 1   category    32002 non-null  object
dtypes: object(2)
memory usage: 750.0+ KB


Do text cleaning. (remove every symbol except alphanumeric, transform all words to lower case, and remove punctuation and stopwords )

In [9]:
df['lower_text'] = df['clean_text'].str.lower()

In [10]:
df['stopped_text'] = df['clean_text'].str.replace('\W', ' ', regex=True)
df.head()

Unnamed: 0,clean_text,category,lower_text,stopped_text
0,when modi promised “minimum government maximum...,negative,when modi promised “minimum government maximum...,when modi promised minimum government maximum...
1,talk all the nonsense and continue all the dra...,neutral,talk all the nonsense and continue all the dra...,talk all the nonsense and continue all the dra...
2,what did just say vote for modi welcome bjp t...,positive,what did just say vote for modi welcome bjp t...,what did just say vote for modi welcome bjp t...
3,asking his supporters prefix chowkidar their n...,positive,asking his supporters prefix chowkidar their n...,asking his supporters prefix chowkidar their n...
4,answer who among these the most powerful world...,positive,answer who among these the most powerful world...,answer who among these the most powerful world...


In [11]:
df['tokenized'] = df['stopped_text'].apply(word_tokenize)
df=df.drop(['lower_text','stopped_text'],axis=1)

In [12]:
df.head()

Unnamed: 0,clean_text,category,tokenized
0,when modi promised “minimum government maximum...,negative,"[when, modi, promised, minimum, government, ma..."
1,talk all the nonsense and continue all the dra...,neutral,"[talk, all, the, nonsense, and, continue, all,..."
2,what did just say vote for modi welcome bjp t...,positive,"[what, did, just, say, vote, for, modi, welcom..."
3,asking his supporters prefix chowkidar their n...,positive,"[asking, his, supporters, prefix, chowkidar, t..."
4,answer who among these the most powerful world...,positive,"[answer, who, among, these, the, most, powerfu..."


In [13]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [14]:
stop_words = set(stopwords.words("english"))
df['stopped_text'] = df['tokenized'].apply(
    lambda x: [word for word in x if word not in stop_words])
df.head()

Unnamed: 0,clean_text,category,tokenized,stopped_text
0,when modi promised “minimum government maximum...,negative,"[when, modi, promised, minimum, government, ma...","[modi, promised, minimum, government, maximum,..."
1,talk all the nonsense and continue all the dra...,neutral,"[talk, all, the, nonsense, and, continue, all,...","[talk, nonsense, continue, drama, vote, modi]"
2,what did just say vote for modi welcome bjp t...,positive,"[what, did, just, say, vote, for, modi, welcom...","[say, vote, modi, welcome, bjp, told, rahul, m..."
3,asking his supporters prefix chowkidar their n...,positive,"[asking, his, supporters, prefix, chowkidar, t...","[asking, supporters, prefix, chowkidar, names,..."
4,answer who among these the most powerful world...,positive,"[answer, who, among, these, the, most, powerfu...","[answer, among, powerful, world, leader, today..."


Create a new column and find the length of each sentence (how many words they contain)

In [15]:
df['length']=df['clean_text'].str.split().str.len()

In [16]:
df.head()

Unnamed: 0,clean_text,category,tokenized,stopped_text,length
0,when modi promised “minimum government maximum...,negative,"[when, modi, promised, minimum, government, ma...","[modi, promised, minimum, government, maximum,...",33
1,talk all the nonsense and continue all the dra...,neutral,"[talk, all, the, nonsense, and, continue, all,...","[talk, nonsense, continue, drama, vote, modi]",13
2,what did just say vote for modi welcome bjp t...,positive,"[what, did, just, say, vote, for, modi, welcom...","[say, vote, modi, welcome, bjp, told, rahul, m...",22
3,asking his supporters prefix chowkidar their n...,positive,"[asking, his, supporters, prefix, chowkidar, t...","[asking, supporters, prefix, chowkidar, names,...",34
4,answer who among these the most powerful world...,positive,"[answer, who, among, these, the, most, powerfu...","[answer, among, powerful, world, leader, today...",14


Split data into dependent(X) and independent(y) dataframe

In [17]:
x=df['stopped_text']
y=df['category']

Do operations on text data 

One-hot encoding for each sentence

In [18]:
from keras.preprocessing.text import Tokenizer
tokenizer=Tokenizer()
tokenizer.fit_on_texts(df['stopped_text'])
df['stopped_text']=tokenizer.texts_to_sequences(df['stopped_text'])
df['stopped_text']

0        [1, 71, 272, 30, 1573, 547, 855, 3079, 1128, 1...
1                              [213, 957, 661, 1254, 7, 1]
2        [46, 7, 1, 1176, 3, 347, 11, 447, 2986, 1, 37,...
3        [261, 325, 3081, 45, 866, 1, 94, 1726, 3197, 1...
4          [277, 710, 642, 115, 72, 110, 693, 3924, 1, 66]
                               ...                        
31998      [36, 16899, 1885, 3656, 16899, 131, 7, 3, 7, 1]
31999             [4051, 540, 6, 3592, 1873, 263, 4347, 1]
32000                        [1, 3230, 1710, 235, 87, 337]
32001    [683, 862, 262, 455, 39, 7013, 8681, 9771, 533...
32002             [45, 9, 1, 8065, 135, 621, 116, 255, 93]
Name: stopped_text, Length: 32002, dtype: object

Add padding from the front side (use Tensorflow)

Build an LSTM model and compile it(describe features, input length, vocabulary size, information drop-out layer, activation function for output, )

In [19]:
vocab_size=df['length'].sum()
vocab_size

661962

In [20]:
model = Sequential()
model.add(Embedding(len(tokenizer.index_word)+1, input_length= 100 ,output_dim =50))
model.add(Bidirectional(LSTM(100)))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='softmax'))

In [21]:
from tensorflow import keras
model.compile(optimizer=keras.optimizers.Adam(),
                loss=keras.losses.BinaryCrossentropy(),metrics=["accuracy"])

In [22]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 50)           1968900   
                                                                 
 bidirectional (Bidirectiona  (None, 200)              120800    
 l)                                                              
                                                                 
 flatten (Flatten)           (None, 200)               0         
                                                                 
 dense (Dense)               (None, 250)               50250     
                                                                 
 dropout (Dropout)           (None, 250)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 251       
                                                        

Do dummy variable creation for the dependent variable

In [23]:
df['category'].mask(df['category'] == 'negative',-1,  inplace=True)
df['category'].mask(df['category'] == 'neutral',0,  inplace=True)
df['category'].mask(df['category'] == 'positive',1,  inplace=True)
df['category']

0        -1
1         0
2         1
3         1
4         1
         ..
31998    -1
31999     1
32000    -1
32001     1
32002     1
Name: category, Length: 32002, dtype: object

split the data into tests and train

In [24]:
x_train,x_test,y_train,y_test=train_test_split(df['stopped_text'],df['category'],test_size=0.2,random_state=10)
x_train.shape,y_train.shape

((25601,), (25601,))

In [25]:
from keras_preprocessing.sequence import pad_sequences
x_train = pad_sequences( x_train, maxlen=100 ,dtype='float32')
x_test = pad_sequences( x_test, maxlen=100 ,dtype='float32')

In [26]:
x_train = np.asarray(x_train).astype(np.float32)
x_test = np.asarray(x_test).astype(np.float32)

y_train = np.asarray(y_train).astype('float32').reshape((-1,1))
y_test = np.asarray(y_test).astype('float32').reshape((-1,1))

In [27]:
x_train

array([[0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.3840e+03, 3.6990e+03,
        9.0160e+03],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0000e+00, 4.4910e+03,
        4.9300e+02],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3640e+03, 4.0980e+03,
        2.0950e+03],
       ...,
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.6600e+02, 1.4047e+04,
        1.0000e+00],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.3110e+03, 7.7000e+01,
        1.0490e+03],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.4270e+03, 2.8100e+02,
        3.4330e+03]], dtype=float32)

In [28]:
x_test

array([[0.000e+00, 0.000e+00, 0.000e+00, ..., 3.100e+01, 4.900e+01,
        1.000e+00],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 4.760e+02, 2.080e+02,
        2.480e+02],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 1.000e+00, 3.000e+01,
        4.630e+02],
       ...,
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 7.500e+02, 1.000e+01,
        1.221e+03],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 6.700e+01, 1.274e+03,
        8.360e+02],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 2.840e+02, 1.160e+02,
        3.020e+02]], dtype=float32)

Train new model

In [29]:
model.fit(x_train,y_train, batch_size=200, 
          epochs=10, shuffle=True, 
          validation_data=(x_test,y_test), verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f5e38751b50>

Normalize the prediction as same as the originaldata(prediction might be in decimal, so whoever is nearest to 1 is predicted as yes and set other as 0)

In [30]:
results1 = model.evaluate(y_test, y_test, batch_size=200)



Measure performance metrics and accuracy

In [31]:
y_pred=model.predict(x_test)
y_pred



array([[1.],
       [1.],
       [1.],
       ...,
       [1.],
       [1.],
       [1.]], dtype=float32)

print Classification report

In [34]:
import sklearn
d=sklearn.metrics.classification_report(y_test, y_pred)
print(d)

              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00      1554
         0.0       0.00      0.00      0.00      2206
         1.0       0.41      1.00      0.58      2641

    accuracy                           0.41      6401
   macro avg       0.14      0.33      0.19      6401
weighted avg       0.17      0.41      0.24      6401

