In [None]:
#Mini Project – Twitter Sentimental Analysis Using NLP and Python

The following tasks are to be performed:

•Read the Data from the Given excel file.
•Change our dependent variable to categorical. (0 to “Neutral,”-1 to “Negative”, 1 to “Positive”)
•Do Missing value analysis and drop all null/missing values
•Do text cleaning. (remove every symbol except alphanumeric, transform all words to lower case, and remove punctuationand stopwords)
•Create a new column and find the length of each sentence (how many words they contain)
•Split data into dependent(X) and independent(y) dataframe
•Do operations on text data
     Hints:
     o Do one-hot encoding for each sentence(use TensorFlow) 
     o Add padding from the front side (use Tensorflow) 
     o Build an LSTM model and compile it(describe features, input length, vocabulary size, information drop-out layer, activation function for output)      o Do dummy variable creation for the dependent variable
     o Split the data into tests and train
•Train new model
•Normalize the prediction as same as the original data(prediction might be in decimal, so whoever is nearest to 1 is predicted as yes and set other as 0)
•Measure performance metrics and accuracy
•print Classification report


In [3]:
import numpy as np
import pandas as pd

#visualization
import matplotlib.pyplot as plt
import seaborn as sns

#nlp
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

#text vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

#model selection
from sklearn.model_selection import train_test_split

#classifiers
from sklearn.linear_model import LogisticRegression


#classification reports
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score

#warnings
import warnings

#settings
warnings.filterwarnings("ignore")
%matplotlib inline

In [4]:
tweets_data = pd.read_csv('Twitter_Data.csv')

In [6]:
tweets_data.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [7]:
tweets_data.dtypes

clean_text     object
category      float64
dtype: object

In [8]:
tweets_data["category"].unique()

array([-1.,  0.,  1., nan])

In [9]:
tweets_data["category"].replace(to_replace=0.0,value='Neutral',inplace=True)
tweets_data["category"].replace(to_replace=1.0,value='Positive',inplace=True)
tweets_data["category"].replace(to_replace=-1.0,value='Negative',inplace=True)

tweets_data.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,Negative
1,talk all the nonsense and continue all the dra...,Neutral
2,what did just say vote for modi welcome bjp t...,Positive
3,asking his supporters prefix chowkidar their n...,Positive
4,answer who among these the most powerful world...,Positive


In [10]:
tweets_data.shape[0]

162980

In [11]:
tweets_data.isnull().sum()

clean_text    4
category      7
dtype: int64

In [12]:
tweets_filtered = tweets_data.dropna()

In [13]:
tweets_filtered.shape[0]

162969

In [14]:
tweets_filtered.isnull().sum()

clean_text    0
category      0
dtype: int64

In [15]:
tweets_sample = tweets_filtered.sample(n = 50000, random_state = 42)

In [16]:
tweets_sample

Unnamed: 0,clean_text,category
42229,news flash modi address the nation cancels tri...,Neutral
22035,according congress ecosystem bjp divided house...,Positive
79982,friends did not have the guts let drdo test mo...,Negative
118493,did modi violate the model code conduct speaki...,Negative
12815,raoul gandhy asked modi about nirav for years ...,Neutral
...,...,...
64857,you dont know modi has money plant his bjp hea...,Neutral
83955,appeal all the bhakts followers and sympathize...,Negative
46467,after modi announcement,Neutral
9346,because modis five years power not could have ...,Neutral


In [17]:
import contractions

def expand_contractions(text):
    text = contractions.fix(text)
    return text

tweets_sample["clean_text"] = tweets_sample["clean_text"].apply(expand_contractions)
tweets_sample

Unnamed: 0,clean_text,category
42229,news flash modi address the nation cancels tri...,Neutral
22035,according congress ecosystem bjp divided house...,Positive
79982,friends did not have the guts let drdo test mo...,Negative
118493,did modi violate the model code conduct speaki...,Negative
12815,raoul gandhy asked modi about nirav for years ...,Neutral
...,...,...
64857,you do not know modi has money plant his bjp h...,Neutral
83955,appeal all the bhakts followers and sympathize...,Negative
46467,after modi announcement,Neutral
9346,because modis five years power not could have ...,Neutral


In [19]:
import re

wordnet_lemmatizer = WordNetLemmatizer()

def my_normalizer(text):
    
  #lowercase the input text
  text = text.lower()

  #remove punctuation
  text = re.sub(r'[^\w\s]', '', text)  

  #tokenize (split string into words)
  tokens = nltk.tokenize.word_tokenize(text)

  #remove all symbol except alphanumeric
  tokens = [t for t in tokens if (c.isalnum() for c in t)]

  #lemmatazing words (put words into base form)
  tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]

  #remove stopwords
  tokens = [t for t in tokens if t not in set(stopwords.words('english'))]

  return tokens

In [20]:
tweets_sample["final_text"] = tweets_sample["clean_text"].apply(my_normalizer)
tweets_sample

Unnamed: 0,clean_text,category,final_text
42229,news flash modi address the nation cancels tri...,Neutral,"[news, flash, modi, address, nation, cancel, t..."
22035,according congress ecosystem bjp divided house...,Positive,"[according, congress, ecosystem, bjp, divided,..."
79982,friends did not have the guts let drdo test mo...,Negative,"[friend, gut, let, drdo, test, modi, please, e..."
118493,did modi violate the model code conduct speaki...,Negative,"[modi, violate, model, code, conduct, speaking..."
12815,raoul gandhy asked modi about nirav for years ...,Neutral,"[raoul, gandhy, asked, modi, nirav, year, neve..."
...,...,...,...
64857,you do not know modi has money plant his bjp h...,Neutral,"[know, modi, ha, money, plant, bjp, headquarter]"
83955,appeal all the bhakts followers and sympathize...,Negative,"[appeal, bhakts, follower, sympathizer, fake, ..."
46467,after modi announcement,Neutral,"[modi, announcement]"
9346,because modis five years power not could have ...,Neutral,"[modis, five, year, power, could, ten, year, 2..."


In [21]:
tweets_sample["word_length"] = tweets_sample["final_text"].str.len()
tweets_sample

Unnamed: 0,clean_text,category,final_text,word_length
42229,news flash modi address the nation cancels tri...,Neutral,"[news, flash, modi, address, nation, cancel, t...",8
22035,according congress ecosystem bjp divided house...,Positive,"[according, congress, ecosystem, bjp, divided,...",17
79982,friends did not have the guts let drdo test mo...,Negative,"[friend, gut, let, drdo, test, modi, please, e...",25
118493,did modi violate the model code conduct speaki...,Negative,"[modi, violate, model, code, conduct, speaking...",22
12815,raoul gandhy asked modi about nirav for years ...,Neutral,"[raoul, gandhy, asked, modi, nirav, year, neve...",24
...,...,...,...,...
64857,you do not know modi has money plant his bjp h...,Neutral,"[know, modi, ha, money, plant, bjp, headquarter]",7
83955,appeal all the bhakts followers and sympathize...,Negative,"[appeal, bhakts, follower, sympathizer, fake, ...",14
46467,after modi announcement,Neutral,"[modi, announcement]",2
9346,because modis five years power not could have ...,Neutral,"[modis, five, year, power, could, ten, year, 2...",9


In [22]:
tweets_sample["final_text"]

42229     [news, flash, modi, address, nation, cancel, t...
22035     [according, congress, ecosystem, bjp, divided,...
79982     [friend, gut, let, drdo, test, modi, please, e...
118493    [modi, violate, model, code, conduct, speaking...
12815     [raoul, gandhy, asked, modi, nirav, year, neve...
                                ...                        
64857      [know, modi, ha, money, plant, bjp, headquarter]
83955     [appeal, bhakts, follower, sympathizer, fake, ...
46467                                  [modi, announcement]
9346      [modis, five, year, power, could, ten, year, 2...
130064    [janata, dal, secular, chief, alleged, raid, p...
Name: final_text, Length: 50000, dtype: object

In [23]:
tweets_sample["final_text"] = tweets_sample["final_text"].apply(lambda x : ' '.join(x))

In [24]:
tweets_sample

Unnamed: 0,clean_text,category,final_text,word_length
42229,news flash modi address the nation cancels tri...,Neutral,news flash modi address nation cancel trip atm,8
22035,according congress ecosystem bjp divided house...,Positive,according congress ecosystem bjp divided house...,17
79982,friends did not have the guts let drdo test mo...,Negative,friend gut let drdo test modi please explain p...,25
118493,did modi violate the model code conduct speaki...,Negative,modi violate model code conduct speaking armed...,22
12815,raoul gandhy asked modi about nirav for years ...,Neutral,raoul gandhy asked modi nirav year never asked...,24
...,...,...,...,...
64857,you do not know modi has money plant his bjp h...,Neutral,know modi ha money plant bjp headquarter,7
83955,appeal all the bhakts followers and sympathize...,Negative,appeal bhakts follower sympathizer fake chowki...,14
46467,after modi announcement,Neutral,modi announcement,2
9346,because modis five years power not could have ...,Neutral,modis five year power could ten year 20042014m...,9


In [25]:
#dependent and independent variables

X = tweets_sample['final_text']
y = tweets_sample['category']

In [26]:
#tfidf vectorizer

vectorizer = TfidfVectorizer(min_df = 3)
X_vectorized = vectorizer.fit_transform(X)

In [27]:
print('Vocabulary len:', len(vectorizer.get_feature_names_out()))
print('Longest word:', max(vectorizer.vocabulary_, key=len))

Vocabulary len: 13815
Longest word: actorturnedpolitician


In [28]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, 
                                                    shuffle = True, 
                                                    test_size = 0.33, 
                                                    random_state = 42)

In [29]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)

#evaluation
print("Confusion Matrix for Logistic Regression:")
print(confusion_matrix(y_test,y_pred_lr))
print("Score:",round(accuracy_score(y_test,y_pred_lr) * 100, 2))
print("Classification Report:")
print(classification_report(y_test,y_pred_lr))

Confusion Matrix for Logistic Regression:
[[2349  634  616]
 [ 127 5237  273]
 [ 304  732 6228]]
Score: 83.72
Classification Report:
              precision    recall  f1-score   support

    Negative       0.84      0.65      0.74      3599
     Neutral       0.79      0.93      0.86      5637
    Positive       0.88      0.86      0.87      7264

    accuracy                           0.84     16500
   macro avg       0.84      0.81      0.82     16500
weighted avg       0.84      0.84      0.83     16500

