<a href="https://colab.research.google.com/github/debgit/NLP/blob/main/Assign1/Sentiment_Analysis_using_Naive_Bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis using Naive Bayes

In this assignment, we will attempt to label tweets with sentiments (positive, neutral and negative) using Naive Bayes classifier. Naive Bayes is a very basic approach to this problem, but gives surprisingly good accuracy sometimes.

**Fill in the Blanks**

In [1]:
mount_drive = True
if mount_drive:
    from google.colab import drive
    drive.mount('/content/gdrive')

Mounted at /content/gdrive


## Importing required libraries

In [2]:
import pandas as pd
import re
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

## Reading dataset

In [4]:
project_path='/content/gdrive/My Drive/NLP/'
data_file = project_path+'tweets.csv'

In [5]:
data=pd.read_csv(data_file,encoding='UTF8')
data.drop(data.columns[0],axis=1,inplace=True)
data.sample(10)

Unnamed: 0,tweets,labels
54,"Barack Obama LONGBOARD Package CORE 7"" TRUCKS ...",0
314,One Direction Tolak Undangan Michelle Obama! h...,0
942,Concerning allegations about the firing of som...,1
408,RT @Talkmaster: Oh now I get it. Obama was tal...,1
5,Video shows federal officials joking about cos...,0
1021,#WhatsRomneyHiding his secret 'more flexible' ...,1
515,"RT @WhatTheFFacts: In his teen years, Obama ha...",0
1121,"RT @wcptersn: If Obama were white, he'd be Mit...",1
1174,http://t.co/5XJZbGSV Naked Sarah Palin The pol...,0
1244,Obama to celebrate Passover with Seder http://...,0


In [6]:
data.isnull().sum()
## there are 5 tweets which are null,should be dropped from the training

tweets    5
labels    0
dtype: int64

In [7]:
data.dropna(inplace=True)
data.reset_index(inplace=True)

## Text processing for the tweets

In [8]:
import nltk 
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [9]:
from nltk.tokenize import word_tokenize
from string import punctuation 
from nltk.corpus import stopwords 

stopwords = set(stopwords.words('english') + list(punctuation) + ['AT_USER','URL'])
    
def processTweet(tweet):
    # tweet is the text we will pass for preprocessing 
    # convert passed tweet to lower case 
    tweet=  tweet.lower()
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet) # remove URLs
    tweet = re.sub('@[^\s]+', 'AT_USER', tweet) # remove usernames
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet) # remove the # in #hashtag
    
    # use work_tokenize imported above to tokenize the tweet
    tweet=word_tokenize(tweet)
    return [word for word in tweet if len(word)>2 if word not in stopwords]

## Process all tweets

In [10]:
processed=[]

for tweet in data['tweets']:
    # process all tweets using processTweet function above - store in variable 'cleaned' 
    cleaned=processTweet(tweet)
    processed.append(' '.join(cleaned))

In [11]:
data['processed'] = processed

In [12]:
data.sample(10)

Unnamed: 0,index,tweets,labels,processed
650,652,RT @LouTommoBum: Michelle Obama invited the bo...,0,michelle obama invited boys easter party l0l
976,978,@benhurleycomedy haha is the first person you ...,0,haha first person followed obama
1261,1265,Napolitano Caught Hiring Muslim Brotherhood Te...,1,napolitano caught hiring muslim brotherhood te...
12,12,Here's How Obama and the Democrats Will Win in...,1,obama democrats win 2012 let start going back ...
234,234,Interesting take: Obama Misquotes Bible on Wea...,1,interesting take obama misquotes bible wealth ...
510,510,I favorited a @YouTube video http://t.co/LdPff...,1,favorited video tales fantasy obama look
289,289,Obama says knock you out -- http://t.co/PUZRq7...,0,obama says knock screwytees
1153,1156,RT @markknoller: Obama Campaign seizing on Was...,0,obama campaign seizing washington post report ...
561,563,Dick riding obama obama -thugnificent,0,dick riding obama obama -thugnificent
742,744,@Devils_for_Life lol No Obama is :),0,lol obama


In [13]:
## from the above output it is noticed that not all the puntuations are removed hence there is a need to refine the function.

## Create pipeline and define parameters for GridSearch

In [14]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])

tuned_parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': [1, 1e-1, 1e-2]
}

## Split data into test and train

In [15]:
# split data into train and test with split as 0.2 
X = data['processed']
Y = data['labels']

from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size =.2,random_state=7,stratify=data['labels'])

In [16]:
print('X_train',X_train.shape)
print('X_test',X_test.shape)
print('Y_train',Y_train.shape)
print('Y_test',Y_test.shape)

X_train (1100,)
X_test (275,)
Y_train (1100,)
Y_test (275,)


## Perform classification (using GridSearch)

In [17]:
# perform GridSearch CV with 10 fold CV using pipeline and tuned_paramters defined above 
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(text_clf,tuned_parameters,cv=10)
clf.fit(X_train,Y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        pre

In [18]:
clf.best_params_

{'clf__alpha': 0.1,
 'tfidf__norm': 'l2',
 'tfidf__use_idf': False,
 'vect__ngram_range': (1, 2)}

In [19]:
Y_pred=clf.predict(X_test)

## Classification report 

In [20]:
# print classification report after predicting on test set with best model obtained in GridSearch
from sklearn.metrics import classification_report,confusion_matrix
Y_pred=clf.predict(X_test)
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       0.87      0.97      0.92       188
           1       0.88      0.72      0.79        71
           2       1.00      0.38      0.55        16

    accuracy                           0.87       275
   macro avg       0.92      0.69      0.75       275
weighted avg       0.88      0.87      0.86       275



In [21]:
print(confusion_matrix(Y_test,Y_pred)) ## the recall is not good for class 1 and 2.

[[183   5   0]
 [ 20  51   0]
 [  8   2   6]]


## Important:

In [22]:
counts = data.labels.value_counts()
print(counts)

0    942
1    352
2     81
Name: labels, dtype: int64


In [23]:
!pip install imblearn



We can see above that the class distribution is highly imbalanced, this would not lead to good sampling of the data for the classifier. For your learning, try using [SMOTE](https://imbalanced-learn.readthedocs.io/en/stable/api.html) to oversample the minority classes and then evaluate the performance with Naive Bayes and compare.

In [24]:
from imblearn.over_sampling import SMOTE, SVMSMOTE
from imblearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')



In [25]:
sampler = SMOTE(random_state=0)

text_clf_sm = Pipeline([                        
                      ('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('smot',SMOTE()),
                     ('clf', MultinomialNB())
                     ])

#p1=Pipeline(sampler,text_clf_sm)

tuned_parameters_sm = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'smot__k_neighbors' : [5,10,15,20],
    #'sampling_strategy':(str),
    'clf__alpha': [1, 1e-1, 1e-2]
}

In [26]:
clf_sm = GridSearchCV(text_clf_sm,tuned_parameters_sm,cv=10)
clf_sm.fit(X_train,Y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        pre

In [27]:
clf_sm.best_params_

{'clf__alpha': 0.01,
 'smot__k_neighbors': 20,
 'tfidf__norm': 'l1',
 'tfidf__use_idf': True,
 'vect__ngram_range': (2, 2)}

In [28]:
## We can see now that the recall value has increased for class 1 and 2.
Y_pred=clf_sm.predict(X_test)
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       0.86      0.90      0.88       188
           1       0.80      0.68      0.73        71
           2       0.33      0.38      0.35        16

    accuracy                           0.81       275
   macro avg       0.67      0.65      0.66       275
weighted avg       0.82      0.81      0.81       275

