# IV. Rule-Based Sentiment Analysis

In [222]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.tokenize import sent_tokenize
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.metrics.distance  import edit_distance
from nltk.corpus import words
import os
from nltk.corpus import stopwords
from textblob import TextBlob
from wordcloud import WordCloud
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from autocorrect import Speller

In [223]:
# Load the labelled dataset and keep a working copy with only the two
# columns used below; `data` itself stays untouched.
data = pd.read_csv('A1_dataset.csv')
data_new = data.loc[:, ['LABEL', 'TEXT']].copy()
data_new.head(2)

Unnamed: 0,LABEL,TEXT
0,0,About to get threaded and scared
1,1,@awaisnaseer I like Shezan Mangooo too!!! I ha...


In [224]:
#pip install vaderSentiment

In [225]:
#nltk.download('vader_lexicon')

## Helper Functions

In [226]:
def tokenize(data):
    """Run ``word_tokenize`` on every entry of ``data_new['TEXT']``.

    The ``data`` argument is never read: the function always transforms the
    notebook-level ``data_new`` frame and overwrites its ``TEXT`` column.

    Returns:
        The updated ``data_new['TEXT']`` column.
    """
    tokenized = data_new['TEXT'].apply(word_tokenize)
    data_new['TEXT'] = tokenized
    return data_new['TEXT']

def lower(data):
    """Lower-case every entry of ``data_new['TEXT']``.

    Entries are first cast with ``astype(str)``, so non-string values (for
    example token lists) are turned into their printed form before being
    lower-cased. The ``data`` argument is never read; the notebook-level
    ``data_new`` frame is updated in place.

    Returns:
        The updated ``data_new['TEXT']`` column.
    """
    as_strings = data_new['TEXT'].astype(str)
    data_new['TEXT'] = as_strings.str.lower()
    return data_new['TEXT']

def remove_punctuations(data):
    """Return ``data`` with every ``string.punctuation`` character deleted.

    Args:
        data: A single text string.

    Returns:
        The string without any ASCII punctuation characters.
    """
    # One translation table deletes all punctuation characters in a single pass.
    deletion_table = str.maketrans('', '', string.punctuation)
    return data.translate(deletion_table)

def remove_punctuation_tokens(data):
    """Strip punctuation from every entry of ``data_new['TEXT']``.

    Applies ``remove_punctuations`` entry by entry. The ``data`` argument is
    never read; the notebook-level ``data_new`` frame is updated in place.

    Returns:
        The updated ``data_new['TEXT']`` column.
    """
    without_punctuation = data_new['TEXT'].apply(remove_punctuations)
    data_new['TEXT'] = without_punctuation
    return data_new['TEXT']

def remove_stopwords(data):
    """Drop English stopwords from every entry of ``data_new['TEXT']``.

    Each entry is split on whitespace, words found in NLTK's English stopword
    list are discarded, and the remaining words are re-joined with single
    spaces. The ``data`` argument is never read; the notebook-level
    ``data_new`` frame is updated in place.

    Returns:
        The updated ``data_new['TEXT']`` column.
    """
    # A set gives constant-time membership tests instead of scanning the
    # whole stopword list once for every word of every entry.
    stop = set(stopwords.words("english"))

    def _drop_stopwords(text):
        return ' '.join(word for word in text.split() if word not in stop)

    data_new['TEXT'] = data_new['TEXT'].apply(_drop_stopwords)
    return data_new['TEXT']

def remove_URL(data):
    """Return ``data`` with every run starting at ``http`` removed.

    A match begins at the literal ``http`` and extends over all following
    non-whitespace characters, so both ``http://`` and ``https://`` links
    are deleted. Surrounding whitespace is left in place.
    """
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', data)

def remove_URL_data(data):
    """Remove URLs from every entry of ``data_new['TEXT']``.

    Applies ``remove_URL`` entry by entry. The ``data`` argument is never
    read; the notebook-level ``data_new`` frame is updated in place.

    Returns:
        The updated ``data_new['TEXT']`` column.
    """
    without_urls = data_new['TEXT'].apply(remove_URL)
    data_new['TEXT'] = without_urls
    return data_new['TEXT']

def remove_HTMLTag(data):
    """Return ``data`` with named HTML entities such as ``&amp;`` removed.

    Despite the name, no ``<tag>`` markup is touched: only sequences of the
    form ``&`` + word characters + ``;`` are deleted. Numeric entities such
    as ``&#39;`` do not match because ``#`` is not a word character.
    """
    entity_pattern = re.compile(r'&\w+;')
    return entity_pattern.sub('', data)

def remove_HTMLTag_data(data):
    """Remove named HTML entities from every entry of ``data_new['TEXT']``.

    Applies ``remove_HTMLTag`` entry by entry. The ``data`` argument is never
    read; the notebook-level ``data_new`` frame is updated in place.

    Returns:
        The updated ``data_new['TEXT']`` column.
    """
    without_entities = data_new['TEXT'].apply(remove_HTMLTag)
    data_new['TEXT'] = without_entities
    return data_new['TEXT']

def lemmatize(data):
    """Lemmatize each item of ``data`` with a ``WordNetLemmatizer``.

    Args:
        data: An iterable of tokens.

    Returns:
        A new list holding the lemmatized form of every item, in order.
    """
    wordnet = WordNetLemmatizer()
    return list(map(wordnet.lemmatize, data))
    
def lemmatize_data(data):
    """Lemmatize every entry of ``data_new['TEXT']``.

    Applies ``lemmatize`` entry by entry. The ``data`` argument is never
    read; the notebook-level ``data_new`` frame is updated in place.

    Returns:
        The updated ``data_new['TEXT']`` column.
    """
    lemmatized = data_new['TEXT'].apply(lemmatize)
    data_new['TEXT'] = lemmatized
    return data_new['TEXT']


def stemming(data):
    """Stem each item of ``data`` with NLTK's ``PorterStemmer``.

    Args:
        data: An iterable of tokens.

    Returns:
        A new list holding the stem of every item, in order.
    """
    porter = PorterStemmer()
    return list(map(porter.stem, data))
    
def stemming_data(data):
    """Stem every entry of ``data_new['TEXT']``.

    Applies ``stemming`` entry by entry. The ``data`` argument is never read;
    the notebook-level ``data_new`` frame is updated in place.

    Returns:
        The updated ``data_new['TEXT']`` column.
    """
    stemmed = data_new['TEXT'].apply(stemming)
    data_new['TEXT'] = stemmed
    return data_new['TEXT']

def remove_username_func(data):
    """Return ``data`` without @mentions, ``#`` characters and digit runs.

    Three kinds of text are deleted: ``@`` followed by word characters,
    every literal ``#`` (the hashtag word itself is kept), and every run of
    digits.
    """
    noise_pattern = re.compile(r'\@\w+|\#|\d+')
    return noise_pattern.sub('', data)

def remove_username(data):
    """Remove mentions, ``#`` signs and digits from ``data_new['TEXT']``.

    Applies ``remove_username_func`` entry by entry. The ``data`` argument is
    never read; the notebook-level ``data_new`` frame is updated in place.

    Returns:
        The updated ``data_new['TEXT']`` column.
    """
    cleaned = data_new['TEXT'].apply(remove_username_func)
    data_new['TEXT'] = cleaned
    return data_new['TEXT']

In [227]:
def remove_words_func(data):
    """Return ``data`` with every word of one to three characters removed.

    A word here is a run of word characters delimited by word boundaries, so
    short words such as "bad" or "not" are deleted as well. The whitespace
    around removed words is kept.
    """
    short_word_pattern = re.compile(r'\b\w{1,3}\b')
    return short_word_pattern.sub('', data)

def remove_words(data):
    """Remove one- to three-character words from ``data_new['TEXT']``.

    Applies ``remove_words_func`` entry by entry. The ``data`` argument is
    never read; the notebook-level ``data_new`` frame is updated in place.

    Returns:
        The updated ``data_new['TEXT']`` column.
    """
    without_short_words = data_new['TEXT'].apply(remove_words_func)
    data_new['TEXT'] = without_short_words
    return data_new['TEXT']

def remove_white_spaces(data):
    """Trim leading and trailing whitespace from ``data_new['TEXT']``.

    Only the ends of each entry are stripped; whitespace between words is
    left as is. The ``data`` argument is never read; the notebook-level
    ``data_new`` frame is updated in place.

    Returns:
        The updated ``data_new['TEXT']`` column.
    """
    def _trim(entry):
        return entry.strip()

    data_new['TEXT'] = data_new['TEXT'].apply(_trim)
    return data_new['TEXT']

def spell_correction(data):
    """Spell-correct every word of every entry in ``data_new['TEXT']``.

    Each entry is split on whitespace, every word is passed through an
    ``autocorrect`` ``Speller``, and the results are re-joined with single
    spaces. The ``data`` argument is never read; the notebook-level
    ``data_new`` frame is updated in place.

    Returns:
        The updated ``data_new['TEXT']`` column.
    """
    spell = Speller()
    corrected_entries = []
    for entry in data_new['TEXT']:
        corrected_words = [spell(word) for word in entry.split()]
        corrected_entries.append(' '.join(corrected_words))
    data_new['TEXT'] = corrected_entries
    return data_new['TEXT']

## VADER

In [245]:
def a(data):
    """Score every entry of ``data_new['TEXT']`` with VADER.

    Runs ``SentimentIntensityAnalyzer.polarity_scores`` on each text and
    stores the four resulting values in the ``Compound``, ``Negative``,
    ``Neutral`` and ``Positive`` columns of the notebook-level ``data_new``
    frame. The ``data`` argument is never read.

    Returns:
        ``data_new`` with the four score columns added or overwritten.
    """
    analyser = SentimentIntensityAnalyzer()

    # One dict of scores per text, in row order.
    scores = [analyser.polarity_scores(text) for text in data_new['TEXT']]

    # Reuse data_new's own index so the column assignments below line up row
    # for row even when the frame does not carry a default 0..n-1 index, and
    # name the columns explicitly so an empty frame still yields all four.
    polarity_score = pd.DataFrame(
        scores,
        index=data_new.index,
        columns=['neg', 'neu', 'pos', 'compound'],
    )

    data_new['Compound'] = polarity_score['compound']
    data_new['Negative'] = polarity_score['neg']
    data_new['Neutral'] = polarity_score['neu']
    data_new['Positive'] = polarity_score['pos']

    return data_new

In [3]:
# import pandas as pd
# d = [{1:10, 2:20}, {1:30, 2:40}]
# pd.DataFrame(d)

Unnamed: 0,1,2
0,10,20
1,30,40


In [246]:
def b(data):
    """Label each row Positive, Negative or Neutral from its compound score.

    Rows with ``Compound >= 0.05`` become "Positive", rows with
    ``Compound <= -0.05`` become "Negative", and everything else (including
    missing scores) becomes "Neutral".

    Args:
        data: DataFrame holding a ``Compound`` column. Any other argument
            falls back to the notebook-level ``data_new`` frame.

    Returns:
        The labelled frame, with the ``Polarity`` column added in place.
    """
    has_scores = isinstance(data, pd.DataFrame) and 'Compound' in data.columns
    frame = data if has_scores else data_new

    # Work on the raw values so labelling is positional and does not depend
    # on the frame carrying a default 0..n-1 index.
    compound = frame['Compound'].to_numpy()
    frame['Polarity'] = np.select(
        [compound >= 0.05, compound <= -0.05],
        ["Positive", "Negative"],
        default="Neutral",
    )
    return frame

## Main

In [230]:
def main():
    """Run the preprocessing pipeline on ``data_new`` and score it with VADER.

    Steps, in order: URL removal, HTML-entity removal, mention/``#``/digit
    removal, spell correction, tokenization, lemmatization, stemming,
    lower-casing, punctuation removal, stopword removal, short-word removal,
    whitespace trimming, VADER scoring (``a``) and polarity labelling (``b``).

    Every step rewrites the notebook-level ``data_new`` frame in place;
    nothing is returned.
    """
    text = remove_URL_data(data_new)
    text = remove_HTMLTag_data(text)
    text = remove_username(text)
    text = spell_correction(text)
    text = tokenize(text)
    text = lemmatize_data(text)
    text = stemming_data(text)
    # After stemming each entry is a list of tokens. lower() casts entries
    # with astype(str), turning each list into its printed form, and the
    # punctuation step then strips the brackets, quotes and commas, leaving
    # a space-separated string again.
    text = lower(text)
    text = remove_punctuation_tokens(text)
    text = remove_stopwords(text)
    text = remove_words(text)
    text = remove_white_spaces(text)
    scored = a(text)
    b(scored)

In [231]:
# Run the preprocessing and scoring pipeline; it rewrites data_new in place
# and returns nothing.
main()

In [232]:
# Preview the first five rows after the pipeline has run.
data_new.head(5)

Unnamed: 0,LABEL,TEXT,Compound,Negative,Neutral,Positive,Polarity
0,0,thread scare,-0.4939,0.762,0.238,0.0,Negative
1,1,like sean mango yesterday,0.3612,0.0,0.545,0.455,Positive
2,1,work work show sooooooooooo tire sparrow si...,0.0,0.0,1.0,0.0,Neutral
3,1,actual start afternoon someth slow process ...,0.0,0.0,1.0,0.0,Neutral
4,1,gido worryw vote nonstop love much,0.6369,0.0,0.543,0.457,Positive


## Accuracy On Processed Text

In [243]:
# Per-class accuracy of the VADER compound score against the LABEL column.
# A row labelled 1 counts as correct when its compound score is > -0.01;
# a row labelled 0 counts as correct when its compound score is <= -0.01.
# The Polarity column is not used here.
labels = data_new['LABEL']
compound = data_new['Compound']
is_positive = labels == 1
is_negative = labels == 0

count_pos = int((is_positive & (compound > -0.01)).sum())   # correctly scored positives
count_neg = int((is_negative & (compound <= -0.01)).sum())  # correctly scored negatives
tot_pos_count = int(is_positive.sum())                      # rows labelled 1
tot_neg_count = int(is_negative.sum())                      # rows labelled 0

# Rows with any other label are left out of the counts instead of raising.
other_label_count = len(data_new) - tot_pos_count - tot_neg_count
if other_label_count:
    print("Skipped {} rows whose LABEL is neither 0 nor 1".format(other_label_count))

# Guard the ratios so an empty class reports nan instead of dividing by zero.
total_count = tot_pos_count + tot_neg_count
pos_accuracy = count_pos / tot_pos_count * 100.0 if tot_pos_count else float('nan')
neg_accuracy = count_neg / tot_neg_count * 100.0 if tot_neg_count else float('nan')
overall_accuracy = ((count_pos + count_neg) / total_count) * 100 if total_count else float('nan')

print("Actual positive count:" + str(count_pos))
print("Actual negative count:" + str(count_neg))
print("Total positive label:" + str(tot_pos_count))
print("Total negative label:" + str(tot_neg_count))

print("Positive accuracy = {}% of total {} samples".format(pos_accuracy, tot_pos_count))
print("Negative accuracy = {}% of total {} samples".format(neg_accuracy, tot_neg_count))
print("Overall Accuracy = {}% of total {} samples".format(overall_accuracy, total_count))

Actual positive count:2105
Actual negative count:573
Total positive label:2287
Total negative label:2000
Positive accuracy = 92.04197638828158% of total 2287 samples
Negative accuracy = 28.65% of total 2000 samples
Overall Accuracy = 62.46792628878003% of total 4287 samples


## Accuracy On Raw Text

In [257]:
# Rebuild data_new from the untouched `data` frame: this discards the
# preprocessed TEXT and the score columns so the cells below work on raw text.
data_new = data[['LABEL', 'TEXT']].copy()

In [258]:
def raw_text():
    """Score the current ``data_new`` text with VADER and label its polarity.

    Calls ``a`` on ``data_new`` and then ``b`` on the result, with no
    preprocessing step in between. Nothing is returned.
    """
    scored = a(data_new)
    b(scored)

In [259]:
# Score the raw text held in data_new; it returns nothing.
raw_text()

In [260]:
# Preview the first ten raw-text rows after scoring.
data_new.head(10)

Unnamed: 0,LABEL,TEXT,Compound,Negative,Positive,Polarity
0,0,About to get threaded and scared,-0.4404,0.367,0.0,Negative
1,1,@awaisnaseer I like Shezan Mangooo too!!! I ha...,0.5229,0.0,0.273,Positive
2,1,worked on my car after work. showering then go...,-0.4404,0.146,0.0,Negative
3,1,@Marama Actually we start this afternoon! I w...,0.0,0.0,0.0,Positive
4,1,@gfalcone601 Aww Gi.don't worry.we'll vote for...,0.6369,0.0,0.244,Positive
5,1,@mrstessyman What ever you do have a good day....,0.7964,0.0,0.415,Positive
6,1,"@GetMeVideo Sorry, not my forte, Ask me about...",-0.0772,0.08,0.0,Negative
7,1,Getting ready for church and bummed I cannot w...,0.3612,0.0,0.122,Positive
8,1,Up early tomorrow. Last open home. Goodnight,0.0,0.0,0.0,Positive
9,0,Needs to shake this gloomy feeling!! Maybe ...,-0.4721,0.35,0.115,Negative


In [263]:
# Per-class accuracy of the VADER compound score against the LABEL column,
# computed on whatever data_new currently holds.
# A row labelled 1 counts as correct when its compound score is > -0.01;
# a row labelled 0 counts as correct when its compound score is <= -0.01.
# The Polarity column is not used here.
labels = data_new['LABEL']
compound = data_new['Compound']
is_positive = labels == 1
is_negative = labels == 0

count_pos = int((is_positive & (compound > -0.01)).sum())   # correctly scored positives
count_neg = int((is_negative & (compound <= -0.01)).sum())  # correctly scored negatives
tot_pos_count = int(is_positive.sum())                      # actual positives (LABEL == 1)
tot_neg_count = int(is_negative.sum())                      # actual negatives (LABEL == 0)

# Rows with any other label are left out of the counts instead of raising.
other_label_count = len(data_new) - tot_pos_count - tot_neg_count
if other_label_count:
    print("Skipped {} rows whose LABEL is neither 0 nor 1".format(other_label_count))

# Guard the ratios so an empty class reports nan instead of dividing by zero.
total_count = tot_pos_count + tot_neg_count
pos_accuracy = count_pos / tot_pos_count * 100.0 if tot_pos_count else float('nan')
neg_accuracy = count_neg / tot_neg_count * 100.0 if tot_neg_count else float('nan')
overall_accuracy = ((count_pos + count_neg) / total_count) * 100 if total_count else float('nan')

print("Actual positive count:" + str(count_pos))
print("Actual negative count:" + str(count_neg))
print("Total positive label:" + str(tot_pos_count))
print("Total negative label:" + str(tot_neg_count))

print("Positive accuracy = {}% of total {} samples".format(pos_accuracy, tot_pos_count))
print("Negative accuracy = {}% of total {} samples".format(neg_accuracy, tot_neg_count))
print("Overall Accuracy = {}% of total {} samples".format(overall_accuracy, total_count))

Actual positive count:2057
Actual negative count:884
Total positive label:2287
Total negative label:2000
Positive accuracy = 89.94315697420201% of total 2287 samples
Negative accuracy = 44.2% of total 2000 samples
Overall Accuracy = 68.60275250758106% of total 4287 samples


In [262]:
#!pip install autocorrect