In [7]:
# import libraries
import pandas as pd
import pickle
import re
import warnings
import numpy as np

warnings.filterwarnings('ignore')

In [8]:
# read dataset
df = pd.read_csv("A2_dataset.csv")
df.head()

Unnamed: 0,LABEL,DATE_TIME,TEXT
0,0,Fri Jun 05 14:26:50 2009,About to get threaded and scared
1,1,Thu May 14 10:13:55 2009,@awaisnaseer I like Shezan Mangooo too!!! I ha...
2,1,Fri Jun 05 21:02:20 2009,worked on my car after work. showering then go...
3,1,Sun Jun 14 22:25:52 2009,@Marama Actually we start this afternoon! I w...
4,1,Sun May 31 00:42:12 2009,@gfalcone601 Aww Gi.don't worry.we'll vote for...


In [9]:
# check count of each labels in dataset

df.LABEL.value_counts()

1    2287
0    2000
Name: LABEL, dtype: int64

In [10]:
# # install libraries

# import nltk

# !pip install autocorrect
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [11]:
# import libraries

from nltk.stem.snowball import stopwords
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from autocorrect import Speller

In [6]:
def preprocess(df_temp):
    # create required columns and initialize with 0
    df_temp['white_space_removed'] = 0
    df_temp['tokenized_data'] = 0
    df_temp['stopword_removed_data'] = 0
    df_temp['punct_removed_data'] = 0
    df_temp['url_removed_data'] = 0
    df_temp['spelling_checked_data'] = 0
    df_temp['lemmetized_data'] = 0



    # create object for lemmetizer and spelling checking
    lemmatizer = WordNetLemmatizer()
    spell = Speller(lang='en')


    # iterate over each row of dataset and preprocess data
    for i in range(df_temp.shape[0]):

      # white space removel
      df_temp['white_space_removed'][i] = re.sub("\s+", " ", df_temp.TEXT[i])


      # tokenization
      lower = df_temp['white_space_removed'][i].lower()
      tokenized_data = word_tokenize(lower)
      df_temp['tokenized_data'][i] = tokenized_data
      # print(tokenized_data)


      # remove stopwords
      stop_words = "|".join(stopwords.words('english'))
      pattern = re.compile(r'\b(' + stop_words + r')\b\s*')
      stopword_removed_data = [pattern.sub("", text) for text in tokenized_data]
      stopword_removed_data = [x for x in stopword_removed_data if x]
      df_temp['stopword_removed_data'][i] = stopword_removed_data 
      # print(stopword_removed_data)


      # punctuation removel
      punct_removed_data = [x for x in stopword_removed_data if x.isalnum()]
      df_temp['punct_removed_data'][i] = punct_removed_data
      # print(punct_removed_data)


      # remove urls and html tags
      urls = re.findall("https?://[a-zA-Z0-9_\?=\@\/#=.~-]+", " ".join(punct_removed_data))
      url_removed_data = [x for x in punct_removed_data if x not in urls]
      df_temp['url_removed_data'][i] = url_removed_data
      # print(url_removed_data)  


      # spelling checking
      spelling_checked_data = [spell(x) for x in url_removed_data]
      df_temp['spelling_checked_data'][i] = spelling_checked_data
      # print(spelling_checked_data)


      # lemmetization
      lemmas = []
      for w in spelling_checked_data:
        lemmas.append(lemmatizer.lemmatize(w, wordnet.VERB))
      df_temp['lemmetized_data'][i] = lemmas


    df_temp['preprocessed_txt']=0
    for i in range(df_temp.shape[0]):
        df_temp['preprocessed_txt'][i]=" ".join(df_temp['lemmetized_data'][i])

    display(df_temp.head(3))

    df_temp = df_temp[['LABEL','DATE_TIME','preprocessed_txt']]
    display(df_temp.head(3))


    for i in range(df_temp.shape[0]):
        x=df_temp['preprocessed_txt'][i]
        if(type(x)!='str'):
            x=str(x)
        x = '<s> ' + x + ' </s>'
        df_temp['preprocessed_txt'][i]=x
        
    
    display(df_temp.head(3))
    return df_temp

In [7]:
# data = preprocess(df)
# data.to_csv('A2_dataset_processed.csv',encoding='utf-8-sig', index=False)

In [12]:
data = pd.read_csv("A2_dataset_processed.csv")
data.head(3)

Unnamed: 0,LABEL,DATE_TIME,preprocessed_txt
0,0,Fri Jun 05 14:26:50 2009,<s> get thread scar </s>
1,1,Thu May 14 10:13:55 2009,<s> awaisnaseer like sedan mango one yesterday...
2,1,Fri Jun 05 21:02:20 2009,<s> work car work show go bed sooooooooooo tir...


## Question 1- smoothed frequencies of bigrams


In [13]:
#count unigram frequencies and prepare vocabulary set

vocab = set()
unigram_count={}
for x in data['preprocessed_txt'].to_list():
    
    for y in x.split():
        vocab.add(y)
        if y in unigram_count:
            unigram_count[y]+=1
        else:
            unigram_count[y]=1

print(len(vocab))
print(len(unigram_count))
vocab=list(vocab)

7348
7348


In [14]:
#count bigram frequencies of only existing bigrams; rest are trivially 0
bigram_count={}      
            
            
for line in data.preprocessed_txt.to_list():
            list_words = line.split()
            for k in range(len(list_words)-1):
                i=list_words[k]
                j=list_words[k+1]
                if(i,j) in bigram_count:
                    bigram_count[(i,j)]+=1
                else:
                    bigram_count[(i,j)]=1                
                
list(bigram_count.items())[:5]

[(('<s>', 'get'), 67),
 (('get', 'thread'), 1),
 (('thread', 'scar'), 1),
 (('scar', '</s>'), 2),
 (('<s>', 'awaisnaseer'), 1)]

In [11]:
# calculate updated bigram probability using laplace transform over entire dataset and print top 4 
v=vocab
length=len(vocab)
p_bigrams_lap={}
for i in v:
    for j in v:
        if(i,j) in bigram_count:
            p_bigrams_lap[(i,j)] = (bigram_count[(i,j)]+1)/(unigram_count[i]+length) 
        else:
            p_bigrams_lap[(i,j)] = 1/(unigram_count[i]+length)
            

            
list(p_bigrams_lap.items())[:5]

## Save smoothed bigram language model


In [25]:
with open('p_bigrams_lap.pickle', 'wb') as handle:
    pickle.dump(p_bigrams_lap, handle, protocol=pickle.HIGHEST_PROTOCOL)
del p_bigrams_lap

In [15]:
with open('p_bigrams_lap.pickle', 'rb') as handle:
    p_bigrams_lap = pickle.load(handle)

## Top-4 bigrams and their score after smoothing.

In [8]:
from collections import Counter
value ,count=[]
value, count = Counter(p_bigrams_lap.values()).most_common(4)

((‘http’, </s>), 0.014)
((‘day’, </s>), 0.0089)
((‘lol’, </s>), 0.0082)
((‘work’, </s>), 0.0064)


In [16]:
import random

def next_word(context):#generate most probable next word after 'context'
    r=random.random()
    map_to_probs = {}
    
    for token in vocab:
        map_to_probs[token] = p_bigrams_lap[(context, token)] #prob. of occurrence of each word 'token' after 'context'

    summ = 0  #summ stores cumulative probabilities of occurrence of a word after a 'context' 
    for token in (map_to_probs):
        summ += map_to_probs[token] 
        if (summ > r):
            return token

In [17]:
def generate_text(): #returns 1 generated sentence

        minn=7
        maxx=20
        context_queue = '<s>' #current last word seen/generated; sent as 'context' to next_word() function
        result = ['<s>']  #entire sentence generated upto present time 
        
        c = 1
        while  c<=maxx :
            obj = next_word(context_queue)
            if obj == '</s>' and c <= minn:
                continue
            elif obj == '</s>' and c > minn:
                break
            else: 
                context_queue=obj
                result.append(obj)
                c += 1
                            
        result.append('</s>')
        return ' '.join(result)

In [19]:
# generate required 500 sentences using smoothed probabilities
sentences=[]
for i in range(500):
    sentences.append(generate_text())
sentences[:5]

['<s> joeymcintyre rfargnoli studios chicks durham perezhilton pictureeee tedmurphy by chess bakerzin di garage nude problems coke iilovejbxox eat sharenwatson shell </s>',
 '<s> split wb goodness jennluvs2sing partially jimmycostello everybody yard gps greater emmys20 goooooooooooooood swim you iris mnr barney iris plain sal </s>',
 '<s> back bacon pirrofina potency bendaubney oo online foreverivy bathroom lace shame business calibre passion unlike tinyurl stub mi88s mmmmmmm broadband </s>',
 '<s> michaels create dizzee phoooone configurable stardust channel foundation mygoldenchild17 2getting hahahahah facebook york ff curious soil purple tho caitlinlynn nose </s>',
 '<s> bananabby lucyfurleaps cinetrip 09 devonmarie78 fill subset chesterfield fly boleyn explorations 100 melissa officer plague paloooos case quieter assets 3rd </s>']

In [20]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

scores = []
mod_scores = []
labels = []

def sentiment_scores(sentence):
    sid_obj = SentimentIntensityAnalyzer()
 
    # polarity_scores method of SentimentIntensityAnalyzer
    # object gives a sentiment dictionary.
    # which contains pos, neg, neu, and compound scores.
    
    sentiment_dict = sid_obj.polarity_scores(sentence) 
    scores.append(sentiment_dict['compound'])
    mod_scores.append(abs(sentiment_dict['compound']))
    
    # decide sentiment as positive, negative and neutral
    if sentiment_dict['compound'] >= 0 :
        labels.append(1)
    else:
        labels.append(0)
        
        
for s in sentences:
    sentiment_scores(s) 

In [21]:
df_no_beta = pd.DataFrame({'sentences': sentences, 'labels': labels, 'vader_scores': scores, 'mod_vader_scores': mod_scores})
df_no_beta.head(5)

Unnamed: 0,sentences,labels,vader_scores,mod_vader_scores
0,<s> joeymcintyre rfargnoli studios chicks durh...,0,-0.4019,0.4019
1,<s> split wb goodness jennluvs2sing partially ...,1,0.6705,0.6705
2,<s> back bacon pirrofina potency bendaubney oo...,0,-0.0258,0.0258
3,<s> michaels create dizzee phoooone configurab...,1,0.7351,0.7351
4,<s> bananabby lucyfurleaps cinetrip 09 devonma...,1,0.25,0.25


In [20]:
df_no_beta.to_csv('df_no_beta', index=False)

In [21]:
df_no_beta = pd.read_csv('df_no_beta')
mod_scores = df_no_beta.mod_vader_scores

In [87]:
avg_vader_score_no_beta = sum(mod_scores) / len(mod_scores)
print(avg_vader_score_no_beta)

0.38433359999999983


In [22]:
df1 = data.loc[data.LABEL == 1]
display(df1.head())

Unnamed: 0,LABEL,DATE_TIME,preprocessed_txt
1,1,Thu May 14 10:13:55 2009,<s> awaisnaseer like sedan mango one yesterday...
2,1,Fri Jun 05 21:02:20 2009,<s> work car work show go bed sooooooooooo tir...
3,1,Sun Jun 14 22:25:52 2009,<s> drama actually start afternoon try somethi...
4,1,Sun May 31 00:42:12 2009,<s> falcon601 www vote col love much </s>
5,1,Sun May 17 03:26:30 2009,<s> mrstessyman ever good day love knitpicks </s>


In [23]:
# unigram counts for datasets with only positive labels separately

#unigram frequencies
v1 = set()
unigram_count1={}
for x in df1['preprocessed_txt'].to_list():
    
    for y in x.split():
        v1.add(y)
        if y in unigram_count1:
            unigram_count1[y]+=1
        else:
            unigram_count1[y]=1

print(len(v1))
print(len(unigram_count1))
v1=list(v1)

5038
5038


In [24]:
#count bigram frequencies of only existing bigrams for positive label sentences; rest are trivially 0
bigram_count1={}      
            
            
for line in df1.preprocessed_txt.to_list():
            list_words = line.split()
            for k in range(len(list_words)-1):
                i=list_words[k]
                j=list_words[k+1]
                if(i,j) in bigram_count1:
                    bigram_count1[(i,j)]+=1
                else:
                    bigram_count1[(i,j)]=1                
                
print(list(bigram_count1.items())[:5])

[(('<s>', 'awaisnaseer'), 1), (('awaisnaseer', 'like'), 1), (('like', 'sedan'), 1), (('sedan', 'mango'), 1), (('mango', 'one'), 1)]


In [33]:
#calculate updated bigram probability using laplace transform over entire dataset
length=len(v1)
p_bigrams_lap1={}
for i in v1:
    for j in v1:
        if (i,j) in bigram_count1:
            p_bigrams_lap1[(i,j)] = (bigram_count1[(i,j)]+1)/(unigram_count1[i]+length) 
        else:
            p_bigrams_lap1[(i,j)] = 1/(unigram_count1[i]+length)
            
list(p_bigrams_lap1.items())[:5]

In [28]:
with open('p_bigrams_lap1.pickle', 'wb') as handle:
    pickle.dump(p_bigrams_lap1, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [29]:
with open('p_bigrams_lap1.pickle', 'rb') as handle:
    p_bigrams_lap1 = pickle.load(handle)

In [30]:
del p_bigrams_lap1

In [31]:
# calculating smoothed bigram prob including beta
length = len(vocab)
p_bigrams_beta1 = {}

for i in vocab:
    for j in vocab:
        if (i,j) not in bigram_count and (i,j) not in bigram_count1 and i not in unigram_count1:
            p_bigrams_beta1[(i,j)] = ((0 + 1 + 5*0) / (unigram_count[i] + length + 5*0))
        elif (i,j) in bigram_count and (i,j) not in bigram_count1 and i not in unigram_count1:
            p_bigrams_beta1[(i,j)] = ((bigram_count[(i,j)] + 1 + 5*0) / 
                                     (unigram_count[i] + length + 5*0))        
        elif (i,j) not in bigram_count and (i,j) not in bigram_count1 and i in unigram_count1:
            p_bigrams_beta1[(i,j)] = ((0 + 1 + 5*0) / 
                                     (unigram_count[i] + length + 5*unigram_count1[i]))
        elif (i,j) in bigram_count and (i,j) not in bigram_count1 and i in unigram_count1:
            p_bigrams_beta1[(i,j)] = ((bigram_count[(i,j)] + 1 + 5*0) / 
                                     (unigram_count[i] + length + 5*unigram_count1[i]))
        elif (i,j) in bigram_count and (i,j) in bigram_count1 and i in unigram_count1:
            p_bigrams_beta1[(i,j)] = ((bigram_count[(i,j)] + 1 + 5*bigram_count1[(i,j)]) / 
                                     (unigram_count[i] + length + 5*unigram_count1[i]))        
list(p_bigrams_beta1.items())[:5]

In [32]:
# with open('p_bigrams_beta1.pickle', 'wb') as handle:
#     pickle.dump(p_bigrams_beta1, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [41]:
del p_bigrams_beta1

In [42]:
with open('p_bigrams_beta1.pickle', 'rb') as handle:
    p_bigrams_beta1 = pickle.load(handle)

In [43]:
import random

def next_word(context):#generate most probable next word after 'context'
    r=random.random()
    map_to_probs = {}
    
    for token in vocab:
        map_to_probs[token] = p_bigrams_beta1[(context, token)] #prob. of occurrence of each word 'token' after 'context'

    summ = 0  #summ stores cumulative probabilities of occurrence of a word after a 'context' 
    for token in (map_to_probs):
        summ += map_to_probs[token] 
        if (summ > r):
            return token


def generate_text(): #returns 1 generated sentence

        minn=7
        maxx=20
        context_queue = '<s>' #current last word seen/generated; sent as 'context' to next_word() function
        result = ['<s>']  #entire sentence generated upto present time 
        
        c = 1
        while  c<=maxx :
            obj = next_word(context_queue)
            if obj == '</s>' and c <= minn:
                continue
            elif obj == '</s>' and c > minn:
                break
            else: 
                context_queue=obj
                result.append(obj)
                c += 1
                            
        result.append('</s>')
        return ' '.join(result)

In [44]:
#genrate 500 sentences using smoothed probabilities with beta positive
sentences_beta1 = []

for i in range(500):
    sentences_beta1.append(generate_text())
sentences_beta1[:5]

['<s> bnp nicole unblock conference tight ahahahhahaahhahh range91 bball comformfortable itschristablack fancy reign bella vis stevebrunton carcassonne brothers teemwilliams receive survive </s>',
 '<s> poster quit piratesswoop cartoon mynameisforge rd kudos africa airlines advantage brothers god4movers thunderstorm algebra iranelection kris daviddjfrancis twice scratch macabroso </s>',
 '<s> thelarssan 45sinyoureyes hairbrained nuggets straight gotta menu19 arent soo kirstiealley stay joey stuff woofwednesday cant decline gv odds handy jaffacakes </s>',
 '<s> get lucas mrsmcflygrimmy redemption teenhearts worth katie freshman form nm animal chloe border lay greatness food broadcast creamcheese mwuahmwuah nanny </s>',
 '<s> goodnight boil downstairs u ataxia pink doingwork xiaomantous moscow mate recognition marshal app mitchelmusso lonely psp ten latitude euphemism faire </s>']

In [39]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

scores_beta1 = []
mod_scores_beta1 = []
labels_beta1 = []

def sentiment_scores(sentence):
    sid_obj = SentimentIntensityAnalyzer()
 
    # polarity_scores method of SentimentIntensityAnalyzer
    # object gives a sentiment dictionary.
    # which contains pos, neg, neu, and compound scores.
    
    sentiment_dict = sid_obj.polarity_scores(sentence) 
    scores_beta1.append(sentiment_dict['compound'])
    mod_scores_beta1.append(abs(sentiment_dict['compound']))
    
    # decide sentiment as positive, negative and neutral
    if sentiment_dict['compound'] >= 0 :
        labels_beta1.append(1)
    else:
        labels_beta1.append(0)
        
        
for s in sentences_beta1:
    sentiment_scores(s) 


df_beta1 = pd.DataFrame({'sentences with beta positive': sentences_beta1, 'labels': labels_beta1, 'vader_scores': scores_beta1, 'mod_vader_scores': mod_scores_beta1})
display(df_beta1.head(5))

df_beta1.to_csv('df_beta1')

df_beta1 = pd.read_csv('df_beta1')
mod_scores_beta1 = df_beta1.mod_vader_scores

avg_vader_score_beta1 = sum(mod_scores_beta1) / len(mod_scores_beta1)
print(avg_vader_score_beta1)

In [45]:
del p_bigrams_beta1

## Negative sentance generation

In [46]:
df2 = data.loc[data.LABEL == 0]
display(df2.head())

Unnamed: 0,LABEL,DATE_TIME,preprocessed_txt
0,0,Fri Jun 05 14:26:50 2009,<s> get thread scar </s>
9,0,Wed Jun 17 09:18:19 2009,<s> need shake gloomy feel maybe rain </s>
10,0,Mon Jun 22 13:51:56 2009,<s> minecraft ride sarah still afraid ride any...
12,0,Fri May 22 00:37:02 2009,<s> sokendrakouture yea alone </s>
18,0,Thu May 21 23:50:48 2009,<s> flyingbolt good without </s>


In [47]:
# unigram counts, bigram counts, bigram prob(unsmoothed), bigram prob(smoothed):  for datasets with only negative
# labels 
#unigram frequencies

v2 = set()
unigram_count2 = {}

for x in df2['preprocessed_txt'].to_list():
    for y in x.split():
        v2.add(y)
        if y in unigram_count2:
            unigram_count2[y]+=1
        else:
            unigram_count2[y]=1

print(len(v2))
print(len(unigram_count2))
v2 = list(v2)

4216
4216


In [48]:
#count bigram frequencies of only existing bigrams for negative label sentences; rest are trivially 0

bigram_count2 = {}
            
for line in df2.preprocessed_txt.to_list():
            list_words = line.split()
            for k in range(len(list_words)-1):
                i=list_words[k]
                j=list_words[k+1]
                if(i,j) in bigram_count2:
                    bigram_count2[(i,j)]+=1
                else:
                    bigram_count2[(i,j)]=1                
                
print(list(bigram_count2.items())[:5])

[(('<s>', 'get'), 28), (('get', 'thread'), 1), (('thread', 'scar'), 1), (('scar', '</s>'), 2), (('<s>', 'need'), 9)]


In [45]:
# calculating smoothed bigram prob including beta
length = len(vocab)
p_bigrams_beta2 = {}

for i in vocab:
    for j in vocab:
        if (i,j) not in bigram_count and (i,j) not in bigram_count2 and i not in unigram_count2:
            p_bigrams_beta2[(i,j)] = ((0 + 1 + 3*0) / (unigram_count[i] + length + 3*0))
        elif (i,j) in bigram_count and (i,j) not in bigram_count2 and i not in unigram_count2:
            p_bigrams_beta2[(i,j)] = ((bigram_count[(i,j)] + 1 + 3*0) / 
                                     (unigram_count[i] + length + 3*0))        
        elif (i,j) not in bigram_count and (i,j) not in bigram_count2 and i in unigram_count2:
            p_bigrams_beta2[(i,j)] = ((0 + 1 + 3*0) / 
                                     (unigram_count[i] + length + 3*unigram_count2[i]))
        elif (i,j) in bigram_count and (i,j) not in bigram_count2 and i in unigram_count2:
            p_bigrams_beta2[(i,j)] = ((bigram_count[(i,j)] + 1 + 3*0) / 
                                     (unigram_count[i] + length + 3*unigram_count2[i]))
        elif (i,j) in bigram_count and (i,j) in bigram_count2 and i in unigram_count2:
            p_bigrams_beta2[(i,j)] = ((bigram_count[(i,j)] + 1 + 3*bigram_count2[(i,j)]) / 
                                     (unigram_count[i] + length + 3*unigram_count2[i]))        
list(p_bigrams_beta2.items())[:5]

In [60]:
del p_bigrams_beta2

In [61]:
# with open('p_bigrams_beta2.pickle', 'wb') as handle:
#     pickle.dump(p_bigrams_beta2, handle, protocol=pickle.HIGHEST_PROTOCOL)

# del p_bigrams_beta2

with open('p_bigrams_beta2.pickle', 'rb') as handle:
    p_bigrams_beta2 = pickle.load(handle)

In [65]:
import random

def next_word(context):#generate most probable next word after 'context'
    r=random.random()
    map_to_probs = {}
    
    for token in vocab:
        map_to_probs[token] = p_bigrams_beta2[(context, token)] #prob. of occurrence of each word 'token' after 'context'

    summ = 0  #summ stores cumulative probabilities of occurrence of a word after a 'context' 
    for token in (map_to_probs):
        summ += map_to_probs[token] 
        if (summ > r):
            return token


def generate_text(): #returns 1 generated sentence

        minn=7
        maxx=20
        context_queue = '<s>' #current last word seen/generated; sent as 'context' to next_word() function
        result = ['<s>']  #entire sentence generated upto present time 
        
        c = 1
        while  c<=maxx :
            obj = next_word(context_queue)
            if obj == '</s>' and c <= minn:
                continue
            elif obj == '</s>' and c > minn:
                break
            else: 
                context_queue=obj
                result.append(obj)
                c += 1
                            
        result.append('</s>')
        return ' '.join(result)
    

#genrate 500 sentences using smoothed probabilities with beta negative
sentences_beta2 = []
for i in range(500):
    sentences_beta2.append(generate_text())
sentences_beta2[:5]

In [45]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

scores_beta2 = []
mod_scores_beta2 = []
labels_beta2 = []

def sentiment_scores(sentence):
    sid_obj = SentimentIntensityAnalyzer()
 
    # polarity_scores method of SentimentIntensityAnalyzer
    # object gives a sentiment dictionary.
    # which contains pos, neg, neu, and compound scores.
    
    sentiment_dict = sid_obj.polarity_scores(sentence) 
    scores_beta2.append(sentiment_dict['compound'])
    mod_scores_beta2.append(abs(sentiment_dict['compound']))
    
    # decide sentiment as positive, negative and neutral
    if sentiment_dict['compound'] > 0 :
        labels_beta2.append(1)
    else:
        labels_beta2.append(0)
        
        
for s in sentences_beta2:
    sentiment_scores(s) 


df_beta2 = pd.DataFrame({'sentences with beta negative': sentences_beta2, 'labels': labels_beta2, 'vader_scores': scores_beta2, 'mod_vader_scores': mod_scores_beta2})
display(df_beta2.head(5))

df_beta2.to_csv('df_beta2', index=False)

df_beta2 = pd.read_csv('df_beta2')
mod_scores_beta2 = df_beta2.mod_vader_scores

avg_vader_score_beta2 = sum(mod_scores_beta2) / len(mod_scores_beta2)
print(avg_vader_score_beta2)

Unnamed: 0,sentences with beta negative,labels,vader_scores,mod_vader_scores
0,<s> physics engineer renew lucyfurleaps aghhhh...,0,0.0,0.0
1,<s> catch stanmorerob scene itsconsiderate den...,0,-0.2755,0.2755
2,<s> drop asleep filter pump cincinnati label22...,0,-0.7783,0.7783
3,<s> moanyboot nut kimboyee rotors line sex gal...,0,0.0,0.0
4,<s> ery barrios lee outdoors loveeee ubuntu pa...,1,0.34,0.34


0.39034340000000023


# 5 positive generated samples

In [175]:
df_beta1 = pd.read_csv('df_beta1')
df_beta1.drop(['Unnamed: 0'], axis=1, inplace=True)
df_beta1 = df_beta1.sort_values('vader_scores', ascending=False)
df_beta1.rename(columns={'sentences with beta positive': 'sentences with beta'}, inplace=True)
display(df_beta1.head())
df_beta_pos_250 = df_beta1.iloc[:250, :]
df_beta_pos_250.shape

Unnamed: 0,sentences with beta,labels,vader_scores,mod_vader_scores
88,<s> schoooool espana nikkithebee splendid bobo...,1,0.9349,0.9349
380,<s> rickyambury hon jackawful menu19 modes eri...,1,0.9201,0.9201
62,<s> well amuse amazingphil moreover light bedr...,1,0.8834,0.8834
467,<s> boyle markrosenbauer handle calibre muddle...,1,0.8807,0.8807
484,<s> welshdrew award doc kk bid braintree rende...,1,0.8519,0.8519


(250, 4)

# 5 negative generated samples

In [176]:
df_beta2 = pd.read_csv('df_beta2')
df_beta2 = df_beta2.sort_values('vader_scores')
df_beta2.rename(columns={'sentences with beta negative': 'sentences with beta'}, inplace=True)
display(df_beta2.head())
df_beta_neg_250 = df_beta2.iloc[:250, :]
df_beta_neg_250.shape

Unnamed: 0,sentences with beta,labels,vader_scores,mod_vader_scores
54,<s> cut plane student hatchet raaaaaaek nasty ...,0,-0.9186,0.9186
478,<s> kporcalla dead blackberry funky grp episod...,0,-0.91,0.91
269,<s> damn terror treypeezy sit milk lightfoot 2...,0,-0.9042,0.9042
27,<s> oh paperwork wb allabtanimation heeeyshelb...,0,-0.9001,0.9001
367,<s> evil curry windy6 rebecca372 overweight ha...,0,-0.8885,0.8885


(250, 4)

## generate 500 sentiment oriented sentences using beta included smoothed probabilities

In [177]:
df_beta = pd.concat([df_beta_pos_250, df_beta_neg_250])
df_beta.head()

Unnamed: 0,sentences with beta,labels,vader_scores,mod_vader_scores
88,<s> schoooool espana nikkithebee splendid bobo...,1,0.9349,0.9349
380,<s> rickyambury hon jackawful menu19 modes eri...,1,0.9201,0.9201
62,<s> well amuse amazingphil moreover light bedr...,1,0.8834,0.8834
467,<s> boyle markrosenbauer handle calibre muddle...,1,0.8807,0.8807
484,<s> welshdrew award doc kk bid braintree rende...,1,0.8519,0.8519


In [189]:
df_beta.head(360)

Unnamed: 0,sentences with beta,labels,vader_scores,mod_vader_scores
88,<s> schoooool espana nikkithebee splendid bobo...,1,0.9349,0.9349
380,<s> rickyambury hon jackawful menu19 modes eri...,1,0.9201,0.9201
62,<s> well amuse amazingphil moreover light bedr...,1,0.8834,0.8834
467,<s> boyle markrosenbauer handle calibre muddle...,1,0.8807,0.8807
484,<s> welshdrew award doc kk bid braintree rende...,1,0.8519,0.8519
...,...,...,...,...
458,<s> mizushima mwahahaha shore herreee angelo v...,0,-0.4019,0.4019
241,<s> everyone n asia travelpilot game138 choice...,0,-0.4019,0.4019
379,<s> staceybug damn judo suggest soulpoetrysite...,0,-0.4019,0.4019
215,<s> stream eddidit moiswashere sha museum mini...,0,-0.4019,0.4019


In [202]:
mod_vader_scores = df_beta.mod_vader_scores
avg_vader_score_beta = sum(mod_vader_scores) / len(mod_vader_scores)
print(avg_vader_score_beta)
# print(avg_vader_score_no_beta)

0.3939313999999999


In [203]:
data.head()

Unnamed: 0,LABEL,sentences
0,0,<s> get thread scar </s>
1,1,<s> awaisnaseer like sedan mango one yesterday...
2,1,<s> work car work show go bed sooooooooooo tir...
3,1,<s> drama actually start afternoon try somethi...
4,1,<s> falcon601 www vote col love much </s>


In [204]:
data.head()
# data.drop(['DATE_TIME'], axis=1, inplace=True)
data.rename(columns={'preprocessed_txt': 'sentences'}, inplace=True)
data.head()

Unnamed: 0,LABEL,sentences
0,0,<s> get thread scar </s>
1,1,<s> awaisnaseer like sedan mango one yesterday...
2,1,<s> work car work show go bed sooooooooooo tir...
3,1,<s> drama actually start afternoon try somethi...
4,1,<s> falcon601 www vote col love much </s>


## temp- 500 sentiment oriented sentences with their labels

In [205]:
df_beta.head()
temp = df_beta.iloc[:, :2].copy()
temp.rename(columns={'labels': 'LABEL', 'sentences with beta': 'sentences'}, inplace=True)
temp.head()


Unnamed: 0,sentences,LABEL
88,<s> schoooool espana nikkithebee splendid bobo...,1
380,<s> rickyambury hon jackawful menu19 modes eri...,1
62,<s> well amuse amazingphil moreover light bedr...,1
467,<s> boyle markrosenbauer handle calibre muddle...,1
484,<s> welshdrew award doc kk bid braintree rende...,1


In [206]:
dataset_B = data.append(temp)
print(data.shape)
print(dataset_B.shape)

(4287, 2)
(4787, 2)


In [207]:
dataset_B.sentences[10]

10    <s> minecraft ride sarah still afraid ride any...
10    <s> ataxia special spring ajanay7 agingbackwar...
10    <s> modem amendment spanish jesse 1classediva ...
Name: sentences, dtype: object

In [208]:
dataset_B.to_csv('dataset_B.csv', index=False)

## Perplexity evalutation

In [209]:
import math


def perpleixty_positive(sentence):
    words = sentence.split()
    n = len(words)
    result = 0
    
    for k in range(n-1):
        if k == 0:
            result += math.log(unigram_count1[words[k]] / len(vocab))
        else:
            w1 = words[k]
            w2 = words[k+1]
            result += math.log(p_bigrams_beta1[(w1, w2)])

    result = result * (-1 / n)
    result = math.exp(result)
    return result


def perpleixty_negative(sentence):
    words = sentence.split()
    n = len(words)
    result = 0
    
    for k in range(n-1):
        if k == 0:
            result += math.log(unigram_count2[words[k]] / len(vocab))
        else:
            w1 = words[k]
            w2 = words[k+1]
            result += math.log(p_bigrams_beta2[(w1, w2)])

    result = result * (-1 / n)
    result = math.exp(result)
    return result

## perplexity of positive sentiment generated sentences

In [200]:
with open('p_bigrams_beta1.pickle', 'rb') as handle:
    p_bigrams_beta1 = pickle.load(handle)
    
df_beta1 = df_beta1.sort_values('mod_vader_scores', ascending=False)
postive_perplexity_score  = df_beta1['sentences with beta'][:250].apply(perpleixty_positive).mean()
print("postive perplexity score is :", postive_perplexity_score)

del p_bigrams_beta1

postive perplexity score is : 3382.3087853833845


## perplexity of negative sentiment generated sentences

In [201]:
with open('p_bigrams_beta2.pickle', 'rb') as handle:
    p_bigrams_beta2 = pickle.load(handle)
    
df_beta2 = df_beta2.sort_values('mod_vader_scores', ascending=False)
negative_perplexity_score  = df_beta2['sentences with beta'][:250].apply(perpleixty_negative).mean()
print("negative perplexity score is :", negative_perplexity_score)

del p_bigrams_beta2

negative perplexity score is : 3402.3628079430305


## average perplexity of 500 sentiment oriented sentences

In [63]:
ans = round((postive_perplexity_score + negative_perplexity_score) / 2, 2)
print("average perplexity score of 500 generated sentences : ", ans)

average perplexity score of 500 generated sentences :  3391.67


## Extrinsic evaluation

In [10]:
test_data = preprocess(pd.read_csv('test.csv'))

In [211]:
test_data.to_csv('preprocessed_test_data.csv', index=False)

In [212]:
test_data = pd.read_csv('preprocessed_test_data.csv')
test_data.shape

(644, 3)

In [213]:
test_data.head()

Unnamed: 0,LABEL,DATE_TIME,preprocessed_txt
0,1,Fri May 29 22:24:26 2009,<s> mileycyrus cheer miley whats wrong </s>
1,1,Sun Jun 07 01:37:36 2009,<s> get back belcourt saw quot fifth quot awes...
2,1,Wed May 13 23:41:18 2009,<s> http video </s>
3,1,Sun May 31 16:43:58 2009,<s> chloebli hey carnavilistic day wood make w...
4,1,Fri May 29 10:36:59 2009,<s> deadlyseagal http nice day </s>


In [214]:
# dataset A
data.head()

Unnamed: 0,LABEL,sentences
0,0,<s> get thread scar </s>
1,1,<s> awaisnaseer like sedan mango one yesterday...
2,1,<s> work car work show go bed sooooooooooo tir...
3,1,<s> drama actually start afternoon try somethi...
4,1,<s> falcon601 www vote col love much </s>


In [247]:
dataset_B = pd.read_csv('dataset_B.csv')
dataset_B = dataset_B.sample(frac=1)
dataset_B

Unnamed: 0,LABEL,sentences
511,0,<s> try sleep night shift last one tonight man...
2869,0,<s> first start think twitter block account so...
4783,0,<s> work 2day recognize uhhhhh samsung usernam...
2524,1,<s> stefanieex0 didnt think would either lol </s>
2935,0,<s> arch crazy way much school work 3 semester...
...,...,...
2042,0,<s> hate quot new improve quot software get ne...
4355,1,<s> june pee bak emotional india grand grid ka...
4067,0,<s> roland yet alone tonight </s>
4661,0,<s> fearnecotton barn alcohol mixtape bed mimi...


## ML models defn: gets trained and predict labels on test set

In [252]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score


def train_and_evaluate(train_sentences, train_labels, test_sentences, test_labels):
    '''
    parameters:
    train_sentences : list of training sentences
    train_labels : list of training labels
    test_sentences : list of test sentences
    test_labels : list of test labels
    output:
    accuracy : accuracy of the test set
    '''
    
    # Model building
    model = make_pipeline(TfidfVectorizer(), RandomForestClassifier(max_depth=21, random_state=333))
    
    # Training the model with the training data
    model.fit(train_sentences, train_labels)
    
    # Predicting the test data categories
    predicted_test_labels = model.predict(test_sentences)
    return accuracy_score(test_labels, predicted_test_labels)

## Accuracy of test set using dataset A and B for training

In [None]:
acc_A = train_and_evaluate(data.sentences.to_list(), data.LABEL.to_list(), 
                   test_data.preprocessed_txt.to_list(), test_data.LABEL.to_list())

acc_B = train_and_evaluate(dataset_B.sentences.to_list(), dataset_B.LABEL.to_list(), 


                           test_data.preprocessed_txt.to_list(), test_data.LABEL.to_list())

In [253]:
print(acc_A)
print(acc_B)

0.7857142857142857
0.796583850931677
