In [1]:
# !pip install --upgrade pandas

import pandas as pd
import numpy as np
# pd.set_option("max_rows", None)
import pickle
from string import punctuation
import re

from sklearn.metrics import cohen_kappa_score
import matplotlib.pyplot as plt
import seaborn as sns
# from astropy.visualization import hist
sns.set_theme(style="whitegrid")
plt.rc('axes', titlesize=14)
plt.rc('legend', fontsize=14)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)
plt.rcParams.update({'font.size': 16})
plt.rcParams['axes.titlesize'] = 16
plt.rcParams["figure.figsize"] = (10, 6)
plt.rcParams.update({'lines.markeredgewidth': 1})
plt.rcParams.update({'errorbar.capsize': 2})
import random

import plotly.express as px

sentiments = "anger fear joy sadness".split()
intensities = "low medium high".split()

root_data = ""
file_names = {
    "train/anger": "anger-train.txt",
    "train/fear": "fear-train.txt",
    "train/joy": "joy-train.txt",
    "train/sadness": "sadness-train.txt",
    "target/anger": "anger-target.txt",
    "target/fear": "fear-target.txt",
    "target/joy": "joy-target.txt",
    "target/sadness": "sadness-target.txt"
}

def txt_to_df(file_name):   
    with open(file_name, encoding="utf-8") as f:
        data = []
        for l in f.read().split("\n")[:-1]:
            o = {}
            for k, v in enumerate(l.split("\t")):
                col_name = "id text sen int".split()[k]
                o[col_name] = v
            data.append(o)
    return pd.DataFrame(data).set_index("id")

In [2]:
df_train = pd.DataFrame()
for sen in sentiments:
    file_name = root_data+file_names[f"train/{sen}"]
    df_sen = txt_to_df(file_name)
    df_train = pd.concat([df_train, df_sen], axis=0)
df_train

Unnamed: 0_level_0,text,sen,int
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10000,How the fu*k! Who the heck! moved my fridge!.....,anger,high
10001,So my Indian Uber driver just called someone t...,anger,high
10002,@DPD_UK I asked for my parcel to be delivered ...,anger,high
10003,so ef whichever butt wipe pulled the fire alar...,anger,high
10004,Don't join @BTCare they put the phone down on ...,anger,high
...,...,...,...
40855,Common app just randomly logged me out as I wa...,sadness,high
40856,"I'd rather laugh with the rarest genius, in be...",sadness,high
40857,If you #invest in my new #film I will stop ask...,sadness,medium
40858,"Just watched Django Unchained, Other people ma...",sadness,low


In [17]:
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
tt = TweetTokenizer()

def lemmatize_text(text):
    lemmatized_list = [lemmatizer.lemmatize(token) for token in tt.tokenize(text)]
    return [token for token in lemmatized_list if token not in (stop)]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [19]:
df_train['token'] = df_train['text'].apply(lemmatize_text)
df_train['token']

id
10000    [How, fu, *, k, !, Who, heck, !, moved, fridge...
10001    [So, Indian, Uber, driver, called, someone, N,...
10002    [@DPD_UK, I, asked, parcel, delivered, pick, s...
10003    [ef, whichever, butt, wipe, pulled, fire, alar...
10004    [Don't, join, @BTCare, put, phone, ,, talk, ru...
                               ...                        
40855    [Common, app, randomly, logged, I, wa, writing...
40856    [I'd, rather, laugh, rarest, genius, ,, beauti...
40857    [If, #invest, new, #film, I, stop, asking, inv...
40858    [Just, watched, Django, Unchained, ,, Other, p...
40859    [@KeithOlbermann, depressing, despicable, Trum...
Name: token, Length: 3960, dtype: object

In [46]:
def count_words(l):
  return len(l)

def count_token(l, token):
  count = 0
  for x in l:
    if x == token:
      count += 1
  return count

class Sentiment:
  def __init__(self, df, sen):
    self.sen = sen
    self.high_prob = 0
    self.low_prob = 0
    self.medium_prob = 0

    self.high_row_count = len(df_train.loc[(df_train['sen'] == sen) & (df_train['int'] == 'high')])
    self.medium_row_count = len(df_train.loc[(df_train['sen'] == sen) & (df_train['int'] == 'medium')])
    self.low_row_count = len(df_train.loc[(df_train['sen'] == sen) & (df_train['int'] == 'low')])
    self.total_row_count = len(df_train.loc[df_train['sen'] == sen])
    print(sen + " rows: " + str(self.total_row_count))

    self.high_token_count = df_train.loc[(df_train['sen'] == sen) & (df_train['int'] == 'high')]['token'].apply(count_words).sum()
    self.medium_token_count = df_train.loc[(df_train['sen'] == sen) & (df_train['int'] == 'medium')]['token'].apply(count_words).sum()
    self.low_token_count = df_train.loc[(df_train['sen'] == sen) & (df_train['int'] == 'low')]['token'].apply(count_words).sum()
    self.total_token_count = self.high_token_count + self.medium_token_count + self.low_token_count
    print(sen + " tokens: " + str(self.total_token_count))
  
  def set_prob(self, total):
    self.high_prob = self.high_row_count / total
    self.medium_prob = self.medium_row_count / total
    self.low_prob = self.low_row_count / total
  
  def get_prob(self, token, intensity, v):
    if intensity == "low":
      count = df_train.loc[(df_train['sen'] == self.sen) & (df_train['int'] == 'low')]['token'].apply(count_token, token=token).sum()
      return self.low_prob * (count + 1) / (self.low_token_count + v)
    if intensity == "high":
      count = df_train.loc[(df_train['sen'] == self.sen) & (df_train['int'] == 'high')]['token'].apply(count_token, token=token).sum()
      return self.high_prob * (count + 1) / (self.high_token_count + v)
    if intensity == "medium":
      count = df_train.loc[(df_train['sen'] == self.sen) & (df_train['medium'] == 'low')]['token'].apply(count_token, token=token).sum()
      return self.medium_prob * (count + 1) / (self.medium_token_count + v)


word_count = df_train['token'].apply(count_words).sum()
print("total:" + str(word_count))

anger_sen = Sentiment(df_train, "anger")
fear_sen = Sentiment(df_train, "fear")
joy_sen = Sentiment(df_train, "joy")
sadness_sen = Sentiment(df_train, "sadness")

for sen in [anger_sen, fear_sen, joy_sen, sadness_sen]:
  sen.set_prob(word_count)

word_list = []
for index, row in df_train.iterrows():
    word_list += row['token']

vocabulary_size = len(list(set(word_list)))
print("unique tokens: " + str(vocabulary_size))


total:50321
anger rows: 941
anger tokens: 11482
fear rows: 1257
fear tokens: 16130
joy rows: 902
joy tokens: 11389
sadness rows: 860
sadness tokens: 11320
unique tokens: 12580


In [47]:
anger_sen.get_prob("How", "low", vocabulary_size)

2.217688688016433e-07