In [1]:
# !pip install --upgrade pandas

import pandas as pd
import numpy as np
# pd.set_option("max_rows", None)
import pickle
from string import punctuation
import re

from sklearn.metrics import cohen_kappa_score
import matplotlib.pyplot as plt
import seaborn as sns
# from astropy.visualization import hist
sns.set_theme(style="whitegrid")
plt.rc('axes', titlesize=14)
plt.rc('legend', fontsize=14)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)
plt.rcParams.update({'font.size': 16})
plt.rcParams['axes.titlesize'] = 16
plt.rcParams["figure.figsize"] = (10, 6)
plt.rcParams.update({'lines.markeredgewidth': 1})
plt.rcParams.update({'errorbar.capsize': 2})
import random

import plotly.express as px

sentiments = "anger fear joy sadness".split()
intensities = "low medium high".split()

root_data = "assigment_1/data/"
file_names = {
    "train/anger": "train/anger-train.txt",
    "train/fear": "train/fear-train.txt",
    "train/joy": "train/joy-train.txt",
    "train/sadness": "train/sadness-train.txt",
    "target/anger": "target/anger-target.txt",
    "target/fear": "target/fear-target.txt",
    "target/joy": "target/joy-target.txt",
    "target/sadness": "target/sadness-target.txt"
}

def txt_to_df(file_name):   
    with open(file_name, encoding="utf-8") as f:
        data = []
        for l in f.read().split("\n")[:-1]:
            o = {}
            for k, v in enumerate(l.split("\t")):
                col_name = "id text sen int".split()[k]
                o[col_name] = v
            data.append(o)
    return pd.DataFrame(data).set_index("id")

In [2]:
df_train = pd.DataFrame()
for sen in sentiments:
    file_name = root_data+file_names[f"train/{sen}"]
    df_sen = txt_to_df(file_name)
    df_train = pd.concat([df_train, df_sen], axis=0)
df_train

Unnamed: 0_level_0,text,sen,int
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10000,How the fu*k! Who the heck! moved my fridge!.....,anger,high
10001,So my Indian Uber driver just called someone t...,anger,high
10002,@DPD_UK I asked for my parcel to be delivered ...,anger,high
10003,so ef whichever butt wipe pulled the fire alar...,anger,high
10004,Don't join @BTCare they put the phone down on ...,anger,high
...,...,...,...
40855,Common app just randomly logged me out as I wa...,sadness,high
40856,"I'd rather laugh with the rarest genius, in be...",sadness,high
40857,If you #invest in my new #film I will stop ask...,sadness,medium
40858,"Just watched Django Unchained, Other people ma...",sadness,low


In [7]:
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
tt = TweetTokenizer()

def lemmatize_text(text):
    lemmatized_list = [lemmatizer.lemmatize(token) for token in tt.tokenize(text)]
    return [token for token in lemmatized_list if token not in (stop)]
  
def count_words(l):
  return len(l)

def count_token(l, token):
  count = 0
  for x in l:
    if x == token:
      count += 1
  return count

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
df_train['token'] = df_train['text'].apply(lemmatize_text)
word_count = df_train['token'].apply(count_words).sum()

word_list = []
for index, row in df_train.iterrows():
    word_list += row['token']

vocabulary_size = len(list(set(word_list)))

In [24]:
class WordClass:
  def __init__(self, df, vocab_size, sen, intensity):
    self.sen = sen
    self.intensity = intensity
    self.sub_df = df.loc[(df['sen'] == sen) & (df['int'] == intensity)]
    self.row_count = len(self.sub_df)
    self.token_count = self.sub_df['token'].apply(count_words).sum()
    self.word_class_prob = self.row_count / len(df)
    self.v = vocab_size
  
  def get_token_prob(self, token):
    count = self.sub_df['token'].apply(count_token, token=token).sum()
    return self.word_class_prob * (count + 1) / (self.token_count + self.v)
  
  def get_token_list_prob(self, token_list):
    prob = self.word_class_prob
    for token in token_list:
      count = self.sub_df['token'].apply(count_token, token=token).sum()
      prob *= (count + 1) / (self.token_count + self.v)
    return prob


angry_high = WordClass(df_train, vocabulary_size, "anger", "high")
angry_medium = WordClass(df_train, vocabulary_size, "anger", "medium")
angry_low = WordClass(df_train, vocabulary_size, "anger", "low")
fear_high = WordClass(df_train, vocabulary_size, "fear", "high")
fear_medium = WordClass(df_train, vocabulary_size, "fear", "medium")
fear_low = WordClass(df_train, vocabulary_size, "fear", "low")
joy_high = WordClass(df_train, vocabulary_size, "joy", "high")
joy_medium = WordClass(df_train, vocabulary_size, "joy", "medium")
joy_low = WordClass(df_train, vocabulary_size, "joy", "low")
sadness_high = WordClass(df_train, vocabulary_size, "sadness", "high")
sadness_medium = WordClass(df_train, vocabulary_size, "sadness", "medium")
sadness_low = WordClass(df_train, vocabulary_size, "sadness", "low")

word_classes = [angry_low, angry_medium, angry_high, fear_low, fear_medium, fear_high, joy_low, joy_medium, joy_high, sadness_low, sadness_medium, sadness_high]

In [25]:
df_target = pd.DataFrame()
for sen in sentiments:
    file_name = root_data+file_names[f"target/{sen}"]
    df_sen = txt_to_df(file_name)
    df_target = pd.concat([df_target, df_sen], axis=0)

df_target['token'] = df_target['text'].apply(lemmatize_text)
df_target

Unnamed: 0_level_0,text,sen,int,token
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10941,At the point today where if someone says somet...,anger,NONE,"[At, point, today, someone, say, something, re..."
10942,@CorningFootball IT'S GAME DAY!!!! T MIN...,anger,NONE,"[@CorningFootball, IT'S, GAME, DAY, !, !, !, T..."
10943,This game has pissed me off more than any othe...,anger,NONE,"[This, game, ha, pissed, game, year, ., My, bl..."
10944,@spamvicious I've just found out it's Candice ...,anger,NONE,"[@spamvicious, I've, found, Candice, Candace, ..."
10945,@moocowward @mrsajhargreaves @Melly77 @GaryBar...,anger,NONE,"[@moocowward, @mrsajhargreaves, @Melly77, @Gar..."
...,...,...,...,...
41528,Why does Candice constantly pout #GBBO 💄😒,sadness,NONE,"[Why, doe, Candice, constantly, pout, #GBBO, 💄..."
41529,"@redBus_in #unhappy with #redbus CC, when I ta...",sadness,NONE,"[@redBus_in, #unhappy, #redbus, CC, ,, I, talk..."
41530,"@AceOperative789 no pull him afew weeks ago, s...",sadness,NONE,"[@AceOperative789, pull, afew, week, ago, ,, s..."
41531,I'm buying art supplies and I'm debating how s...,sadness,NONE,"[I'm, buying, art, supply, I'm, debating, seri..."


In [42]:
predicted_int = [[]]

for index, row in df_target.iterrows():
  token_list = row['token']
  sen = row['sen']
  predicted_int[-1].append(index)
  for word_class in word_classes:
    if word_class.sen != sen:
      continue
    score = word_class.get_token_list_prob(token_list)
    predicted_int[-1].append(score)
  predicted_int.append([])

predicted_int.pop()

df_pred = pd.DataFrame(predicted_int, columns=["id", "prob_low", "prob_medium", "prob_high"])
df_pred

Unnamed: 0,id,prob_low,prob_medium,prob_high
0,10941,3.110746e-47,6.903732e-44,1.123741e-46
1,10942,8.657163e-41,3.181742e-42,2.475344e-42
2,10943,5.707286e-56,2.074242e-50,2.546549e-53
3,10944,3.864310e-39,4.203788e-36,2.478636e-39
4,10945,6.862897e-53,1.934126e-52,3.398841e-53
...,...,...,...,...
3137,41528,7.747635e-34,7.531558e-32,1.047482e-33
3138,41529,1.134948e-49,3.413934e-47,6.621728e-48
3139,41530,1.315852e-64,3.960677e-62,3.308366e-64
3140,41531,4.980243e-42,1.472764e-40,1.444757e-42
