In [25]:
import pickle
import random
from typing import List

from gensim.corpora import Dictionary
import numpy as np

In [2]:
with open(r"../data/interim/positive_words.pkl", "rb") as input_file:
    positive_docs = pickle.load(input_file)

In [3]:
with open(r"../data/interim/negative_words.pkl", "rb") as input_file:
    negative_docs = pickle.load(input_file)

In [4]:
def create_dictionary(documents: List[List[str]]):
    return Dictionary(documents)

In [5]:
def split_data(data: List, weights: List = (0.8, 0.2, 0.0)):
    split = {
        'train': [],
        'test': [],
        'validation': []
    }
    for word in data:
        subset = random.choices(['train', 'test', 'validation'], weights=weights)[0]
        split[subset].append(word)

    return split

In [6]:
negative_words = [item for sublist in negative_docs for item in sublist]
positive_words = [item for sublist in positive_docs for item in sublist]

In [7]:
dictionary = create_dictionary([negative_words, positive_words])

In [8]:
negative_split = split_data(negative_words)
positive_split = split_data(positive_words)

In [21]:
negative_bow = dictionary.doc2bow(negative_split['train'])
positive_bow = dictionary.doc2bow(positive_split['train'])

In [12]:
total_negative = len(negative_split['train']) + len(negative_bow)
total_positive = len(positive_split['train']) + len(positive_bow)

In [27]:
negative_word_probs = {}
for id, count in negative_bow:
    negative_word_probs[dictionary[id]] = {
        'id': id,
        'logprob': np.log((count + 1)/total_negative),
    }

In [28]:
positive_word_probs = {}
for id, count in positive_bow:
    positive_word_probs[dictionary[id]] = {
        'id': id,
        'logprob': np.log((count + 1)/total_negative),
    }

In [29]:
negative_word_probs

{'abandon': {'id': 0, 'logprob': -10.437257920797217},
 'abate': {'id': 1, 'logprob': -10.437257920797217},
 'abhorrent': {'id': 2, 'logprob': -10.437257920797217},
 'ability': {'id': 3, 'logprob': -10.437257920797217},
 'able': {'id': 4, 'logprob': -7.179161382775734},
 'able_get': {'id': 5, 'logprob': -9.744110740237272},
 'abroad': {'id': 6, 'logprob': -10.437257920797217},
 'abrupt': {'id': 7, 'logprob': -10.437257920797217},
 'absence': {'id': 8, 'logprob': -10.437257920797217},
 'absent': {'id': 9, 'logprob': -10.437257920797217},
 'absolute': {'id': 10, 'logprob': -10.437257920797217},
 'absolutely': {'id': 11, 'logprob': -7.441525647243226},
 'absurd': {'id': 14, 'logprob': -10.031792812689051},
 'absurdly': {'id': 15, 'logprob': -10.437257920797217},
 'abundance': {'id': 16, 'logprob': -10.031792812689051},
 'abuse': {'id': 17, 'logprob': -10.031792812689051},
 'abysmal': {'id': 18, 'logprob': -10.437257920797217},
 'ac': {'id': 19, 'logprob': -9.520967188923061},
 'accept': {

In [30]:
positive_word_probs

{'abandon': {'id': 0, 'logprob': -9.520967188923061},
 'ability': {'id': 3, 'logprob': -8.491347771741903},
 'able': {'id': 4, 'logprob': -5.3970638244594165},
 'abroad': {'id': 6, 'logprob': -8.827820008363116},
 'absence': {'id': 8, 'logprob': -9.744110740237272},
 'absent': {'id': 9, 'logprob': -10.437257920797217},
 'absolute': {'id': 10, 'logprob': -7.218382095929015},
 'absolutely': {'id': 11, 'logprob': -5.261108188223387},
 'absorb': {'id': 12, 'logprob': -10.437257920797217},
 'abundance': {'id': 16, 'logprob': -8.933180524020942},
 'ac': {'id': 19, 'logprob': -8.13467282780317},
 'accept': {'id': 20, 'logprob': -8.039362647998846},
 'acceptable': {'id': 21, 'logprob': -8.35781637911738},
 'access': {'id': 23, 'logprob': -6.018417313000619},
 'accessible': {'id': 24, 'logprob': -7.575057039867748},
 'accessory': {'id': 25, 'logprob': -8.933180524020942},
 'accident': {'id': 26, 'logprob': -8.732509828558792},
 'accidentally': {'id': 27, 'logprob': -9.184494952301849},
 'accomm