In [6]:
import json

import tmtoolkit as tm
from nltk.corpus import opinion_lexicon

import pandas as pd

In [7]:
def load_json(filename):
    """Parse a JSON file and return the decoded Python object.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content (dict, list, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # JSON is UTF-8 by specification; pin the encoding so the result does not
    # depend on the platform's locale default.
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

def load_texts(file_name):
    """Read a text file and split it into its non-empty documents.

    The content is split on every occurrence of ``---`` immediately followed
    by a newline. Each piece is stripped of surrounding whitespace, and pieces
    that are empty after stripping are dropped.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A list of document strings, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Explicit UTF-8 so non-ASCII characters decode the same way on every
    # platform instead of following the locale default.
    with open(file_name, "r", encoding="utf-8") as file:
        content = file.read()

    # Parsing happens after the file is closed; strip each chunk only once.
    stripped_chunks = (chunk.strip() for chunk in content.split("---\n"))
    return [chunk for chunk in stripped_chunks if chunk]

In [43]:
# Decoded content of the members-of-congress JSON file
# (presumably Wikipedia page names, judging by the file name; verify).
members_of_congress_wiki_pages_names = load_json(
    filename="members_of_congress_wiki_pages.json"
)

In [44]:
# One list of documents per party, read from the two party text files.
republicans_texts, democrats_texts = (
    load_texts(party_file) for party_file in ('republicans.txt', 'democrats.txt')
)

In [45]:
# Negative / positive word lists from NLTK's opinion_lexicon corpus reader,
# held as sets so the per-token membership checks further down are cheap.
neg, pos = (
    set(word_list())
    for word_list in (opinion_lexicon.negative, opinion_lexicon.positive)
)

In [46]:
# Build a tmtoolkit corpus from the Republican texts, labelling each document
# with its position in the list; no optional features are loaded.
corp_rep = tm.corpus.Corpus(dict(enumerate(republicans_texts)), language="en", load_features=[])

# Normalise the tokens: drop punctuation, apply tmtoolkit's clean-token filter,
# then lowercase. The return values are not captured, so these calls are relied
# on to modify corp_rep in place.
tm.corpus.remove_punctuation(corp_rep)
tm.corpus.filter_clean_tokens(corp_rep)
tm.corpus.to_lowercase(corp_rep)

# One row per remaining token.
df_rep = tm.corpus.tokens_table(corp_rep)

# Flag every token by membership in the positive / negative lexicon sets.
# Series.isin is vectorised and keeps df_rep's own index, so the concat below
# pairs each flag with its token even if tokens_table does not return a
# default 0..n-1 index.
sent = pd.DataFrame({"pos": df_rep["token"].isin(pos), "neg": df_rep["token"].isin(neg)})

df_rep = pd.concat([df_rep, sent], axis=1)

df_rep

Unnamed: 0,doc,position,token,is_punct,is_stop,like_num,pos,neg
0,0,0,felix,False,False,False,False,False
1,0,1,barry,False,False,False,False,False
2,0,2,moore,False,False,False,False,False
3,0,3,born,False,False,False,False,False
4,0,4,september,False,False,False,False,False
...,...,...,...,...,...,...,...,...
148665,154,520,vote,False,False,False,False,False
148666,154,521,smart,False,False,False,True,False
148667,154,522,appearances,False,False,False,False,False
148668,154,523,c,False,False,False,False,False


In [47]:
# Per-document totals: summing the boolean flags counts the positive and
# negative lexicon hits in each Republican document.
df_rep_count = df_rep.loc[:, ["doc", "pos", "neg"]].groupby("doc").sum()
df_rep_count

Unnamed: 0_level_0,pos,neg
doc,Unnamed: 1_level_1,Unnamed: 2_level_1
0,25,22
1,45,44
2,46,25
3,46,55
4,16,15
...,...,...
150,20,9
151,10,2
152,15,27
153,44,40


In [50]:
# Build a tmtoolkit corpus from the Democrat texts, labelling each document
# with its position in the list; no optional features are loaded.
corp_dem = tm.corpus.Corpus(dict(enumerate(democrats_texts)), language="en", load_features=[])

# Normalise the tokens: drop punctuation, apply tmtoolkit's clean-token filter,
# then lowercase. The return values are not captured, so these calls are relied
# on to modify corp_dem in place.
tm.corpus.remove_punctuation(corp_dem)
tm.corpus.filter_clean_tokens(corp_dem)
tm.corpus.to_lowercase(corp_dem)

# One row per remaining token.
df_dem = tm.corpus.tokens_table(corp_dem)

# Flag every token by membership in the positive / negative lexicon sets.
# Series.isin is vectorised and keeps df_dem's own index, so the concat below
# pairs each flag with its token even if tokens_table does not return a
# default 0..n-1 index.
sent = pd.DataFrame({"pos": df_dem["token"].isin(pos), "neg": df_dem["token"].isin(neg)})

df_dem = pd.concat([df_dem, sent], axis=1)

df_dem

Unnamed: 0,doc,position,token,is_punct,is_stop,like_num,pos,neg
0,0,0,shomari,False,False,False,False,False
1,0,1,coleman,False,False,False,False,False
2,0,2,figures,False,False,False,False,False
3,0,3,born,False,False,False,False,False
4,0,4,september,False,False,False,False,False
...,...,...,...,...,...,...,...,...
173007,159,451,library,False,False,False,False,False
173008,159,452,congress,False,False,False,False,False
173009,159,453,profile,False,False,False,False,False
173010,159,454,vote,False,False,False,False,False


In [51]:
# Per-document totals: summing the boolean flags counts the positive and
# negative lexicon hits in each Democrat document.
df_dem_count = df_dem.loc[:, ["doc", "pos", "neg"]].groupby("doc").sum()
df_dem_count

Unnamed: 0_level_0,pos,neg
doc,Unnamed: 1_level_1,Unnamed: 2_level_1
0,17,3
1,25,5
2,14,12
3,33,32
4,30,30
...,...,...
155,13,5
156,59,54
157,20,10
158,38,30


In [52]:
# Total positive-lexicon hits over all Democrat documents.
print(df_dem_count.loc[:, "pos"].sum())

7338


In [53]:
# Total negative-lexicon hits over all Democrat documents.
print(df_dem_count.loc[:, "neg"].sum())

5151


In [54]:
# Total negative-lexicon hits over all Republican documents.
print(df_rep_count.loc[:, "neg"].sum())

4731


In [55]:
# Total positive-lexicon hits over all Republican documents.
print(df_rep_count.loc[:, "pos"].sum())

6266


In [57]:
# Ratio of positive to negative lexicon hits per party: Democrats first,
# then Republicans.
for party_counts in (df_dem_count, df_rep_count):
    party_totals = party_counts[["pos", "neg"]].sum()
    print(party_totals["pos"] / party_totals["neg"])

1.4245777518928364
1.324455717607271
