### Challenge: What are the top ten words mentioned by Biden in the 105 congress (after stopwords removal)? For each most frequent word, find the 10 most simlar words generated using word2vec. Find the most frequent bigrams in the text. Explore some bigrams and figure out if the tokens appear in the list of most similar words.

In [1]:
import numpy as np
from numpy import array
import pickle
import pandas as pd
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

In [2]:
text = open('Inputs/105-extracted-date/105-biden-de.txt', "r").read()

In [3]:
file = open('Inputs/105-extracted-date/105-biden-de.txt', "r").read()
soup = BeautifulSoup(file) #use BeautifulSoup library for parsing contents since documents have XML format
doc = ''
for word in soup.find_all('text'): #loop for extracting only text within <text> tag
    find_all_syntax=word.get_text()
    doc = doc + find_all_syntax

In [4]:
pars = re.split('\n| \n', doc)
pars = [x for x in pars if x.strip()]

In [5]:
#the stopword list provided by the NLTK library
stop_words1 = stopwords.words('english')

#the stopword list provided by the professor
drop_file = open('Inputs/droplist.txt', "r").read()
drop_file = drop_file.replace('"', '')
drop_list = drop_file.split("\n")
stop_words2 = drop_list[1:]
stop_words = set(stop_words1+stop_words2)

In [6]:
def get_wordnet_pos(word):
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [7]:
def preprocessing_text(text):
    text = re.sub(r'[^A-Za-z0-9 ]+', ' ', text) #remove all non‐alphanumeric characters except white space
    words = word_tokenize(text.lower())
    tokens = [word for word in words if word not in stop_words]
    tokens = [token for token in tokens if not any(c.isdigit() for c in token)] #remove everything containing digits
    tokens = [token for token in tokens if len(token)>=2]
    lemmatizer = WordNetLemmatizer()
    tokens_lematized = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
    tokens_lematized = [token for token in tokens_lematized if token not in stop_words]
    preprocessed_text = ' '.join(tokens_lematized)
    return preprocessed_text 

In [8]:
%%time 
text = [preprocessing_text(par) for par in pars]

In [58]:
# to save prepocessed text as pickle 
import pickle
with open("Inputs/prepocessed_biden105.pkl", "wb") as fp:   #Pickling
    pickle.dump(text, fp)

In [59]:
with open("Inputs/prepocessed_biden105.pkl", "rb") as fp:   # Unpickling
    text_biden = pickle.load(fp)
    
text = text_biden

In [60]:
text[:5]

['president pleased senate passing substitute amendment hopeful body pas measure congress adjourns',
 'legislation simple current federal law faith credit child custody custody define include visitation evidence country court automatically recognize visitation particularly visitation child parent grandparent court suppose honor arduous process',
 'legislation simply clarifies faith credit law include visitation absolutely court visitation enter consistently provision federal faith credit statute faith credit narrow legal sense current federal law law explicit hopefully eliminate hassle obstacle delay confront valid visitation federal law follow',
 'president author idea representative rob andrew jersey deserves credit bring issue attention day introduce bill visitation tireless push passage commend congratulate',
 'finally thank senator willingness move bill final day session lot press issue lose final crunch chairman staff gracious pas bill']

In [10]:
from gensim.models import Phrases

sentence_stream = [par.split(" ") for par in text]
bigram = Phrases(sentence_stream, min_count=20, threshold=2)

In [11]:
text_bigram =[]

for par in text:
    bigram_tokens = bigram[par.split()]
    preprocessed_text = ' '.join(bigram_tokens)
    text_bigram.append(preprocessed_text)

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
count = vectorizer.fit_transform(text_bigram)

In [22]:
df = pd.DataFrame(count.toarray().transpose(),index=vectorizer.get_feature_names())
df.loc[:,'Total'] = df.sum(axis=1) 

df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5791,5792,5793,5794,5795,5796,5797,5798,5799,Total
abandon,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7
abandonment,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
abate,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
abbey,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
abduction,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zone,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,34
zubak,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
zumwalt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
zurich,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [23]:
df_sorted = df.sort_values(by='Total', ascending=False)
df_sorted.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5791,5792,5793,5794,5795,5796,5797,5798,5799,Total
president,0,0,0,1,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1359
senator,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,817
nato,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,750
time,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,653
country,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,575
amendment,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,531
bill,0,0,0,1,2,1,2,1,0,0,...,0,0,0,0,0,0,0,0,0,521
senate,1,0,0,0,0,1,2,0,0,0,...,0,0,0,0,0,0,0,0,0,478
united,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,444
treaty,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,442


In [25]:
top10 = list(df_sorted.index[:10])
top10

['president',
 'senator',
 'nato',
 'time',
 'country',
 'amendment',
 'bill',
 'senate',
 'united',
 'treaty']

In [26]:
def find_bigram(text):
    words = word_tokenize(text)
    tokens = [word for word in words if '_' in word]
    bigrams = ' '.join(tokens)
    return bigrams

In [27]:
all_bigrams = [find_bigram(par) for par in text_bigram]

In [28]:
bigram_count = vectorizer.fit_transform(all_bigrams)

In [29]:
df_bigram = pd.DataFrame(bigram_count.toarray().transpose(),index=vectorizer.get_feature_names())
df_bigram.loc[:,'Total'] = df_bigram.sum(axis=1) 

df_bigram

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5791,5792,5793,5794,5795,5796,5797,5798,5799,Total
address_chair,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,35
advice_consent,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,23
american_people,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,48
appropriation_bill,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,27
appropriation_committee,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
world_war,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,56
yield_floor,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,92
yield_minute,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,31
yield_time,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,21


In [30]:
df_bigram_sorted = df_bigram.sort_values(by='Total', ascending=False)
df_bigram_sorted.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5791,5792,5793,5794,5795,5796,5797,5798,5799,Total
united_nation,0,0,0,0,0,0,1,2,0,0,...,0,0,0,0,0,0,0,0,0,164
foreign_policy,0,0,0,0,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,161
chemical_weapon,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,155
nato_enlargement,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,115
foreign_relation,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,110
nuclear_weapon,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,96
yield_floor,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,92
madam_president,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,75
arm_control,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,72
law_enforcement,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,67


In [31]:
top10_bigram = list(df_bigram_sorted.index[:10])
top10_bigram

['united_nation',
 'foreign_policy',
 'chemical_weapon',
 'nato_enlargement',
 'foreign_relation',
 'nuclear_weapon',
 'yield_floor',
 'madam_president',
 'arm_control',
 'law_enforcement']

In [32]:
sentence_stream = [par.split(" ") for par in text_bigram]

In [33]:
from gensim.models import Word2Vec

model = Word2Vec(sentences=sentence_stream, vector_size=100, window=5, min_count=1, workers=4)

In [35]:
# create a data frame of similar words
df_sim = pd.DataFrame()

for i in top10:
    column = []
    for j in range(10):
        column.append(model.wv.most_similar(i, topn=10)[j][0])
    df_sim[i] = column

In [36]:
df_sim

Unnamed: 0,president,senator,nato,time,country,amendment,bill,senate,united,treaty
0,issue,amendment,russia,friend,democracy,senator,legislation,legislation,lead,ratify
1,wish,friend,alliance,speak,united,vote,offer,floor,nation,ratification
2,offer,minute,security,yield,political,minute,president,debate,country,sign
3,hope,vote,threat,wish,nation,colleague,introduce,colleague,democracy,comprehensive
4,debate,colleague,germany,understand,threat,floor,provision,president,arm,opponent
5,republican,floor,country,debate,alliance,friend,support,bill,maintain,convention
6,suggest,time,european,floor,european,time,issue,offer,economy,pas
7,understand,speak,europe,president,lead,senate,condition,time,ability,provision
8,believe,senate,stability,colleague,economic,speak,hope,speak,china,require
9,thank,chairman,military,offer,american,debate,record,condition,organization,join


In [43]:
model.wv.most_similar('president', topn=10)

[('issue', 0.9997406005859375),
 ('wish', 0.9997261762619019),
 ('offer', 0.9997196793556213),
 ('hope', 0.9997009634971619),
 ('debate', 0.9996963143348694),
 ('republican', 0.9996792078018188),
 ('suggest', 0.9996716976165771),
 ('understand', 0.9996699094772339),
 ('believe', 0.9996604919433594),
 ('thank', 0.9996579885482788)]

In [42]:
model.wv.most_similar('bill', topn=10)

[('legislation', 0.9996781349182129),
 ('offer', 0.9995875358581543),
 ('president', 0.9995414614677429),
 ('introduce', 0.999495804309845),
 ('provision', 0.9994747638702393),
 ('support', 0.9994534254074097),
 ('issue', 0.9994510412216187),
 ('condition', 0.9994432330131531),
 ('hope', 0.9994184374809265),
 ('record', 0.999399721622467)]

In [39]:
df_sim2 = pd.DataFrame()

for i in top10_bigram:
    column = []
    for j in range(20):
        column.append(model.wv.most_similar(i, topn=20)[j][0])
    df_sim2[i] = column

In [40]:
df_sim2

Unnamed: 0,united_nation,foreign_policy,chemical_weapon,nato_enlargement,foreign_relation,nuclear_weapon,yield_floor,madam_president,arm_control,law_enforcement
0,body,action,convention,decision,arm_service,nuclear,suggest,tell,plan,public
1,action,term,china,leader,chairman,china,majority,look,start,woman
2,require,plan,ratify,democratic,committee,step,disagree,name,action,service
3,include,step,require,serve,rank,war,congress,view,word,court
4,decision,hand,step,begin,hearing,police,believe,try,step,officer
5,result,result,sign,bring,distinguish,weapon,name,able,milosevic,child
6,foreign_policy,continue,effect,plan,senate,system,close,approach,hand,criminal
7,word,commitment,produce,process,colleague,attack,try,add,father,police
8,hand,milosevic,protocol,opportunity,finance,test,tell,issue,decision,federal
9,attempt,require,party,matter,appropriation_committee,chemical,moment,reason,example,community


In [44]:
model.wv.most_similar('foreign_relation', topn=10)

[('arm_service', 0.9982528686523438),
 ('chairman', 0.9975391030311584),
 ('committee', 0.9973975419998169),
 ('rank', 0.9968673586845398),
 ('hearing', 0.9957178831100464),
 ('distinguish', 0.9955019950866699),
 ('senate', 0.9950818419456482),
 ('colleague', 0.9950717091560364),
 ('finance', 0.9950012564659119),
 ('appropriation_committee', 0.9948432445526123)]

### Challenge: Use the senator speeches in the folder 105-extracted-date and use doc2vec to find whose senator speech is closest to senator Biden. Use sen105kh_fix.csv and/or Wikipedia to validate your findings (i.e., understand if the most similar speeches are senators from the same state and/party).  Describe your findings. Compare with the outcome you got/will get using cosine similarity.

In [45]:
import os
import gensim

my_path = os.path.join('Inputs/105-extracted-date')
files = os.listdir('Inputs/105-extracted-date')
print(files)

['105-abraham-mi.txt', '105-akaka-hi.txt', '105-allard-co.txt', '105-ashcroft-mo.txt', '105-baucus-mt.txt', '105-bennett-ut.txt', '105-biden-de.txt', '105-bingaman-nm.txt', '105-bond-mo.txt', '105-boxer-ca.txt', '105-breaux-la.txt', '105-brownback-ks.txt', '105-bryan-nv.txt', '105-bumpers-ar.txt', '105-burns-mt.txt', '105-byrd-wv.txt', '105-campbell-co.txt', '105-chafee-ri.txt', '105-cleland-ga.txt', '105-coats-in.txt', '105-cochran-ms.txt', '105-collins-me.txt', '105-conrad-nd.txt', '105-coverdell-ga.txt', '105-craig-id.txt', '105-damato-ny.txt', '105-daschle-sd.txt', '105-dewine-oh.txt', '105-dodd-ct.txt', '105-domenici-nm.txt', '105-dorgan-nd.txt', '105-durbin-il.txt', '105-enzi-wy.txt', '105-faircloth-nc.txt', '105-feingold-wi.txt', '105-feinstein-ca.txt', '105-ford-ky.txt', '105-frist-tn.txt', '105-glenn-oh.txt', '105-gorton-wa.txt', '105-graham-fl.txt', '105-gramm-tx.txt', '105-grams-mn.txt', '105-grassley-ia.txt', '105-gregg-nh.txt', '105-hagel-ne.txt', '105-harkin-ia.txt', '105

> To save time, I use the saved pickle from the previous assignment that contains already preprocessed text. I will further preprocess the text a bit (previously lemmatization was done without taking part of speech into account). 

In [48]:
with open("Inputs/prepocessed_docs105.pkl", "rb") as fp:   # Unpickling
    speech_list = pickle.load(fp)
    
speech_list[0][:501]

'president debate final passage omnibus appropriation bill american competitiveness workforce improvement act included title subdivision unanimous consent document printed record included document received administration negotiation inclusion seeking help illuminate meaning provision legislation key document change july version september version copy submitted change marked redlining marking unfortunately submitted copy version copy fax marking appear effect september version unintelligible result'

In [51]:
def preprocessing_tokens(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    tokens_lematized = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
    tokens_lematized = [token for token in tokens_lematized if token not in stop_words]
    return tokens_lematized

In [52]:
tokens = [preprocessing_tokens(speech) for speech in speech_list]

CPU times: total: 1h 32min 41s
Wall time: 1h 35min 27s


In [61]:
with open("Inputs/prepocessed_tokens105.pkl", "rb") as fp:   # Unpickling
    tokens = pickle.load(fp)

In [65]:
docs_corpus = []

for i in range(len(tokens)):
    docs_corpus.append(gensim.models.doc2vec.TaggedDocument(words=tokens[i], tags=[i]))

In [None]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=100, min_count=2, epochs=40)

In [None]:
model.build_vocab(docs_corpus)

In [None]:
model.train(docs_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
ranks = []
second_ranks = []

for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])