## Imports

In [1]:
from gensim import models



In [2]:
from doc2vec.doc2vec import doc2vec
from utilities import load_scotus_network
from utilities import get_name_to_date
from utilities import get_list_of_docs

## Train Doc2Vec Model and Save

In [3]:
# Safety switch: leave this False so re-running the notebook does not
# accidentally start training again.
NOT_YET_TRAINED = False
if NOT_YET_TRAINED:
    # Collect the documents and their names from the files matching the glob.
    doc_list, names = get_list_of_docs(dir_path='../data/scotus/textfiles/*.txt')
    trainer = doc2vec(doc_list=doc_list, names=names)
    # NOTE(review): presumably this trains and also saves the model (the section
    # title says "Train ... and Save") -- confirm in doc2vec.run_doc2vec.
    trainer.run_doc2vec()
    # Drop the reference to the trainer object once it has run.
    del trainer

## Load Doc2Vec and Plot Similarity

In [4]:
# Load the saved gensim Doc2Vec model from disk, then hand it to the project's
# doc2vec wrapper (label_docs=False is passed through unchanged).
saved_model = models.Doc2Vec.load("../data/scotus_model.doc2vec")
d2v_model = doc2vec(model=saved_model, label_docs=False)

In [5]:
# load_scotus_network returns a pair: the network `G` and an `issue_areas` value.
(G, issue_areas) = load_scotus_network(
    file_path="../data/scotus_network.graphml",
)

FUNCTION NOT FULLY TESTED: load_scotus_network


In [6]:
# Build the name -> year lookup from the network and report how many entries it has.
dates = get_name_to_date(G)
num_dated = len(dates)
print(num_dated)

27885


In [7]:
# Sanity check 1: show the first ten entries of `dates` together with the
# types of each key and value.
print('Are the dates how we expect?')
for case_id in list(dates.keys())[:10]:
    year = dates[case_id]
    print(type(case_id), case_id, '--', year, type(year))

# Sanity check 2: collect every node whose "name" attribute is not exactly a str.
print('Are the names how we expect?')
wrong = [node for node in G.nodes if type(G.nodes[node]["name"]) is not str]
if wrong:
    print('The ones with non-string labels are:\n', wrong)
else:
    print('Yes! The names are all perfectly fine.')
        

Are the dates how we expect?
<class 'str'> 87122 -- 1857 <class 'int'>
<class 'str'> 105799 -- 1958 <class 'int'>
<class 'str'> 105102 -- 1953 <class 'int'>
<class 'str'> 95658 -- 1902 <class 'int'>
<class 'str'> 88008 -- 1869 <class 'int'>
<class 'str'> 110976 -- 1983 <class 'int'>
<class 'str'> 87845 -- 1867 <class 'int'>
<class 'str'> 98228 -- 1914 <class 'int'>
<class 'str'> 86741 -- 1852 <class 'int'>
<class 'str'> 95837 -- 1903 <class 'int'>
Are the names how we expect?
Yes! The names are all perfectly fine.


In [8]:
# so I don't accidentally plot every single case's similarity plot
PLOT_ALL_CASES = False
if PLOT_ALL_CASES:
    for case_name in G.vs["name"][:len(G.vs)//10]:
        d2v_model.similarity_time_plot(case_name,name_to_year=dates,fig_size=(30,15),num_to_plot=27884,outname='../plots/similarity_plots/'+str(case_name)+'_similarity_plot.png')

In [9]:
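# Roe v. Wade (case id "108713") as an example.
# The id is spelled out in `outname` because `case_name` only exists if the
# guarded loop in the previous cell has actually run.
# d2v_model.similarity_time_plot("108713",show=True,name_to_year=dates,fig_size=(30,15),num_to_plot=27884,outname='../plots/similarity_plots/108713_similarity_plot.png')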

# Issue Area Clustering

In [10]:
import numpy as np

In [11]:
# Draw a reproducible random sample of cases and index it by issue area in
# both directions (issue area -> node ids, node id -> issue area).
SAMPLE_SEED = 0        # fixed so "Restart & Run All" reproduces the same sample
SAMPLE_SIZE = 100      # number of shuffled nodes that get labelled
NUM_ISSUE_AREAS = 15   # keys 0..14 are pre-created so empty areas still print

# A private RandomState keeps the shuffle reproducible without touching NumPy's
# global random state (RandomState is used because it works on any NumPy version).
nodes = np.random.RandomState(SAMPLE_SEED).permutation([n for n in G.nodes])

ia_to_name = {ia: [] for ia in range(NUM_ISSUE_AREAS)}
name_to_ia = {}
for n in nodes[:SAMPLE_SIZE]:
    # NOTE(review): presumably 'issueArea' is stored as a float-like value
    # (e.g. "8.0"), hence the float() before int() -- confirm against the graphml.
    ia = int(float(G.nodes[n]['issueArea']))
    ia_to_name[ia].append(n)
    name_to_ia[n] = ia

# Report the bucket sizes; the extra spaces pad the key to a width of three.
total = 0
for k, members in ia_to_name.items():
    print('Key : ',k," "*(3-len(str(k))),'Length(list at key): ',len(members))
    total += len(members)

print('Total: ',total)

print('Number of keys of name_to_ia: ',len(name_to_ia.keys()))
print('The above two numbers should be equal.')
# Enforce the invariant instead of relying on the reader to compare the numbers.
assert total == len(name_to_ia), "bucket sizes and name_to_ia disagree"

Key :  0    Length(list at key):  1
Key :  1    Length(list at key):  9
Key :  2    Length(list at key):  5
Key :  3    Length(list at key):  3
Key :  4    Length(list at key):  4
Key :  5    Length(list at key):  0
Key :  6    Length(list at key):  0
Key :  7    Length(list at key):  0
Key :  8    Length(list at key):  30
Key :  9    Length(list at key):  16
Key :  10   Length(list at key):  3
Key :  11   Length(list at key):  3
Key :  12   Length(list at key):  8
Key :  13   Length(list at key):  0
Key :  14   Length(list at key):  18
Total:  100
Number of keys of name_to_ia:  100
The above two numbers should be equal.


In [12]:
# Peek at the first node id (if any) to see what the node keys look like.
for first_node in list(G.nodes)[:1]:
    print(first_node)

87122


In [None]:
# One cluster per issue-area bucket: dict keys are already unique, so the
# dict's length is the number of buckets. The sampled node -> issue-area
# mapping is passed as the labels.
d2v_model.run_clustering(
    n_clusters=len(ia_to_name),
    labels_dict=name_to_ia,
    evaluate=True,
)

In [None]:
# Scratch: two small sets that share the element 4.
a, b = {4, 5}, {4, 6}

In [None]:
# Scratch: print each element of the union of `a` and `b` once.
# The loop variable gets a real name because it is actually used, and because
# `_` is IPython's "last output" variable -- rebinding it by hand shadows it.
for element in a.union(b):
    print(element)