In [23]:
import json
import random
from gensim import models
from random import shuffle
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from doc2vec.doc2vec import doc2vec
from node2vec.node2vec import node2vec

# Seed both the stdlib and the NumPy global RNGs up front so that the random
# draws made later in this notebook (np.random.rand / random.uniform) are
# repeatable when the cells are run top to bottom on a fresh kernel.
random.seed(42)
np.random.seed(42)

# Load Doc2Vec & Node2Vec Models & Get Vectors

In [2]:
# Locations of the saved gensim models, relative to this notebook.
D2V_MODEL_PATH = "../data/scotus/scotus_model.doc2vec"
N2V_MODEL_PATH = "../data/scotus_n2v_1.0_1.0_mini.node2vec"

# Wrap each saved gensim model in the project's doc2vec / node2vec helper.
d2v_model = doc2vec(
    model=models.Doc2Vec.load(D2V_MODEL_PATH),
    label_docs=False,
)
n2v_model = node2vec(
    model=models.Word2Vec.load(N2V_MODEL_PATH),
)

In [3]:
# Grab the embedding lookup objects from the wrapped gensim models; both are
# indexed by tag / node id further down (e.g. `d2v_embeddings[tag]`).
# NOTE(review): `docvecs` here and `.doctags` / `.vocab` used later look like
# the pre-4.0 gensim attribute names -- confirm the pinned gensim version.
d2v_embeddings = d2v_model.model.docvecs
n2v_embeddings = n2v_model.model.wv

In [7]:
# Load the clustering exports for both embedding types. Each is read for its
# 'node_ids' list below. Context managers make sure the file handles are
# closed as soon as parsing finishes instead of leaking until garbage
# collection.
with open('../data/ia_doc2vec_clustering.json') as d2v_file:
    data_d2v = json.load(d2v_file)
with open('../data/ia_node2vec_clustering_1.0_1.0.json') as n2v_file:
    data_n2v = json.load(n2v_file)

In [10]:
# Ordered node ids as stored in each clustering export, plus a
# position -> node id lookup built from the same ordering.
node_ids_d2v = list(data_d2v['node_ids'])
node_order_d2v = dict(enumerate(node_ids_d2v))

node_ids_n2v = list(data_n2v['node_ids'])
node_order_n2v = dict(enumerate(node_ids_n2v))

# Embedding vectors (as plain Python lists), in the same order as the id lists.
X_d2v = [d2v_embeddings[node_id].tolist() for node_id in node_ids_d2v]
X_n2v = [n2v_embeddings[node_id].tolist() for node_id in node_ids_n2v]

In [16]:
# Collect every tag known to each embedding model together with its vector
# (converted to a plain list). names_* and vectors_* are index-aligned.
names_d2v = [doc_tag for doc_tag, _info in d2v_embeddings.doctags.items()]
vectors_d2v = [d2v_embeddings[doc_tag].tolist() for doc_tag in names_d2v]

names_n2v = [node_tag for node_tag, _info in n2v_embeddings.vocab.items()]
vectors_n2v = [n2v_embeddings[node_tag].tolist() for node_tag in names_n2v]

print('complete')

complete


In [21]:
# Mapping from embedding tag / node name to its label (converted with int()
# when the training sets are built). Use a context manager so the file handle
# is closed deterministically.
with open('../data/name_to_ia_mini.json') as name_to_ia_file:
    name_to_ia = json.load(name_to_ia_file)

In [26]:
def _split_by_mask(names, vectors, train_mask, labels):
    """Partition index-aligned names/vectors into train and test sets.

    Args:
        names: sequence of tags, aligned with ``vectors`` and ``train_mask``.
        vectors: sequence of feature vectors, one per name.
        train_mask: boolean sequence; True sends the item to the training set,
            False to the test set.
        labels: mapping from name to a value convertible with ``int()``.

    Returns:
        ``(X_train, Y_train, X_test, Y_test)`` as plain Python lists.

    Raises:
        KeyError: if a name has no entry in ``labels``.
    """
    X_train, Y_train, X_test, Y_test = [], [], [], []
    for name, vec, is_train in zip(names, vectors, train_mask):
        label = int(labels[name])
        if is_train:
            X_train.append(vec)
            Y_train.append(label)
        else:
            X_test.append(vec)
            Y_test.append(label)
    return X_train, Y_train, X_test, Y_test


# Random ~85/15 train/test assignment, drawn independently for each embedding
# type. The d2v mask is drawn before the n2v mask so the global NumPy RNG is
# consumed in the same order as before.
# NOTE(review): because the two masks are independent, the d2v and n2v models
# are scored on different held-out sets -- confirm that is intended.
mask_d2v = np.random.rand(len(names_d2v)) < 0.85
name_train_d2v = np.array(names_d2v)[mask_d2v]
name_test_d2v = np.array(names_d2v)[~mask_d2v]
mask_n2v = np.random.rand(len(names_n2v)) < 0.85
name_train_n2v = np.array(names_n2v)[mask_n2v]
name_test_n2v = np.array(names_n2v)[~mask_n2v]

# The masks are index-aligned with names_* / vectors_*, so use them directly
# rather than testing `name in <ndarray>` for every name, which is a linear
# scan per lookup and made this cell quadratic in the number of names.
X_train_d2v, Y_train_d2v, X_test_d2v, Y_test_d2v = _split_by_mask(
    names_d2v, vectors_d2v, mask_d2v, name_to_ia)
X_train_n2v, Y_train_n2v, X_test_n2v, Y_test_n2v = _split_by_mask(
    names_n2v, vectors_n2v, mask_n2v, name_to_ia)

# Every name must land in exactly one of the two partitions.
assert len(X_train_d2v) + len(X_test_d2v) == len(names_d2v)
assert len(X_train_n2v) + len(X_test_n2v) == len(names_n2v)

print(len(X_train_d2v),len(Y_train_d2v))
print(len(X_test_d2v),len(Y_test_d2v))
print(len(X_train_n2v),len(Y_train_n2v))
print(len(X_test_n2v),len(Y_test_n2v))

23731 23731
4154 4154
23759 23759
4126 4126


In [37]:
def _make_log_reg_cv():
    """Build the cross-validated multinomial logistic regression used here.

    Both embedding types get an identically configured estimator so that
    their scores differ only because of the input features. ``Cs`` is passed
    by keyword: recent scikit-learn releases make estimator constructor
    parameters keyword-only, so a positional list would be rejected.

    Returns:
        An unfitted ``LogisticRegressionCV`` instance.
    """
    # NOTE(review): the recorded fit output shows "max_iter reached" for
    # several folds, i.e. the sag solver did not converge within the default
    # iteration budget -- consider raising max_iter or scaling the features.
    return LogisticRegressionCV(Cs=[.01,.1,1,10,100],
                                penalty='l2',
                                scoring='accuracy',
                                solver='sag',
                                n_jobs=-1,
                                verbose=1,
                                refit=True,
                                multi_class='multinomial'
                                )


d2v_log_reg = _make_log_reg_cv()
n2v_log_reg = _make_log_reg_cv()

In [38]:
# Fit the Doc2Vec-feature classifier on its training split. As the last
# expression in the cell, the returned estimator's repr is shown as output.
d2v_log_reg.fit(X_train_d2v, Y_train_d2v)

convergence after 63 epochs took 47 seconds
convergence after 64 epochs took 48 seconds
convergence after 64 epochs took 48 seconds
max_iter reached after 77 seconds
max_iter reached after 80 seconds
max_iter reached after 81 seconds
max_iter reached after 75 seconds
max_iter reached after 73 seconds
max_iter reached after 73 seconds
max_iter reached after 68 seconds
max_iter reached after 69 seconds
max_iter reached after 70 seconds
max_iter reached after 72 seconds
max_iter reached after 71 seconds
max_iter reached after 70 seconds


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  5.7min finished


LogisticRegressionCV(Cs=[0.01, 0.1, 1, 10, 100], class_weight=None, cv=None,
           dual=False, fit_intercept=True, intercept_scaling=1.0,
           max_iter=100, multi_class='multinomial', n_jobs=-1,
           penalty='l2', random_state=None, refit=True, scoring='accuracy',
           solver='sag', tol=0.0001, verbose=1)

In [39]:
# Fit the Node2Vec-feature classifier on its training split. As the last
# expression in the cell, the returned estimator's repr is shown as output.
n2v_log_reg.fit(X_train_n2v, Y_train_n2v)

convergence after 20 epochs took 17 seconds
convergence after 20 epochs took 17 seconds
convergence after 20 epochs took 17 seconds
convergence after 40 epochs took 20 seconds
convergence after 39 epochs took 20 seconds
convergence after 41 epochs took 21 seconds
convergence after 74 epochs took 38 seconds
convergence after 74 epochs took 37 seconds
convergence after 76 epochs took 38 seconds
max_iter reached after 52 seconds
max_iter reached after 52 seconds
max_iter reached after 53 seconds
max_iter reached after 58 seconds
max_iter reached after 58 seconds
max_iter reached after 58 seconds


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  3.1min finished


LogisticRegressionCV(Cs=[0.01, 0.1, 1, 10, 100], class_weight=None, cv=None,
           dual=False, fit_intercept=True, intercept_scaling=1.0,
           max_iter=100, multi_class='multinomial', n_jobs=-1,
           penalty='l2', random_state=None, refit=True, scoring='accuracy',
           solver='sag', tol=0.0001, verbose=1)

In [40]:
import pickle

# Persist both fitted classifiers. The context managers guarantee that each
# file is flushed and closed once the dump completes, rather than leaving the
# handle open until garbage collection.
with open('../data/d2v_log_reg_first.pickle', 'wb') as d2v_out:
    pickle.dump(d2v_log_reg, d2v_out)
with open('../data/n2v_log_reg_first.pickle', 'wb') as n2v_out:
    pickle.dump(n2v_log_reg, n2v_out)

In [41]:
# Score the Doc2Vec-feature classifier on its held-out test split.
print(d2v_log_reg.score(X_test_d2v,Y_test_d2v))

0.674049109292


In [42]:
# Score the Node2Vec-feature classifier on its held-out test split.
print(n2v_log_reg.score(X_test_n2v,Y_test_n2v))

0.571013087736


In [43]:
# Optional smoke test of the classifier configuration on synthetic data.
# Disabled by default; flip TESTING to True to run it.
TESTING = False
if TESTING:
    # 3000 samples of 300 uniform features in [0, 100] with random 0/1 labels.
    x = [[random.uniform(0,100) for _ in range(300)] for _ in range(3000)]
    y = [int(random.uniform(0,1) > 0.5) for _ in range(3000)]
    # Cs is passed by keyword: recent scikit-learn releases make estimator
    # constructor parameters keyword-only, so a positional list is rejected.
    a = LogisticRegressionCV(Cs=[.01,.1,1,10,100],
                             penalty='l2',
                             scoring='accuracy',
                             solver='sag',
                             n_jobs=-1,
                             verbose=1,
                             refit=True,
                             multi_class='multinomial'
                            )
    a.fit(x,y)