In [4]:
%matplotlib widget 

# Evaluate Vectors via Logistic Regression

In [5]:
eval_topics_path = 'data_preprocess/data/eval_topics.csv'
node_topics_path = 'data_preprocess/data/node_topics.csv'
nodes_path = 'data_preprocess/data/pypi_nodes.csv'

In [26]:
from evaluate.lr import LREvaluation
import pandas as pd
import evaluate.utils as utils
import numpy as np

In [9]:
LR = LREvaluation(node_topics_path, eval_topics_path)
node_names = list(pd.read_csv(nodes_path, na_filter=False)["nodes"])

## Evaluate TF-IDF Vectors

In [None]:
# Get a name vector dict
tfidf_path = 'vector_generation/data/tfidf/00000/TFIDF'
tfidf_name_vec_dict = utils.get_tsv_dict(f'{tfidf_path}/metadata.tsv', f'{tfidf_path}/tensors.tsv')

In [None]:
# Get features matrix
tfidf_features_matrix = utils.build_features_matrix(
    LR.labeled_node_ids,
    node_names,
    tfidf_name_vec_dict
)

print(tfidf_features_matrix)

In [None]:
# Evaluate!
LR.evaluate(tfidf_features_matrix, num_shuffles=10, training_percents=[0.1, 0.5, 0.9])

## Evaluate Doc2Vec Vectors

In [22]:
# Get a name vector dict
d2v_path = 'vector_generation/data/doc2vec/00000/Doc2Vec_dim_128_epoch_50'
d2v_name_vec_dict = utils.get_tsv_dict(f'{d2v_path}/metadata.tsv', f'{d2v_path}/tensors.tsv')

In [23]:
# Get features matrix
d2v_features_matrix = utils.build_features_matrix(
    LR.labeled_node_ids,
    node_names,
    d2v_name_vec_dict
)

print(d2v_features_matrix)

[[-0.04313257 -0.10160448 -0.01422191 ... -0.06657086  0.01095787
  -0.15670778]
 [-0.1421297   0.24015905 -0.755831   ...  0.21273902  0.08050007
  -0.04276856]
 [ 0.24452315 -0.13549297 -0.06890897 ...  0.32536998 -0.03322582
  -0.17995596]
 ...
 [-0.17609794  0.0298483   0.04648566 ...  0.27582824 -0.16254099
  -0.03240073]
 [ 0.00125125 -0.01431077  0.00368682 ...  0.01446857  0.01123924
  -0.00331963]
 [ 0.42319652 -0.51151925 -0.65454113 ... -0.5228802   0.71550357
  -0.1144278 ]]


In [24]:
# Evaluate!
LR.evaluate(d2v_features_matrix, num_shuffles=2, training_percents=[0.1, 0.5, 0.9])

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
100%|██████████| 3/3 [00:43<00:00, 14.60s/it]

Results, using embeddings of dimensionality 128
-------------------
Train percent: 0.1
-------------------
Shuffle #1:    {'micro': 0.31768602072129554, 'macro': 0.19610663075459114}
Shuffle #2:    {'micro': 0.31870472944184064, 'macro': 0.196687193716647}
Average score: {'micro': 0.3181953750815681, 'macro': 0.19639691223561906}
-------------------
Train percent: 0.5
-------------------
Shuffle #1:    {'micro': 0.4113560527323451, 'macro': 0.3175343453394098}
Shuffle #2:    {'micro': 0.40927716767742994, 'macro': 0.3174465624125932}
Average score: {'micro': 0.4103166102048875, 'macro': 0.3174904538760015}
-------------------
Train percent: 0.9
-------------------
Shuffle #1:    {'micro': 0.4307353346185624, 'macro': 0.33653030201509326}
Shuffle #2:    {'micro': 0.4358407079646018, 'macro': 0.3456678024454734}
Average score: {'micro': 0.43328802129158206, 'macro': 0.34109905223028336}
-------------------





## Evaluate DeepWalk Vectors

In [28]:
# Get a name vector dict
dwlk_path = 'vector_generation/data/deepwalk/00000/deepwalk'
dwlk_name_vec_dict = utils.get_tsv_dict(f'{dwlk_path}/metadata.tsv', f'{dwlk_path}/tensors.tsv')

In [29]:
# Get features matrix
dwlk_features_matrix = utils.build_features_matrix(
    LR.labeled_node_ids,
    node_names,
    dwlk_name_vec_dict
)

print(dwlk_features_matrix)

Couldn't find 2576/95951 vectors in the name_vec_dict provided
Using random vectors in their place
[[ 0.32372928 -0.06645171 -0.04385794 ...  0.10878182  0.09313563
  -0.08789907]
 [ 0.77795282  0.28577447  0.87167607 ...  0.52843138  0.77998984
   0.46045369]
 [ 0.41704962  0.34277883 -0.02421124 ...  0.41972488  0.4007639
  -0.11719696]
 ...
 [ 0.02805469  0.11568511 -0.18876307 ...  0.30574223 -0.06950674
   0.0996941 ]
 [ 0.82481375  0.37239489  0.16097685 ...  0.31024376  0.64665829
   0.36488998]
 [-0.21692657  0.5262821   0.24374469 ... -0.1533963   0.10880067
  -0.19714065]]


In [37]:
# Evaluate!
LR.evaluate(dwlk_features_matrix, num_shuffles=2, training_percents=[0.1, 0.5, 0.9])


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)

 33%|███▎      | 1/3 [00:09<00:19,  9.79s/it][A
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)

100%|██████████| 3/3 [02:21<00:00, 47.03s/it][A

Results, using embeddings of dimensionality 256
-------------------
Train percent: 0.1
-------------------
Shuffle #1:    {'micro': 0.34386061271817403, 'macro': 0.2369503595228126}
Shuffle #2:    {'micro': 0.3448915655321349, 'macro': 0.24047015464522112}
Average score: {'micro': 0.3443760891251545, 'macro': 0.23871025708401686}
-------------------
Train percent: 0.5
-------------------
Shuffle #1:    {'micro': 0.3914783943847335, 'macro': 0.29529402476121575}
Shuffle #2:    {'micro': 0.3883637548891786, 'macro': 0.29311641085547585}
Average score: {'micro': 0.389921074636956, 'macro': 0.2942052178083458}
-------------------
Train percent: 0.9
-------------------
Shuffle #1:    {'micro': 0.40558292282430214, 'macro': 0.30019201950153507}
Shuffle #2:    {'micro': 0.4211233539371137, 'macro': 0.3097594888073434}
Average score: {'micro': 0.4133531383807079, 'macro': 0.3049757541544392}
-------------------





## Evaluate MultiGraph Vectors

Combines dependency graph and language data

In [32]:
# Get a name vector dict
multigraph_path = 'vector_generation/data/multigraph/00000/DeepWalkd2v'
multigraph_name_vec_dict = utils.get_tsv_dict(f'{multigraph_path}/metadata.tsv', f'{multigraph_path}/tensors.tsv')

In [33]:
# Get features matrix
multigraph_features_matrix = utils.build_features_matrix(
    LR.labeled_node_ids,
    node_names,
    multigraph_name_vec_dict
)

print(multigraph_features_matrix)

[[ 0.03707959 -0.02591027 -0.0831027  ...  0.01027415 -0.01948853
  -0.04744922]
 [-0.00341722 -0.00381578  0.00172656 ... -0.00309118  0.00040134
  -0.00212493]
 [ 0.0034879  -0.00499655  0.00109166 ... -0.00500495  0.00429559
  -0.00250053]
 ...
 [-0.05822385  0.00819831  0.04709605 ... -0.03797765 -0.09110721
   0.02847548]
 [ 0.00300985 -0.00302985 -0.00024235 ...  0.00252268 -0.0025351
   0.00141764]
 [ 0.04166736 -0.00035233  0.0085204  ...  0.0206775   0.00536029
   0.02211291]]


In [34]:
# Evaluate!
LR.evaluate(multigraph_features_matrix, num_shuffles=2, training_percents=[0.1, 0.5, 0.9])

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
100%|██████████| 3/3 [00:49<00:00, 16.45s/it]

Results, using embeddings of dimensionality 128
-------------------
Train percent: 0.1
-------------------
Shuffle #1:    {'micro': 0.09206242964495422, 'macro': 0.004428734530203636}
Shuffle #2:    {'micro': 0.09169497694734287, 'macro': 0.0045058632394111885}
Average score: {'micro': 0.09187870329614854, 'macro': 0.004467298884807413}
-------------------
Train percent: 0.5
-------------------
Shuffle #1:    {'micro': 0.09075961854653075, 'macro': 0.0045681329486234115}
Shuffle #2:    {'micro': 0.09045280960174579, 'macro': 0.004782274253442248}
Average score: {'micro': 0.09060621407413827, 'macro': 0.00467520360103283}
-------------------
Train percent: 0.9
-------------------
Shuffle #1:    {'micro': 0.09254218835057158, 'macro': 0.0046973546434318945}
Shuffle #2:    {'micro': 0.09357045143638851, 'macro': 0.004633679979454821}
Average score: {'micro': 0.09305631989348004, 'macro': 0.004665517311443358}
-------------------





## Evaluate on Random Vectors

In [36]:
LR.evaluate(
    np.random.randn(len(LR.labeled_node_ids), 256),
    num_shuffles=2,
    training_percents=[0.1, 0.5, 0.9]
)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)

 33%|███▎      | 1/3 [00:09<00:19,  9.93s/it][A
 67%|██████▋   | 2/3 [00:31<00:13, 13.47s/it][A
100%|██████████| 3/3 [00:53<00:00, 17.76s/it][A

Results, using embeddings of dimensionality 256
-------------------
Train percent: 0.1
-------------------
Shuffle #1:    {'micro': 0.03971898664882455, 'macro': 0.011291785288605298}
Shuffle #2:    {'micro': 0.04139585737714876, 'macro': 0.011751829530631643}
Average score: {'micro': 0.04055742201298665, 'macro': 0.01152180740961847}
-------------------
Train percent: 0.5
-------------------
Shuffle #1:    {'micro': 0.0235038084874864, 'macro': 0.01369551040881235}
Shuffle #2:    {'micro': 0.024331301147910146, 'macro': 0.01312093035236237}
Average score: {'micro': 0.02391755481769827, 'macro': 0.013408220380587359}
-------------------
Train percent: 0.9
-------------------
Shuffle #1:    {'micro': 0.045530260966129936, 'macro': 0.012903601723521736}
Shuffle #2:    {'micro': 0.045682451253481894, 'macro': 0.013404567940194557}
Average score: {'micro': 0.045606356109805915, 'macro': 0.013154084831858147}
-------------------





## Evaluate on Concatenated Vectors

We're using different data sources - why not combine them?

### Deepwalk and TF-IDF

In [None]:
import numpy as np

dwlk_tfidf_features = np.concatenate((dwlk_features_matrix, tfidf_features_matrix), axis=1)

print(dwlk_features_matrix.shape)
print(tfidf_features_matrix.shape)
print(dwlk_tfidf_features.shape)

In [None]:
# Evaluate!
LR.evaluate(dwlk_tfidf_features, num_shuffles=3, training_percents=[0.1, 0.5, 0.9])

### Deepwalk and Doc2Vec

In [30]:
dwlk_d2v_features = np.concatenate((dwlk_features_matrix, d2v_features_matrix), axis=1)

print(dwlk_features_matrix.shape)
print(d2v_features_matrix.shape)
print(dwlk_d2v_features.shape)

(19482, 256)
(19482, 128)
(19482, 384)


In [31]:
# Evaluate!
LR.evaluate(dwlk_d2v_features, num_shuffles=2, training_percents=[0.1, 0.5, 0.9])

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
100%|██████████| 3/3 [03:00<00:00, 60.23s/it]

Results, using embeddings of dimensionality 384
-------------------
Train percent: 0.1
-------------------
Shuffle #1:    {'micro': 0.38645249280193966, 'macro': 0.27893530968582797}
Shuffle #2:    {'micro': 0.3832125677403641, 'macro': 0.2658285507904139}
Average score: {'micro': 0.3848325302711519, 'macro': 0.27238193023812096}
-------------------
Train percent: 0.5
-------------------
Shuffle #1:    {'micro': 0.46195652173913043, 'macro': 0.38018466638963944}
Shuffle #2:    {'micro': 0.45665245435661966, 'macro': 0.37943457233164596}
Average score: {'micro': 0.45930448804787505, 'macro': 0.3798096193606427}
-------------------
Train percent: 0.9
-------------------
Shuffle #1:    {'micro': 0.49018003273322425, 'macro': 0.41006610412142275}
Shuffle #2:    {'micro': 0.478587319243604, 'macro': 0.3951758811509753}
Average score: {'micro': 0.48438367598841414, 'macro': 0.402620992636199}
-------------------





## Evaluate on Added Vectors

We're using different data sources - why not combine them?

### Deepwalk and TF-IDF Add

In [None]:
dwlk_tfidf_features_add = dwlk_features_matrix + tfidf_features_matrix

print(dwlk_features_matrix.shape)
print(tfidf_features_matrix.shape)
print(dwlk_tfidf_features_add.shape)

In [None]:
# Evaluate!
LR.evaluate(dwlk_tfidf_features_add, num_shuffles=3, training_percents=[0.1, 0.5, 0.9])

### Deepwalk and Doc2Vec Add

In [None]:
dwlk_d2v_features_add = dwlk_features_matrix + d2v_features_matrix

print(dwlk_features_matrix.shape)
print(d2v_features_matrix.shape)
print(dwlk_d2v_features_add.shape)

In [None]:
# Evaluate!
LR.evaluate(dwlk_d2v_features_add, num_shuffles=1, training_percents=[0.1])#, 0.5, 0.9])

In [11]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

array = [[13,1,1,0,2,0],
     [3,9,6,0,1,0],
     [0,0,16,2,0,0],
     [0,0,0,13,0,0],
     [0,0,0,0,15,0],
     [0,0,1,0,0,15]]        
df_cm = pd.DataFrame(array, range(6),
                  range(6))
#plt.figure(figsize = (10,7))
sn.set(font_scale=1.4)#for label size
sn.heatmap(df_cm, annot=True,annot_kws={"size": 8})# font size
sn.plt.show()

AttributeError: module 'seaborn' has no attribute 'plt'