In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix 

## Sparse matrix

In [4]:
# Load the authorship table. The preview below shows the columns id_article,
# author and year, with one row per (article, author) pair.
df = pd.read_csv('1990_2000_filtered_authorships.csv')

# Preview the first rows.
df.head()

Unnamed: 0,id_article,author,year
0,conf/issac/Kajler92,Norbert Kajler,1992
1,conf/issac/BiniP90,Dario Bini,1990
2,conf/issac/BiniP90,Victor Y. Pan,1990
3,conf/issac/SteinZ91,Andreas Stein,1991
4,conf/issac/SteinZ91,Horst Günter Zimmer,1991


In [5]:
# Number of distinct articles and distinct authors, in that order.
tuple(df[column].unique().size for column in ('id_article', 'author'))

(423380, 93912)

In [6]:
# Give every distinct author and every distinct article a consecutive integer
# id (0, 1, 2, ...) following the order returned by unique().
authors_dict = {
    author: position
    for author, position in zip(df['author'].unique(), np.arange(df['author'].unique().size))
}
articles_dict = {
    article: position
    for article, position in zip(df['id_article'].unique(), np.arange(df['id_article'].unique().size))
}

In [7]:
# Translate every authorship record into an (author row, article column) pair.
rows = list(map(authors_dict.__getitem__, df['author'].values))
cols = list(map(articles_dict.__getitem__, df['id_article'].values))
# One unit entry per authorship record.
data = np.ones(len(df))

# Author-by-article matrix in CSR format; its shape is inferred from the indices.
X = csr_matrix((data, (rows, cols)))

## SVD

In [8]:
from sklearn.decomposition import TruncatedSVD

In [30]:
# Truncated SVD keeping 30 components of the author-by-article matrix X.
# random_state is fixed so repeated runs produce the same decomposition.
svd = TruncatedSVD(n_components=30, n_iter=10, random_state=42)
# Fit on the full sparse matrix; the fitted estimator is displayed as the cell output.
svd.fit(X)

TruncatedSVD(n_components=30, n_iter=10, random_state=42)

In [31]:
# Project every row of X (one row per author) onto the fitted components.
X_svd = svd.transform(X)
# Sanity check: one 30-dimensional embedding per author.
X_svd.shape

(93912, 30)

In [35]:
import pickle

# Persist the author embeddings to disk in binary pickle format.
with open('SVD.pickle', mode='wb') as pickle_file:
    pickle.dump(X_svd, pickle_file)

## Evaluation

In [57]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

from sklearn.metrics import classification_report, f1_score, accuracy_score

In [33]:
# Load the labelled test pairs and keep only the author names and the label.
df_test = pd.read_csv('sample_features_test2021-01-05.csv')[['source', 'target', 'connected']]
df_test.head()

Unnamed: 0,source,target,connected
0,Robert Milne,Louise Travé-Massuyès,1.0
1,Nando de Freitas,Arnaud Doucet,1.0
2,Yoshifumi Ooyama,Satoshi Shirai,1.0
3,Leen-Kiat Soh,Costas Tsatsoulis,1.0
4,Ya Xu,Deborah Estrin,1.0


### Cosine similarity

In [28]:
# Cosine similarity between the SVD embeddings of each (source, target) pair,
# produced lazily one pair at a time.
pair_similarities = (
    cosine_similarity(
        X_svd[authors_dict[source], :].reshape(1, -1),
        X_svd[authors_dict[target], :].reshape(1, -1),
    )[0][0]
    for source, target in zip(df_test['source'], df_test['target'])
)

# Predict a link (1) when the similarity is strictly above 0.5, otherwise 0.
y_pred = [int(score > 0.5) for score in pair_similarities]

df_test['cosine'] = y_pred

In [29]:
# Per-class precision/recall table, followed by the headline F1 and accuracy
# of the cosine-based predictions against the 'connected' label.
print(classification_report(df_test['connected'], df_test['cosine']))
for template, scorer in (('F1: {:.4f}', f1_score), ('Accuracy: {:.4f}', accuracy_score)):
    print(template.format(scorer(df_test['connected'], df_test['cosine'])))

              precision    recall  f1-score   support

         0.0       0.72      0.98      0.83      5000
         1.0       0.97      0.60      0.74      4860

    accuracy                           0.79      9860
   macro avg       0.84      0.79      0.79      9860
weighted avg       0.84      0.79      0.79      9860

F1: 0.7433
Accuracy: 0.7948


### Euclidean

In [36]:
# Euclidean distance between the SVD embeddings of each (source, target) pair.
y_pred = [
    euclidean_distances(
        X_svd[authors_dict[source], :].reshape(1, -1),
        X_svd[authors_dict[target], :].reshape(1, -1),
    )[0][0]
    for source, target in zip(df_test['source'], df_test['target'])
]

# It is better to normalise so that the comparison can be made: every distance
# is divided by the largest one, which maps the values into [0, 1].
y_pred = np.array(y_pred) / np.max(y_pred)
# Predict a link (1) when the scaled distance is strictly below 0.5, otherwise 0.
y_pred = np.where(y_pred < 0.5, 1, 0).tolist()

df_test['euclidean'] = y_pred

In [37]:
# Per-class precision/recall table, followed by the headline F1 and accuracy
# of the distance-based predictions against the 'connected' label.
print(classification_report(df_test['connected'], df_test['euclidean']))
for template, scorer in (('F1: {:.4f}', f1_score), ('Accuracy: {:.4f}', accuracy_score)):
    print(template.format(scorer(df_test['connected'], df_test['euclidean'])))

              precision    recall  f1-score   support

         0.0       0.02      0.00      0.00      5000
         1.0       0.49      0.99      0.66      4860

    accuracy                           0.49      9860
   macro avg       0.25      0.49      0.33      9860
weighted avg       0.25      0.49      0.32      9860

F1: 0.6557
Accuracy: 0.4878


In [38]:
# Load the training pairs together with their precomputed graph features.
df_samples = pd.read_csv("sample_features2021-02-08.csv")
# Display the frame (the notebook truncates the middle rows).
df_samples

Unnamed: 0.1,Unnamed: 0,source,target,connected,sum_of_papers,sum_of_neighbors,log_secundary_neighbors,lenght_short_path,clustering_index_sum
0,0,Michael Barnett 0001,Christian Lengauer,1.0,57,25,10.244236,1.0,0.320879
1,1,Mark Vriesenga,Kalman Peleg,1.0,11,8,7.544332,1.0,1.400000
2,2,Matt Blaze,Joan Feigenbaum,1.0,78,69,13.227696,1.0,0.314291
3,3,Majid Mirmehdi,John F. Haddon,1.0,35,26,9.679719,1.0,0.644946
4,4,Richard Durbin,Erik L. L. Sonnhammer,1.0,19,25,10.301928,1.0,1.115789
...,...,...,...,...,...,...,...,...,...
149252,149252,Tetsuya Iwasaki,Yoshihiro Sekiguchi,0.0,14,9,5.877736,8.0,1.142857
149253,149253,Quin Cai,Mary Sheeran,0.0,15,9,7.408531,8.0,1.066667
149254,149254,Myong-Soon Park,Philipp Hoschka,0.0,30,15,7.859413,6.0,0.179487
149255,149255,Karsten Müller 0001,David O'Sullivan,0.0,9,6,3.951244,,1.000000


In [39]:
# Remove the redundant 'Unnamed: 0' column, which only repeats the row index.
# errors='ignore' keeps this cell idempotent: re-running it once the column is
# already gone is a no-op instead of a KeyError.
df_samples = df_samples.drop(columns=['Unnamed: 0'], errors='ignore')

In [40]:
# Display the frame again to confirm which columns it now contains.
df_samples

Unnamed: 0,source,target,connected,sum_of_papers,sum_of_neighbors,log_secundary_neighbors,lenght_short_path,clustering_index_sum
0,Michael Barnett 0001,Christian Lengauer,1.0,57,25,10.244236,1.0,0.320879
1,Mark Vriesenga,Kalman Peleg,1.0,11,8,7.544332,1.0,1.400000
2,Matt Blaze,Joan Feigenbaum,1.0,78,69,13.227696,1.0,0.314291
3,Majid Mirmehdi,John F. Haddon,1.0,35,26,9.679719,1.0,0.644946
4,Richard Durbin,Erik L. L. Sonnhammer,1.0,19,25,10.301928,1.0,1.115789
...,...,...,...,...,...,...,...,...
149252,Tetsuya Iwasaki,Yoshihiro Sekiguchi,0.0,14,9,5.877736,8.0,1.142857
149253,Quin Cai,Mary Sheeran,0.0,15,9,7.408531,8.0,1.066667
149254,Myong-Soon Park,Philipp Hoschka,0.0,30,15,7.859413,6.0,0.179487
149255,Karsten Müller 0001,David O'Sullivan,0.0,9,6,3.951244,,1.000000


In [41]:
# Integer id (row of X) of the source author in the first training pair.
authors_dict[df_samples['source'].iloc[0]]

6550

In [49]:
# Dense 1-D view of row 6550 of the author-by-article matrix.
X[6550, :].toarray().ravel()

array([0., 0., 0., ..., 0., 0., 0.])

In [44]:
# A single row keeps two dimensions: one row by the number of article columns.
X[6550, :].shape

(1, 423380)

In [53]:
# Sanity check: the cosine similarity of a row with itself should be ~1.
dense_row = X[6550].toarray()
cosine_similarity(dense_row, dense_row)[0][0]

0.9999999999999998

In [59]:
# Embedding of row 6550 alone, obtained by projecting that one sparse row.
svd.transform(X[6550, :])

array([[-1.50148562e-07, -2.12608111e-07,  3.41012541e-06,
        -3.15766410e-05, -5.50664436e-05,  4.62484676e-05,
         8.69253189e-05,  1.11133175e-05, -4.08750459e-05,
         1.84541500e-05,  1.54978476e-04, -2.96414152e-05,
         2.55917655e-04,  3.35604050e-04, -4.69724864e-05,
        -3.94651564e-05, -1.96943551e-04,  6.92527912e-04,
        -3.04539957e-04,  3.61426089e-04, -7.08437168e-04,
        -1.24397496e-03, -1.56942091e-04,  6.12047311e-04,
        -3.11775099e-04, -1.90415217e-03, -6.38611308e-05,
         8.41038852e-04,  2.22574517e-05,  1.47601233e-03]])

In [61]:
# Cosine similarity between the SVD embeddings of each (source, target) pair.
# X_svd already holds svd.transform(X) for every author row, so the embeddings
# are looked up there instead of projecting one sparse row at a time (two
# svd.transform calls per pair) inside a DataFrame.iterrows() loop.
source_vecs = X_svd[[authors_dict[name] for name in df_samples['source']]]
target_vecs = X_svd[[authors_dict[name] for name in df_samples['target']]]

# Row-wise cosine: scale each embedding to unit length, then take the dot
# product of matching rows. A zero-length embedding keeps its zeros (its norm
# is replaced by 1), so the similarity is 0 rather than NaN.
source_norms = np.linalg.norm(source_vecs, axis=1, keepdims=True)
target_norms = np.linalg.norm(target_vecs, axis=1, keepdims=True)
source_norms[source_norms == 0] = 1.0
target_norms[target_norms == 0] = 1.0

# Plain list of floats, in the same row order as df_samples.
simil = np.einsum(
    'ij,ij->i', source_vecs / source_norms, target_vecs / target_norms
).tolist()

In [62]:
# Peek at the first ten pairwise similarities.
simil[:10]

[0.9867884489856374,
 0.9999581006747635,
 0.8422736016116948,
 0.9998556617183771,
 0.5102176893076641,
 0.9834442889813239,
 0.9982271863773484,
 0.3174079399695275,
 0.9242533273482424,
 0.613894735677866]

In [63]:
# Store the similarities as the 'cos_sim' feature column; simil was built in
# the row order of df_samples, so the values line up positionally.
df_samples['cos_sim'] = simil

In [64]:
# Load the test pairs together with their precomputed graph features.
df_samples_t = pd.read_csv("sample_features_test2021-01-06.csv")
# Display the frame (the notebook truncates the middle rows).
df_samples_t

Unnamed: 0.1,Unnamed: 0,source,target,connected,sum_of_papers,sum_of_neighbors,log_secundary_neighbors,lenght_short_path,clustering_index_sum
0,0,Hideaki Takanobu,Kayoko Ohtsuki,1.0,17,34,10.757477,1.0,1.407389
1,1,Joseph Douglas Horton,Alejandro López-Ortiz,1.0,26,25,11.075040,1.0,0.752381
2,2,Siddhartha R. Dalal,Ashish Jain,1.0,19,11,7.494430,1.0,0.800000
3,3,Laurence Melloul,Armando Fox,1.0,21,16,8.034955,1.0,1.153846
4,4,Riccardo Bettati,Wei Zhao 0001,1.0,76,25,9.294498,1.0,1.109524
...,...,...,...,...,...,...,...,...,...
38098,38098,Florian Schiel,Walt Truszkowski,0.0,21,10,5.023881,9.0,0.300000
38099,38099,Goffredo Haus,Mizuho Iwaihara,0.0,10,9,7.339538,6.0,0.321429
38100,38100,Joe L. Armstrong,George T. Duncan,0.0,13,0,0.000000,,0.000000
38101,38101,Wayne C. Boncyk,Mike Hinchey,0.0,6,2,2.197225,,1.000000


In [65]:
# Remove the redundant 'Unnamed: 0' column, which only repeats the row index.
# errors='ignore' keeps this cell idempotent: re-running it once the column is
# already gone is a no-op instead of a KeyError.
df_samples_t = df_samples_t.drop(columns=['Unnamed: 0'], errors='ignore')

In [66]:
# Cosine similarity between the SVD embeddings of each (source, target) pair
# of the test frame. X_svd already holds svd.transform(X) for every author
# row, so the embeddings are looked up there instead of projecting one sparse
# row at a time (two svd.transform calls per pair) inside an iterrows() loop.
source_vecs = X_svd[[authors_dict[name] for name in df_samples_t['source']]]
target_vecs = X_svd[[authors_dict[name] for name in df_samples_t['target']]]

# Row-wise cosine: scale each embedding to unit length, then take the dot
# product of matching rows. A zero-length embedding keeps its zeros (its norm
# is replaced by 1), so the similarity is 0 rather than NaN.
source_norms = np.linalg.norm(source_vecs, axis=1, keepdims=True)
target_norms = np.linalg.norm(target_vecs, axis=1, keepdims=True)
source_norms[source_norms == 0] = 1.0
target_norms[target_norms == 0] = 1.0

# Plain list of floats, in the same row order as df_samples_t.
simil = np.einsum(
    'ij,ij->i', source_vecs / source_norms, target_vecs / target_norms
).tolist()

In [67]:
# Store the similarities as the 'cos_sim' feature column of the test frame;
# simil was built in the row order of df_samples_t.
df_samples_t['cos_sim'] = simil

In [69]:
# Write the training features to disk; index=False leaves the row index out
# of the file.
df_samples.to_csv('training.csv', index=False)

In [70]:
# Write the test features to disk; index=False leaves the row index out of
# the file.
df_samples_t.to_csv('test.csv', index=False)