In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix 

## Sparse matrix

In [2]:
df1 = pd.read_csv('../Data/1990_2000_1_filtered_authorships.csv')
df2 = pd.read_csv('../Data/1990_2000_2_filtered_authorships.csv')
df3 = pd.read_csv('../Data/2001_2002_filtered_authorships.csv')
df4 = pd.read_csv('../Data/2003_2004_filtered_authorships.csv')

df = pd.concat([df1, df2, df3, df4])
df.head()

Unnamed: 0,id_article,author,year
0,conf/issac/Kajler92,Norbert Kajler,1992
1,conf/issac/BiniP90,Dario Bini,1990
2,conf/issac/BiniP90,Victor Y. Pan,1990
3,conf/issac/SteinZ91,Andreas Stein,1991
4,conf/issac/SteinZ91,Horst Günter Zimmer,1991


In [3]:
df['id_article'].unique().size, df['author'].unique().size

(712350, 93912)

In [4]:
authors_dict = dict(zip(df['author'].unique(), np.arange(df['author'].unique().size)))
articles_dict = dict(zip(df['id_article'].unique(), np.arange(df['id_article'].unique().size)))

In [5]:
rows = [authors_dict[x] for x in df['author'].values]
cols = [articles_dict[x] for x in df['id_article'].values]
data = np.ones(df.shape[0])

X = csr_matrix((data, (rows, cols)))

## SVD

In [6]:
from sklearn.decomposition import TruncatedSVD

In [7]:
svd = TruncatedSVD(n_components=10, n_iter=10, random_state=42)
svd.fit(X)

TruncatedSVD(n_components=10, n_iter=10, random_state=42)

In [8]:
X_svd = svd.transform(X)
X_svd.shape

(93912, 10)

## Evaluation

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

from sklearn.metrics import classification_report, f1_score, accuracy_score

In [10]:
df_test = pd.read_csv('../Data/sample_features_test2021-01-05.csv')
df_test = df_test[['source', 'target', 'connected']]
df_test.head()

Unnamed: 0,source,target,connected
0,Robert Milne,Louise Travé-Massuyès,1.0
1,Nando de Freitas,Arnaud Doucet,1.0
2,Yoshifumi Ooyama,Satoshi Shirai,1.0
3,Leen-Kiat Soh,Costas Tsatsoulis,1.0
4,Ya Xu,Deborah Estrin,1.0


### Cosine similarity

In [11]:
y_pred = []
for source, target in zip(df_test['source'], df_test['target']):
    X_source = X_svd[authors_dict[source], :].reshape(1,-1)
    X_target = X_svd[authors_dict[target], :].reshape(1,-1)

    cos_sim = cosine_similarity(X_source, X_target)[0][0]
    pred = 1 if cos_sim > 0.5 else 0
    y_pred.append(pred)

df_test['cosine'] = y_pred

In [12]:
print(classification_report(df_test['connected'], df_test['cosine']))
print('F1: {:.4f}'.format(f1_score(df_test['connected'], df_test['cosine'])))
print('Accuracy: {:.4f}'.format(accuracy_score(df_test['connected'], df_test['cosine'])))

              precision    recall  f1-score   support

         0.0       0.79      0.83      0.81      5000
         1.0       0.82      0.77      0.79      4860

    accuracy                           0.80      9860
   macro avg       0.80      0.80      0.80      9860
weighted avg       0.80      0.80      0.80      9860

F1: 0.7944
Accuracy: 0.8025


### Euclidean

In [13]:
y_pred = []
for source, target in zip(df_test['source'], df_test['target']):
    X_source = X_svd[authors_dict[source], :].reshape(1,-1)
    X_target = X_svd[authors_dict[target], :].reshape(1,-1)

    euclidean_dist = euclidean_distances(X_source, X_target)[0][0]
    y_pred.append(euclidean_dist)

# Es mejor normalizar para poder hacer la comparación
y_pred = np.array(y_pred)/np.max(y_pred)
y_pred = [1 if y < 0.5 else 0 for y in y_pred]

df_test['euclidean'] = y_pred

In [14]:
print(classification_report(df_test['connected'], df_test['euclidean']))
print('F1: {:.4f}'.format(f1_score(df_test['connected'], df_test['euclidean'])))
print('Accuracy: {:.4f}'.format(accuracy_score(df_test['connected'], df_test['euclidean'])))

              precision    recall  f1-score   support

         0.0       0.05      0.00      0.00      5000
         1.0       0.49      1.00      0.66      4860

    accuracy                           0.49      9860
   macro avg       0.27      0.50      0.33      9860
weighted avg       0.27      0.49      0.32      9860

F1: 0.6586
Accuracy: 0.4911
