In [34]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix 
from tqdm import tqdm

# SVD

Una aproximación es el uso de una SVD como método de extracción de tópicos por autor. Si a un autor se le ve como un conjunto de títulos en los que ha colaborado, se puede generar una matriz binaria de co-ocurrencia con autores en las filas y artículos en las columnas con dimensiones $93,912 \times 423,380$.

- [Sparse matrix](#Sparse-matrix)
- [SVD](#SVD)
- [Evaluation](#Evaluation)
  - [Cosine similarity](#Cosine)
  - [Euclidean distance](#Euclidean)
- [Train-test](#Train-test)

<a name="Sparse-matrix"></a>
## Sparse matrix

Debido a que la matriz es gigantesca y la mayoría de sus elementos son cero se debe tratar como una matriz sparse.

In [7]:
# Both halves of the 1990-2000 authorship export share the same columns,
# so they are read separately and stacked into a single frame.
authorship_files = [
    '../Data/1990_2000_1_filtered_authorships.csv',
    '../Data/1990_2000_2_filtered_authorships.csv',
]
df1, df2 = map(pd.read_csv, authorship_files)

df = pd.concat([df1, df2])
df.head()

Unnamed: 0,id_article,author,year
0,conf/issac/Kajler92,Norbert Kajler,1992
1,conf/issac/BiniP90,Dario Bini,1990
2,conf/issac/BiniP90,Victor Y. Pan,1990
3,conf/issac/SteinZ91,Andreas Stein,1991
4,conf/issac/SteinZ91,Horst Günter Zimmer,1991


In [8]:
# (number of distinct articles, number of distinct authors)
len(df['id_article'].unique()), len(df['author'].unique())

(423380, 93912)

In [9]:
# Map every distinct author / article to a consecutive integer id,
# in order of first appearance in df.
unique_authors = df['author'].unique()
unique_articles = df['id_article'].unique()

authors_dict = dict(zip(unique_authors, np.arange(unique_authors.size)))
articles_dict = dict(zip(unique_articles, np.arange(unique_articles.size)))

In [10]:
# Build the author x article co-occurrence matrix: one entry per authorship
# record, with authors on the rows and articles on the columns.
rows = [authors_dict[x] for x in df['author'].values]
cols = [articles_dict[x] for x in df['id_article'].values]
data = np.ones(df.shape[0])

# State the shape explicitly instead of letting it be inferred from the
# largest index that happens to appear in rows/cols.
X = csr_matrix((data, (rows, cols)), shape=(len(authors_dict), len(articles_dict)))

# csr_matrix sums repeated (row, col) pairs, so a duplicated authorship record
# would yield a count > 1. Clamp the stored values so the matrix stays binary.
X.data[:] = 1

<a name="SVD"></a>
## SVD

Nos quedaremos únicamente con las primeras 30 componentes de la SVD.

In [14]:
from sklearn.decomposition import TruncatedSVD
import pickle

In [15]:
# Keep the first 30 components; the fixed random_state makes the randomized
# solver reproducible across runs.
svd_params = dict(n_components=30, n_iter=10, random_state=42)
svd = TruncatedSVD(**svd_params).fit(X)
svd

TruncatedSVD(n_components=30, n_iter=10, random_state=42)

In [16]:
# Project every row of X (one row per author) onto the fitted SVD components,
# giving one dense embedding per author.
X_svd = svd.transform(X)
X_svd.shape

(93912, 30)

In [17]:
# Persist the author embeddings so they can be reloaded without refitting.
with open('SVD.pickle', 'wb') as svd_file:
    pickle.dump(X_svd, svd_file)

<a name="Evaluation"></a>
## Evaluation

In [18]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

from sklearn.metrics import classification_report, f1_score, accuracy_score

In [20]:
# Only the author pair and its ground-truth label are needed for evaluation.
test_columns = ['source', 'target', 'connected']
df_test = pd.read_csv('../Data/sample_features_test2021-01-05.csv')[test_columns]
df_test.head()

Unnamed: 0,source,target,connected
0,Robert Milne,Louise Travé-Massuyès,1.0
1,Nando de Freitas,Arnaud Doucet,1.0
2,Yoshifumi Ooyama,Satoshi Shirai,1.0
3,Leen-Kiat Soh,Costas Tsatsoulis,1.0
4,Ya Xu,Deborah Estrin,1.0


<a name="Cosine"></a>
### Cosine similarity

Cuando se trata de vectores dispersos, lo mejor es usar la similitud coseno.

In [21]:
# Predict a link (1) when the cosine similarity between the two authors'
# SVD embeddings exceeds 0.5, otherwise predict no link (0).
y_pred = [
    1 if cosine_similarity(
        X_svd[authors_dict[src], :].reshape(1, -1),
        X_svd[authors_dict[tgt], :].reshape(1, -1),
    )[0][0] > 0.5 else 0
    for src, tgt in zip(df_test['source'], df_test['target'])
]

df_test['cosine'] = y_pred

In [22]:
# Per-class report followed by the two headline metrics for the cosine rule.
y_true, y_cosine = df_test['connected'], df_test['cosine']

print(classification_report(y_true, y_cosine))
for template, metric in (('F1: {:.4f}', f1_score), ('Accuracy: {:.4f}', accuracy_score)):
    print(template.format(metric(y_true, y_cosine)))

              precision    recall  f1-score   support

         0.0       0.72      0.94      0.82      5000
         1.0       0.91      0.63      0.74      4860

    accuracy                           0.78      9860
   macro avg       0.81      0.78      0.78      9860
weighted avg       0.81      0.78      0.78      9860

F1: 0.7417
Accuracy: 0.7845


<a name="Euclidean"></a>
### Euclidean distance

La distancia euclidiana no funciona tan bien como la similitud coseno.

In [23]:
# Euclidean distance between the SVD embeddings of each (source, target) pair.
pair_distances = np.array([
    euclidean_distances(
        X_svd[authors_dict[src], :].reshape(1, -1),
        X_svd[authors_dict[tgt], :].reshape(1, -1),
    )[0][0]
    for src, tgt in zip(df_test['source'], df_test['target'])
])

# Normalizing by the largest distance puts the values on a comparable scale
# so the 0.5 cut-off can be applied; a small distance means "connected".
scaled_distances = pair_distances / pair_distances.max()
y_pred = [1 if dist < 0.5 else 0 for dist in scaled_distances]

df_test['euclidean'] = y_pred

In [24]:
# Per-class report followed by the two headline metrics for the Euclidean rule.
y_true, y_euclidean = df_test['connected'], df_test['euclidean']

print(classification_report(y_true, y_euclidean))
for template, metric in (('F1: {:.4f}', f1_score), ('Accuracy: {:.4f}', accuracy_score)):
    print(template.format(metric(y_true, y_euclidean)))

              precision    recall  f1-score   support

         0.0       0.02      0.00      0.00      5000
         1.0       0.49      0.99      0.66      4860

    accuracy                           0.49      9860
   macro avg       0.25      0.49      0.33      9860
weighted avg       0.25      0.49      0.32      9860

F1: 0.6557
Accuracy: 0.4878


<a name="Train-test"></a>
## Train-test

Ahora le añadiremos una característica más a nuestros datos: su similitud coseno. 

In [48]:
# Training pairs with their precomputed graph features.
samples_path = "../Data/sample_features2021-02-08.csv"
df_samples = pd.read_csv(samples_path)
df_samples.head()

Unnamed: 0,source,target,connected,sum_of_papers,sum_of_neighbors,log_secundary_neighbors,lenght_short_path,clustering_index_sum
0,Michael Barnett 0001,Christian Lengauer,1,57,25,10.244236,1.0,0.320879
1,Mark Vriesenga,Kalman Peleg,1,11,8,7.544332,1.0,1.4
2,Matt Blaze,Joan Feigenbaum,1,78,69,13.227696,1.0,0.314291
3,Majid Mirmehdi,John F. Haddon,1,35,26,9.679719,1.0,0.644946
4,Richard Durbin,Erik L. L. Sonnhammer,1,19,25,10.301928,1.0,1.115789


In [52]:
# Cosine similarity between the SVD embeddings of each training pair.
# X_svd already holds svd.transform(X) for every author, so the rows are looked
# up there instead of re-projecting two sparse rows per pair; iterating over
# the two columns directly also avoids the per-row overhead of iterrows().
simil = []
for source, target in tqdm(zip(df_samples['source'], df_samples['target']), total=len(df_samples)):
    a = X_svd[authors_dict[source], :].reshape(1, -1)
    b = X_svd[authors_dict[target], :].reshape(1, -1)
    simil.append(cosine_similarity(a, b)[0][0])

100%|██████████| 149257/149257 [00:06<00:00, 23269.41it/s]


In [63]:
# Attach the similarities computed for the training pairs as a new feature column.
# NOTE(review): `simil` is reassigned later by the loop over df_samples_t, so this
# cell must run right after the df_samples loop; running cells out of order would
# attach (or fail to attach) the wrong list.
df_samples['cos_sim'] = simil

In [53]:
# Test pairs; the CSV carries a saved index column ('Unnamed: 0') that is discarded.
df_samples_t = (
    pd.read_csv("../Data/sample_features_test2021-01-06.csv")
    .drop(columns=['Unnamed: 0'])
)
df_samples_t

Unnamed: 0,source,target,connected,sum_of_papers,sum_of_neighbors,log_secundary_neighbors,lenght_short_path,clustering_index_sum
0,Hideaki Takanobu,Kayoko Ohtsuki,1.0,17,34,10.757477,1.0,1.407389
1,Joseph Douglas Horton,Alejandro López-Ortiz,1.0,26,25,11.075040,1.0,0.752381
2,Siddhartha R. Dalal,Ashish Jain,1.0,19,11,7.494430,1.0,0.800000
3,Laurence Melloul,Armando Fox,1.0,21,16,8.034955,1.0,1.153846
4,Riccardo Bettati,Wei Zhao 0001,1.0,76,25,9.294498,1.0,1.109524
...,...,...,...,...,...,...,...,...
38098,Florian Schiel,Walt Truszkowski,0.0,21,10,5.023881,9.0,0.300000
38099,Goffredo Haus,Mizuho Iwaihara,0.0,10,9,7.339538,6.0,0.321429
38100,Joe L. Armstrong,George T. Duncan,0.0,13,0,0.000000,,0.000000
38101,Wayne C. Boncyk,Mike Hinchey,0.0,6,2,2.197225,,1.000000


In [45]:
# Cosine similarity between the SVD embeddings of each test pair.
# X_svd already holds svd.transform(X) for every author, so the rows are looked
# up there instead of re-projecting two sparse rows per pair; iterating over
# the two columns directly also avoids the per-row overhead of iterrows().
simil = []
for source, target in tqdm(zip(df_samples_t['source'], df_samples_t['target']), total=len(df_samples_t)):
    a = X_svd[authors_dict[source], :].reshape(1, -1)
    b = X_svd[authors_dict[target], :].reshape(1, -1)
    simil.append(cosine_similarity(a, b)[0][0])

100%|██████████| 38103/38103 [00:01<00:00, 22561.01it/s]


In [67]:
# Attach the similarities computed for the test pairs as a new feature column.
# NOTE(review): `simil` is shared with the training-pair loop earlier in the notebook,
# so this cell must run right after the df_samples_t loop; otherwise `simil` may
# still hold the training similarities.
df_samples_t['cos_sim'] = simil

In [69]:
# Write the training pairs (including the new cos_sim feature) without the index.
df_samples.to_csv(path_or_buf='training.csv', index=False)

In [70]:
# Write the test pairs (including the new cos_sim feature) without the index.
df_samples_t.to_csv(path_or_buf='test.csv', index=False)