In [19]:
%matplotlib inline
import os
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import *
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, auc
import pandas as pd
import csv
from sklearn import svm
from sklearn import ensemble
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn import preprocessing
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import random
from sklearn.metrics import accuracy_score, f1_score
from tqdm.notebook import tqdm

In [2]:
nltk.download('punkt') # for tokenization
nltk.download('stopwords')
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\remys\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\remys\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Info of the graph
def compute_network_characteristics(graph):
    prop = {}
    prop['N'] =  graph.number_of_nodes() # number of nodes
    prop['M'] = graph.number_of_edges() # number of edges
    degrees = [degree for node, degree in graph.degree()] # degree list
    prop['min_degree'] =  np.min(degrees) # minimum degree
    prop['max_degree'] =  np.max(degrees) # maximum degree
    prop['mean_degree'] = np.mean(degrees) # mean of node degrees
    prop['median_degree'] = np.median(degrees) # median of node degrees
    prop['density'] =  nx.density(graph) # density of the graph
    return prop

# Turning the training data into a graph

In [4]:
def get_training_graph(csv_file, column_names=['source', 'target', 'connected']):

    # Pandas dataframe
    df = pd.read_csv(csv_file, sep=' ', names=column_names)
    edges = df.loc[df['connected'] == 1]

    # Number of edges
    print(f'There are {len(edges)} edges and {len(df) - len(edges)} non edges')

    # networkx graph
    Graphtype = nx.DiGraph()
    G = nx.from_pandas_edgelist(edges, create_using=Graphtype)
    
    return G

In [5]:
training_graph = get_training_graph('training_set.txt')

There are 335130 edges and 280382 non edges


In [6]:
compute_network_characteristics(graph=training_graph)

{'N': 27684,
 'M': 335130,
 'min_degree': 1,
 'max_degree': 2346,
 'mean_degree': 24.211096662332032,
 'median_degree': 14.0,
 'density': 0.0004372917794735403}

## For now let's focus on a smaller graph

In [7]:
with open("training_set.txt", "r") as f:
    reader = csv.reader(f)
    training_set  = list(reader)

training_set = [element[0].split(" ") for element in training_set]

In [8]:
random.seed(10)
to_keep = random.sample(range(len(training_set)), k=int(round(len(training_set)*1.)))
training_set_reduced = [training_set[i] for i in to_keep]

In [9]:
# Pandas dataframe
df = pd.DataFrame(training_set_reduced, columns=['source', 'target', 'connected'])
edges = df.loc[df['connected'] == '1']

# Number of edges
print(f'There are {len(edges)} edges and {len(df) - len(edges)} non edges')

# networkx graph !!!!!!! The graph is directed
Graphtype = nx.DiGraph()
G = nx.from_pandas_edgelist(edges, create_using=Graphtype)

# Take the largest weakly conected component
nodes = max(nx.strongly_connected_components(G), key=len) 
G0 = G.subgraph(nodes)

# Make that graph undirected
#G0 = G0.to_undirected()
nx.is_strongly_connected(G0)

There are 335130 edges and 280382 non edges


True

In [10]:
compute_network_characteristics(G)

{'N': 27684,
 'M': 335130,
 'min_degree': 1,
 'max_degree': 2346,
 'mean_degree': 24.211096662332032,
 'median_degree': 14.0,
 'density': 0.0004372917794735403}

## Generate Samples

In [13]:
def real_labels(graph, samples):
    labels = []
    for edge in tqdm(samples):
        if edge in graph.edges():
            labels.append(1)
        else:
            labels.append(0)
    return labels

In [14]:
def generate_samples(graph, train_set_ratio):
    """
    Graph pre-processing step required to perform supervised link prediction
    Create training and test sets
    """
        
    # --- Step 0: The graph must be connected ---
    if nx.is_strongly_connected(graph) is not True:
        raise ValueError("The graph contains more than one connected component!")
       
    
    # --- Step 1: Generate positive edge samples for testing set ---
    residual_g = graph.copy()
    test_pos_samples = []
      
    # Store the shuffled list of current edges of the graph
    edges = list(residual_g.edges())
    np.random.shuffle(edges)
    
    # Define number of positive test samples desired
    test_set_size = int((1.0 - train_set_ratio) * graph.number_of_edges())
    train_set_size = graph.number_of_edges() - test_set_size
    num_of_pos_test_samples = 0
    
    # Remove random edges from the graph, leaving it connected
    # Fill in the blanks
    for edge in tqdm(edges):
        
        # Remove the edge
        residual_g.remove_edge(edge[0], edge[1])
        
        # Add the removed edge to the positive sample list if the network is still connected
        if nx.is_strongly_connected(residual_g):
            num_of_pos_test_samples += 1
            test_pos_samples.append(edge)
        # Otherwise, re-add the edge to the network
        else: 
            residual_g.add_edge(edge[0], edge[1])
        
        # If we have collected enough number of edges for testing set, we can terminate the loop
        if num_of_pos_test_samples == test_set_size:
            break
    
    # Check if we have the desired number of positive samples for testing set 
    if num_of_pos_test_samples != test_set_size:
        raise ValueError("Enough positive edge samples could not be found!")

        
    # --- Step 2: Generate positive edge samples for training set ---
    # The remaining edges are simply considered for positive samples of the training set
    train_pos_samples = list(residual_g.edges())
        
        
    # --- Step 3: Generate the negative samples for testing and training sets ---
    # Fill in the blanks
    non_edges = list(nx.non_edges(graph))
    random.seed(10)
    np.random.shuffle(non_edges)
    
    train_neg_samples = non_edges[:train_set_size] 
    test_neg_samples = non_edges[train_set_size:train_set_size + test_set_size]

    
    # --- Step 4: Combine sample lists and create corresponding labels ---
    # For training set
    train_samples = train_pos_samples + train_neg_samples
    train_labels = [1 for _ in train_pos_samples] + [0 for _ in train_neg_samples]
    # For testing set
    test_samples = test_pos_samples + test_neg_samples
    test_labels = [1 for _ in test_pos_samples] + [0 for _ in test_neg_samples]
    
    return residual_g, train_samples, train_labels, test_samples, test_labels


In [15]:
residual_g, train_samples, train_labels, valid_samples, valid_labels = generate_samples(G0, train_set_ratio=0.8)

  0%|          | 0/107612 [00:00<?, ?it/s]

('9906191', '9903178')
('8091', '5087')
('9908115', '9908040')
('101085', '9912204')
('106231', '9191')
('207107', '111160')
('9802173', '9710178')
('9911160', '9906201')
('6112', '9803031')
('104043', '9910053')
('207117', '3071')
('9811042', '9806021')
('9804058', '9802150')
('11033', '10028')
('9705055', '9702187')
('108085', '106103')
('11073', '9908130')
('5067', '9905212')
('105090', '9707093')
('2084', '1215')
('108172', '105070')
('9909218', '9902111')
('9056', '9905012')
('8127', '9911116')
('9907086', '9711131')
('2205', '9910238')
('9807213', '9712166')
('9710030', '9706130')
('106014', '4148')
('9707217', '9702136')
('12180', '11008')
('10241', '9805086')
('9709113', '9706097')
('102063', '2211')
('111030', '104073')
('101192', '10151')
('3024', '9910263')
('9901144', '9811257')
('4085', '3145')
('4056', '9905012')


KeyboardInterrupt: 

## Get the feature Vector

In [14]:
print("degree centrality")
deg_centrality = nx.degree_centrality(G0)
print("done!")

# katz_cent = nx.katz_centrality(G)
  
# print('betweeness_centrality')
# betweeness_centrality = nx.betweenness_centrality(G0)
# print('done!')

degree centrality
done!


In [15]:
def feature_extractor(graph, samples, deg_centrality):
    """
    Creates a feature vector for each edge of the graph contained in samples 
    """
    feature_vector = []
    number_nodes_out = 0

    for edge in tqdm(samples):
        source_node, target_node = edge[0], edge[1]

        # Degree Centrality
        if (source_node not in list(deg_centrality.keys())) or (target_node not in list(deg_centrality.keys())):
            feature_vector.append(np.array([0, 0, 0, 0, 0, 0]))
            number_nodes_out += 1

        else:

            source_degree_centrality = deg_centrality[source_node]
            target_degree_centrality = deg_centrality[target_node]
#             source_katz_cent = katz_cent[source_node]
#             target_katz_cent = katz_cent[target_node]
            
            # # Betweeness centrality measure 
            #diff_bt = betweeness_centrality[target_node] - betweeness_centrality[source_node]

            # Preferential Attachement 
            pref_attach = list(nx.preferential_attachment(graph, [(source_node, target_node)]))[0][2]

            # AdamicAdar
            aai = list(nx.adamic_adar_index(graph, [(source_node, target_node)]))[0][2]

            # Jaccard
            jacard_coeff = list(nx.jaccard_coefficient(graph, [(source_node, target_node)]))[0][2]
            # Ressource allocation index
            res_all = list(nx.resource_allocation_index(graph, [(source_node, target_node)]))[0][2]
            
            # Create edge feature vector with all metric computed above
            feature_vector.append(np.array([source_degree_centrality, target_degree_centrality, pref_attach, aai, jacard_coeff, res_all]) ) 
    print(number_nodes_out)
        
    return np.array(feature_vector)

In [16]:
train_features = feature_extractor(G0.to_undirected(), train_samples, deg_centrality)
valid_features = feature_extractor(G0.to_undirected(), valid_samples, deg_centrality)

  0%|          | 0/172180 [00:00<?, ?it/s]

0


  0%|          | 0/43044 [00:00<?, ?it/s]

0


In [17]:
feat_train = pd.DataFrame(train_features, columns=['source_degree_centrality', 'target_degree_centrality', 'pref_attach', 'aai', 'jacard_coeff', 'res_all'])
feat_valid = pd.DataFrame(valid_features, columns=['source_degree_centrality', 'target_degree_centrality', 'pref_attach', 'aai', 'jacard_coeff', 'res_all'])

In [18]:
feat_train.head()

Unnamed: 0,source_degree_centrality,target_degree_centrality,pref_attach,aai,jacard_coeff,res_all
0,0.001245,0.004012,261.0,0.0,0.0,0.0
1,0.001245,0.002767,180.0,0.248425,0.035714,0.017857
2,0.001245,0.007748,504.0,0.648466,0.031746,0.091667
3,0.001245,0.00083,54.0,0.0,0.0,0.0
4,0.005949,0.003735,1161.0,0.4287,0.029412,0.020333


## Prediction

In [19]:
# --- Build the model and train it ---

#Scale the datas
scaler = preprocessing.MinMaxScaler()
scaled_train_features = scaler.fit_transform(train_features)
scaled_valid_features = scaler.transform(valid_features)

#Predict
clf = LogisticRegression()
clf.fit(scaled_train_features, train_labels)

train_preds = clf.predict_proba(scaled_train_features)[:, 1]
valid_preds = clf.predict_proba(scaled_valid_features)[:, 1]
labels_pred = clf.predict(scaled_valid_features)

print(f'Accuracy: {accuracy_score(valid_labels, labels_pred)}')
print(f'F1 score: {f1_score(valid_labels, labels_pred)}')

# --- Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from predictions ---
# Fill in the blanks
fpr, tpr, _ = roc_curve(valid_labels, valid_preds)
roc_auc = auc(fpr, tpr)
print(roc_auc)

Accuracy: 0.9283523836074714
F1 score: 0.9265924021708083
0.9761766909266953


In [20]:
clf = svm.LinearSVC(max_iter=50000)
clf.fit(scaled_train_features, train_labels)
labels_pred = clf.predict(scaled_valid_features)
print(f'Accuracy: {accuracy_score(valid_labels, labels_pred)}')
print(f'F1 score: {f1_score(valid_labels, labels_pred)}')

Accuracy: 0.9297927701886441
F1 score: 0.9274917222515475


In [21]:
column_names = ['id', 'year', 'title', 'authors', 'journal', 'abstract']
info = pd.read_csv('node_information.csv', sep=',', names=column_names)
info.head()

Unnamed: 0,id,year,title,authors,journal,abstract
0,1001,2000,compactification geometry and duality,Paul S. Aspinwall,,these are notes based on lectures given at tas...
1,1002,2000,domain walls and massive gauged supergravity p...,"M. Cvetic, H. Lu, C.N. Pope",Class.Quant.Grav.,we point out that massive gauged supergravity ...
2,1003,2000,comment on metric fluctuations in brane worlds,"Y.S. Myung, Gungwon Kang",,recently ivanov and volovich hep-th 9912242 cl...
3,1004,2000,moving mirrors and thermodynamic paradoxes,Adam D. Helfer,Phys.Rev.,quantum fields responding to moving mirrors ha...
4,1005,2000,bundles of chiral blocks and boundary conditio...,"J. Fuchs, C. Schweigert",,proceedings of lie iii clausthal july 1999 var...


In [22]:
with open("node_information.csv", "r") as f:
    reader = csv.reader(f)
    node_info  = list(reader)

IDs = [element[0] for element in node_info]

In [23]:
vect = TfidfVectorizer(stop_words="english")
abstract_vectorized = vect.fit_transform(info['abstract'])

In [24]:
def preprocessing_info(sample, abstract_vectorized):
    # number of overlapping words in title
    overlap_title = []

    # temporal distance between the papers
    temp_diff = []

    # number of common authors
    comm_auth = []

    # Cosine sim between abstracts
    cosine_sim = []

    dense_matrix = abstract_vectorized.todense()

    for i in tqdm(range(len(sample))):
        source = sample[i][0]
        target = sample[i][1]
        
        index_source = IDs.index(source)
        index_target = IDs.index(target)
        
        source_info = [element for element in node_info if element[0]==source][0]
        target_info = [element for element in node_info if element[0]==target][0]
        
        # convert to lowercase and tokenize
        source_title = source_info[2].lower().split(" ")
        # remove stopwords
        source_title = [token for token in source_title if token not in stpwds]
        source_title = [stemmer.stem(token) for token in source_title]
        
        target_title = target_info[2].lower().split(" ")
        target_title = [token for token in target_title if token not in stpwds]
        target_title = [stemmer.stem(token) for token in target_title]
        
        source_auth = source_info[3].split(",")
        target_auth = target_info[3].split(",")
        
        overlap_title.append(len(set(source_title).intersection(set(target_title))))
        temp_diff.append(int(source_info[1]) - int(target_info[1]))
        comm_auth.append(len(set(source_auth).intersection(set(target_auth))))

        v1 = dense_matrix[index_source,:]
        v2 = dense_matrix[index_target,:]

        sim = cosine_similarity(v1, v2)
        cosine_sim.append(sim[0][0])

    return overlap_title, temp_diff, comm_auth, cosine_sim

In [25]:
overlap_title_train, temp_diff_train, comm_auth_train, cosine_sim_train = preprocessing_info(train_samples, abstract_vectorized)

  0%|          | 0/172180 [00:00<?, ?it/s]

In [26]:
overlap_title_valid, temp_diff_valid, comm_auth_valid, cosine_sim_valid = preprocessing_info(valid_samples, abstract_vectorized)

  0%|          | 0/43044 [00:00<?, ?it/s]

In [27]:
def get_training_features(overlap_title, temp_diff, comm_auth, sim):
    training_features = np.array([overlap_title, temp_diff, comm_auth, sim]).T
    df = pd.DataFrame(training_features, columns=['overl_title', 'temp_diff', 'comm_author', 'sim'])
    return training_features, df

In [28]:
training_add_feat = get_training_features(overlap_title_train, temp_diff_train, comm_auth_train, cosine_sim_train)[1]
valid_add_feat = get_training_features(overlap_title_valid, temp_diff_valid, comm_auth_valid, cosine_sim_valid)[1]

In [29]:
all_train_feat = pd.concat([feat_train, training_add_feat], axis=1)
all_valid_feat = pd.concat([feat_valid, valid_add_feat], axis=1)

In [30]:
all_train_feat.head()

Unnamed: 0,source_degree_centrality,target_degree_centrality,pref_attach,aai,jacard_coeff,res_all,overl_title,temp_diff,comm_author,sim
0,0.001245,0.004012,261.0,0.0,0.0,0.0,1.0,4.0,0.0,0.015978
1,0.001245,0.002767,180.0,0.248425,0.035714,0.017857,0.0,2.0,0.0,0.005286
2,0.001245,0.007748,504.0,0.648466,0.031746,0.091667,0.0,3.0,0.0,0.023356
3,0.001245,0.00083,54.0,0.0,0.0,0.0,1.0,3.0,0.0,0.026137
4,0.005949,0.003735,1161.0,0.4287,0.029412,0.020333,1.0,1.0,0.0,0.07808


In [17]:
# --- Build the model and train it ---
# Fill in the blanks
#Scale the datas
scaler = preprocessing.StandardScaler()
scaled_all_train_feat = scaler.fit_transform(all_train_feat)
scaled_all_valid_feat = scaler.transform(all_valid_feat)

clf = LogisticRegression()
clf.fit(scaled_all_train_feat, train_labels)

train_preds = clf.predict_proba(scaled_all_train_feat)[:, 1]
valid_preds = clf.predict_proba(scaled_all_valid_feat)[:, 1]
labels_pred = clf.predict(scaled_all_valid_feat)

print(f'Accuracy: {accuracy_score(valid_labels, labels_pred)}')
print(f'F1 score: {f1_score(valid_labels, labels_pred)}')

# --- Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from predictions ---
# Fill in the blanks
fpr, tpr, _ = roc_curve(valid_labels, valid_preds)
roc_auc = auc(fpr, tpr)
print(roc_auc)

Accuracy: 0.9525369389461946
F1 score: 0.9515704634348702
0.9882795289093774


In [20]:
scaler = preprocessing.StandardScaler()
scaled_all_train_feat = scaler.fit_transform(all_train_feat)
scaled_all_valid_feat = scaler.transform(all_valid_feat)

clf = ensemble.RandomForestClassifier()
clf.fit(scaled_all_train_feat, train_labels)

train_preds = clf.predict_proba(scaled_all_train_feat)[:, 1]
valid_preds = clf.predict_proba(scaled_all_valid_feat)[:, 1]
labels_pred = clf.predict(scaled_all_valid_feat)

print(f'Accuracy: {accuracy_score(valid_labels, labels_pred)}')
print(f'F1 score: {f1_score(valid_labels, labels_pred)}')

Accuracy: 0.9653377938853266
F1 score: 0.9652877948908845


In [21]:
scaler = preprocessing.StandardScaler()
scaled_all_train_feat = scaler.fit_transform(all_train_feat)
scaled_all_valid_feat = scaler.transform(all_valid_feat)

clf = svm.LinearSVC(max_iter=50000)
clf.fit(scaled_all_train_feat, train_labels)

labels_pred = clf.predict(scaled_all_valid_feat)

print(f'Accuracy: {accuracy_score(valid_labels, labels_pred)}')
print(f'F1 score: {f1_score(valid_labels, labels_pred)}')

AttributeError: 'LinearSVC' object has no attribute 'predict_proba'

# Getting the features of the test set

In [32]:
with open("testing_set.txt", "r") as f:
    reader = csv.reader(f)
    testing_set  = list(reader)

testing_set = [element[0].split(" ") for element in testing_set]

In [33]:
overlap_title_test = []
temp_diff_test = []
comm_auth_test = []
cosine_sim_test = []
dense_matrix = abstract_vectorized.todense()
   
counter = 0
for i in tqdm(range(len(testing_set))):

    source = testing_set[i][0]
    target = testing_set[i][1]
    
    index_source = IDs.index(source)
    index_target = IDs.index(target)
    
    source_info = [element for element in node_info if element[0]==source][0]
    target_info = [element for element in node_info if element[0]==target][0]
    
    source_title = source_info[2].lower().split(" ")
    source_title = [token for token in source_title if token not in stpwds]
    source_title = [stemmer.stem(token) for token in source_title]
    
    target_title = target_info[2].lower().split(" ")
    target_title = [token for token in target_title if token not in stpwds]
    target_title = [stemmer.stem(token) for token in target_title]
    
    source_auth = source_info[3].split(",")
    target_auth = target_info[3].split(",")
    
    overlap_title_test.append(len(set(source_title).intersection(set(target_title))))
    temp_diff_test.append(int(source_info[1]) - int(target_info[1]))
    comm_auth_test.append(len(set(source_auth).intersection(set(target_auth))))

    v1 = dense_matrix[index_source,:]
    v2 = dense_matrix[index_target,:]

    sim = cosine_similarity(v1, v2)
    cosine_sim_test.append(sim[0][0])

  0%|          | 0/32648 [00:00<?, ?it/s]

In [34]:
testing_features = np.array([overlap_title_test,temp_diff_test,comm_auth_test, cosine_sim_test]).T

In [35]:
test_feat = pd.DataFrame(testing_features, columns=['overl_title', 'temp_diff', 'comm_author', 'sim'])

In [36]:
test_graph_feat = feature_extractor(G0.to_undirected(), testing_set, deg_centrality)

  0%|          | 0/32648 [00:00<?, ?it/s]

26129


In [37]:
test_graph_feat = pd.DataFrame(test_graph_feat, columns=['source_degree_centrality', 'target_degree_centrality', 'pref_attach', 'aai', 'jacard_coeff', 'res_all'])

In [38]:
all_test_feat = pd.concat([test_graph_feat, test_feat], axis=1)

In [39]:
all_test_feat.head()

Unnamed: 0,source_degree_centrality,target_degree_centrality,pref_attach,aai,jacard_coeff,res_all,overl_title,temp_diff,comm_author,sim
0,0.007056,0.001522,561.0,0.0,0.0,0.0,0.0,0.0,0.0,0.078254
1,0.026702,0.005534,7680.0,5.632175,0.115385,0.37509,2.0,1.0,0.0,0.174073
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.138734
3,0.005396,0.006779,1911.0,5.239889,0.313433,0.391564,1.0,0.0,0.0,0.11987
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.304493


## Save the train, valid and test all feature dataframe

In [44]:
#Save the train
all_train_feat.to_csv('save/traindf.csv',index=False)
pd.Series(train_labels).to_csv('save/trainlabels.csv',index=False)

#Save the valid
all_valid_feat.to_csv('save/validdf.csv',index=False)
pd.Series(valid_labels).to_csv('save/validlabels.csv',index=False)

#Save the test
all_test_feat.to_csv('save/testdf.csv',index=False)

## Load the dataframes

In [37]:
all_train_feat = pd.read_csv('save/traindf.csv')
train_labels = pd.read_csv('save/trainlabels.csv')

all_valid_feat = pd.read_csv('save/validdf.csv')
valid_labels = pd.read_csv('save/validlabels.csv')

all_test_feat = pd.read_csv('save/testdf.csv')


#Convert labels from pandas Series to list
train_labels = list(train_labels.values.flatten())
valid_labels = list(valid_labels.values.flatten())


In [38]:
#all_train_feat = pd.concat([all_train_feat, all_valid_feat], ignore_index=True, sort=False)
#train_labels = train_labels + valid_labels


## Prediction

In [39]:
#Scale the datas
scaler = preprocessing.StandardScaler()
scaled_all_train_feat = scaler.fit_transform(all_train_feat)
scaled_all_test_feat = scaler.transform(all_test_feat)

# initialize basic SVM or LR
#classifier = svm.LinearSVC(max_iter=50000)
#classifier = LogisticRegression(max_iter=50000)
clf = ensemble.RandomForestClassifier()

# train
classifier.fit(all_train_feat, train_labels)

LogisticRegression(max_iter=50000)

In [40]:
predictions_SVM = list(classifier.predict(scaled_all_test_feat))
id = list(range(len(predictions_SVM)))

## Submission

In [42]:
submission_df = pd.DataFrame(zip(id,predictions_SVM),columns=['id','category'])
submission_df.to_csv('testrf.csv',index=False)