In [1]:
from __future__ import division
import random
import csv
import numpy as np
import pandas as pd
import networkx as nx

%load_ext autoreload
%autoreload 2
from attribute_features import *
from semantic_features import *
from topological_features import *
from classifiers import *


# Loading datasets

In [None]:
# Load the training set
Xfull = pd.read_csv("../data/training_set.txt", sep=" ", header=None)
# Shuffling
Xfull = Xfull.reindex(np.random.permutation(Xfull.index))
Yfull = Xfull[2]
Xfull.drop([2], axis = 1, inplace = True)
print 'Full training set dimension: ',Xfull.shape

n = 600000
Xtrain = Xfull[:n]
Xtrain.index = range(n)
Ytrain = Yfull[:n]
Ytrain.index = range(n)

m = int(n + round(n/10))
Xval = Xfull[n+1:]
Yval = Yfull[n+1:]
print 'Train/validation subsets:'
print "Train : ", Xtrain.shape , Ytrain.shape
print "Validation : ", Xval.shape, Yval.shape

In [None]:
# Preview the first rows of the training subset (the label column 2 has
# already been split off into Ytrain at this point).
Xtrain.head()

In [None]:
# Read the per-node metadata. The CSV has no header row, so the column names
# are supplied explicitly and the "Id" column becomes the frame's index.
node_columns = ["Id", "year", "title", "authors", "journal", "abstract"]
nodes_info = pd.read_csv(
    "../data/node_information.csv",
    sep=",",
    header=None,
    names=node_columns,
    index_col="Id",
    encoding='utf-8',
)
# Parsing authors and their affiliations (project helper; its return value,
# if any, is not captured here).
fix_auth_aff(nodes_info)

In [None]:
# Preview the node metadata after fix_auth_aff has been called on it.
nodes_info.head()

# Topological features:
1. Betweenness centrality
2. Nb. common neighbors
3. Jaccard coefficients
4. Difference of inlinks
5. Nb. of articles that cited the target
6. Adamic/Adar
7. Max cited author pagerank (authors citations graph)
8. Authors similarity (simrank on authors co-authorship graph)

In [None]:
# Building the citations graph...
# Only the training pairs and labels are passed in, not the validation subset.
G = build_graph(Xtrain, Ytrain, nodes_info)
# Building the authors graphs...
# GAC is built from the training pairs/labels plus node metadata;
# GAA is built from the node metadata alone.
GAC = build_authors_citation_graph(Xtrain, Ytrain, nodes_info)
GAA = build_authors_coauthorship_graph(nodes_info)


# Save graphs
# Each file is opened in a `with` block so the handle is flushed and closed as
# soon as the edge list is written (and also if writing raises), instead of
# being left open for the lifetime of the kernel.
with open("../data/G_authors_citations.edgelist", "wb") as fh1:
    nx.write_edgelist(GAC, fh1)
with open("../data/G_authors_coauthorship.edgelist", "wb") as fh2:
    nx.write_edgelist(GAA, fh2)
with open("../data/G_articles.edgelist", "wb") as fh3:
    nx.write_edgelist(G, fh3)

In [None]:
# Run the project's topologic_features step on the training subset, using the
# three graphs and the node metadata; Xtrain is rebound to its result.
# NOTE(review): because Xtrain is both input and output, re-running this cell
# feeds the already-processed frame back in -- presumably not idempotent; confirm.
Xtrain = topologic_features(Xtrain, G, GAC, GAA, nodes_info)
print 'Updated dimension: ',Xtrain.shape

In [None]:
# Preview the training frame as returned by topologic_features.
Xtrain.head()

# Attribute features:
1. Title overlap
2. Abstract overlap
3. Difference in publication year
4. Is self citation
5. Same journal
6. Common authors
7. Is same institution


In [None]:
# Run the project's attribute_features step on the training subset using the
# node metadata; Xtrain is rebound to its result.
# NOTE(review): Xtrain is both input and output, so re-running this cell is
# presumably not idempotent; confirm.
Xtrain = attribute_features(Xtrain, nodes_info)
print 'Updated dimension: ', Xtrain.shape

In [None]:
# Preview the training frame as returned by attribute_features.
Xtrain.head()

# Semantic features:
1. Tf-Idf abstract cosine
2. Word2vec abstract cosine

In [None]:
# Run the project's semantic_features step on the training subset using the
# node metadata; Xtrain is rebound to its result.
# NOTE(review): Xtrain is both input and output, so re-running this cell is
# presumably not idempotent; confirm.
Xtrain = semantic_features(Xtrain, nodes_info)
print 'Updated dimension: ', Xtrain.shape

In [None]:
# Preview the training frame as returned by semantic_features.
Xtrain.head()

In [None]:
# Save features for visualization:
# the training frame and its labels are written as CSV files under ../data
# (paths are relative to the notebook's working directory).
Xtrain.to_csv('../data/Xtrain.csv')
Ytrain.to_csv('../data/Ytrain.csv')

In [None]:
# Same features for the validation set:
# the three feature steps are applied in the same order as for training, each
# with train=False, and reuse the graphs (G, GAC, GAA) that were built from the
# training subset only.
# NOTE(review): the exact effect of train=False is defined in the project
# feature modules -- confirm there.
Xval = topologic_features(Xval, G, GAC, GAA, nodes_info,train=False)
Xval = attribute_features(Xval, nodes_info, train=False)
Xval = semantic_features(Xval, nodes_info, train=False)

In [None]:
# Preview the validation frame after the three feature steps.
Xval.head()

# Benchmark classifiers

## SVM classification

In [None]:
# Train the SVM benchmark on the training subset and evaluate against the
# validation subset; train_svm (project module) returns three values, which
# are unpacked into the model, a frame named X1, and the scaler.
svm_model, X1, scaler = train_svm(Xtrain, Ytrain, Xval, Yval)

In [None]:
# Preview X1, the second value returned by train_svm.
# NOTE(review): presumably the scaled/transformed training features -- confirm
# against train_svm in the classifiers module.
X1.head()

## Random forest

In [None]:
# Run the random forest benchmark (project helper) with the training and
# validation subsets. The return value is not stored; as the cell's last
# expression it is displayed if it is not None.
train_rf(Xtrain, Ytrain, Xval, Yval)

In [None]:
# Run the project's recursive feature elimination helper on the training data;
# the returned object exposes one rank per feature in `ranking_`.
rfe = recursive_feature_elimination(Xtrain,Ytrain)
# Pair each rank with its column label, starting at 'betweeness centrality'
# (assumed to be the first feature column -- the ranks must line up with the
# columns from there to the end, as the original row-based assignment required).
# A fresh Series is built instead of overwriting a row taken out of Xtrain:
# writing into such a row can alias the training data (or trigger
# SettingWithCopyWarning) and would force the ranks into the row's dtype.
first_feature_pos = Xtrain.columns.get_loc('betweeness centrality')
Results = pd.Series(rfe.ranking_, index=Xtrain.columns[first_feature_pos:])
print(Results)
# Selected (i.e., estimated best) features are assigned rank 1

In [None]:
# Run the train_extraT benchmark (project helper) with the training and
# validation subsets. The return value is not stored; as the cell's last
# expression it is displayed if it is not None.
# NOTE(review): presumably an extra-trees classifier, judging by the name -- confirm.
train_extraT(Xtrain, Ytrain, Xval, Yval)

## Grid search

In [None]:
# Run the project's SVM grid search with the training and validation subsets.
# NOTE: this rebinds svm_model, replacing the model returned by train_svm
# earlier in the notebook.
svm_model = gridsearch_svm(Xtrain,Ytrain, Xval, Yval)