In [1]:
import csv
import pandas as pd
import networkx as nx
import numpy as np
import random
import numpy as np
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn import preprocessing
import nltk

# Building a directed graph from the training set

In [31]:
def get_training_graph(csv_file, colmun_names=['source', 'target', 'connected'], column_names=None):
    """Build a directed graph from the positive pairs of a labelled pair file.

    The file is read as space-separated rows without a header. Rows whose
    ``connected`` value equals 1 become edges of the graph; the remaining
    rows are only counted in the printed summary.

    Parameters
    ----------
    csv_file : str or path-like
        Path of the space-separated file to read.
    colmun_names : sequence of str, optional
        Labels assigned to the file's columns. The misspelled parameter name
        is kept so that existing callers keep working.
    column_names : sequence of str, optional
        Correctly spelled alias for ``colmun_names``; takes precedence when
        it is given.

    Returns
    -------
    networkx.DiGraph
        Graph built from the rows where ``connected == 1``.

    Notes
    -----
    The labels must include 'source', 'target' and 'connected': the filter
    below and the defaults of ``nx.from_pandas_edgelist`` rely on them.
    """
    # Resolve the labels from the arguments only; never fall back to a
    # notebook-level global that happens to be called `column_names`.
    names = list(colmun_names if column_names is None else column_names)

    # Pandas dataframe
    df = pd.read_csv(csv_file, sep=' ', names=names)
    edges = df.loc[df['connected'] == 1]

    # Number of edges
    print(f'There are {len(edges)} edges and {len(df) - len(edges)} non edges')

    # networkx graph
    Graphtype = nx.DiGraph()
    G = nx.from_pandas_edgelist(edges, create_using=Graphtype)

    return G

In [27]:
# Build the directed graph from the labelled pairs stored in training_set.txt.
training_graph = get_training_graph('training_set.txt')

There are 335130 edges and 280382 non edges


In [28]:
training_graph

<networkx.classes.digraph.DiGraph at 0x2b17c457c70>

In [30]:
def compute_network_characteristics(graph):
    """Collect basic size and degree statistics of a graph.

    Parameters
    ----------
    graph : graph object
        Must provide ``number_of_nodes()``, ``number_of_edges()`` and
        ``degree()``, and be accepted by ``nx.density``.

    Returns
    -------
    dict
        Keys 'N', 'M', 'min_degree', 'max_degree', 'mean_degree',
        'median_degree' and 'density', in that order.
    """
    # graph.degree() yields (node, degree) pairs; only the degrees are needed.
    degree_values = [deg for _, deg in graph.degree()]

    return {
        'N': graph.number_of_nodes(),                 # number of nodes
        'M': graph.number_of_edges(),                 # number of edges
        'min_degree': np.min(degree_values),          # smallest node degree
        'max_degree': np.max(degree_values),          # largest node degree
        'mean_degree': np.mean(degree_values),        # average node degree
        'median_degree': np.median(degree_values),    # median node degree
        'density': nx.density(graph),                 # density as computed by networkx
    }

###################################################################
prop = compute_network_characteristics(graph=training_graph)
# Each output template is paired with the key of the statistic it reports;
# the lines are joined so that a single print emits the whole summary.
print("\n".join(
    template.format(prop[key])
    for template, key in (
        ("Number of nodes: {}", 'N'),
        ("Number of edges: {}", 'M'),
        ("Min. degree: {}", 'min_degree'),
        ("Max. degree: {}", 'max_degree'),
        ("Mean degree: {}", 'mean_degree'),
        ("Median degree: {}", 'median_degree'),
        ("Density: {}", 'density'),
    )
))

Number of nodes: 27684
Number of edges: 335130
Min. degree: 1
Max. degree: 2346
Mean degree: 24.211096662332032
Median degree: 14.0
Density: 0.0004372917794735403


# Looking at the data

In [2]:
# Parse the space-separated training file into one list of string fields per row.
# The splitting on ' ' is delegated to csv.reader instead of parsing each line as
# comma-separated and re-splitting its first field by hand. Blank lines, which
# csv.reader yields as empty lists, are skipped rather than raising IndexError.
with open("training_set.txt", "r", newline="") as f:
    reader = csv.reader(f, delimiter=" ")
    training_set = [row for row in reader if row]

In [3]:
# Each row's label is 1 if there is an edge between the two nodes, 0 otherwise
# print(training_set)

In [4]:
# Labels assigned to the columns of node_information.csv; passing them through
# `names=` makes pandas treat the file as having no header row.
column_names = ['id', 'year', 'title', 'authors', 'journal', 'abstract']
info = pd.read_csv('node_information.csv', sep=',', names=column_names)

In [5]:
info.head()

Unnamed: 0,id,year,title,authors,journal,abstract
0,1001,2000,compactification geometry and duality,Paul S. Aspinwall,,these are notes based on lectures given at tas...
1,1002,2000,domain walls and massive gauged supergravity p...,"M. Cvetic, H. Lu, C.N. Pope",Class.Quant.Grav.,we point out that massive gauged supergravity ...
2,1003,2000,comment on metric fluctuations in brane worlds,"Y.S. Myung, Gungwon Kang",,recently ivanov and volovich hep-th 9912242 cl...
3,1004,2000,moving mirrors and thermodynamic paradoxes,Adam D. Helfer,Phys.Rev.,quantum fields responding to moving mirrors ha...
4,1005,2000,bundles of chiral blocks and boundary conditio...,"J. Fuchs, C. Schweigert",,proceedings of lie iii clausthal july 1999 var...


In [6]:
# One-hot encode the `year` column: one indicator column per distinct value,
# appended after the remaining columns in place of the original `year` column.
year_columns = pd.get_dummies(info['year'], prefix='year')
info = pd.concat([info.drop(columns=['year']), year_columns], axis=1)

In [7]:
info.head()

Unnamed: 0,id,title,authors,journal,abstract,year_1992,year_1993,year_1994,year_1995,year_1996,year_1997,year_1998,year_1999,year_2000,year_2001,year_2002,year_2003
0,1001,compactification geometry and duality,Paul S. Aspinwall,,these are notes based on lectures given at tas...,0,0,0,0,0,0,0,0,1,0,0,0
1,1002,domain walls and massive gauged supergravity p...,"M. Cvetic, H. Lu, C.N. Pope",Class.Quant.Grav.,we point out that massive gauged supergravity ...,0,0,0,0,0,0,0,0,1,0,0,0
2,1003,comment on metric fluctuations in brane worlds,"Y.S. Myung, Gungwon Kang",,recently ivanov and volovich hep-th 9912242 cl...,0,0,0,0,0,0,0,0,1,0,0,0
3,1004,moving mirrors and thermodynamic paradoxes,Adam D. Helfer,Phys.Rev.,quantum fields responding to moving mirrors ha...,0,0,0,0,0,0,0,0,1,0,0,0
4,1005,bundles of chiral blocks and boundary conditio...,"J. Fuchs, C. Schweigert",,proceedings of lie iii clausthal july 1999 var...,0,0,0,0,0,0,0,0,1,0,0,0


In [8]:
# Fit a TF-IDF vectorizer (English stop words removed) on the abstracts and
# transform them in one step; the result has one row per entry of info['abstract'].
vect = TfidfVectorizer(stop_words="english")
abstract_vectorized = vect.fit_transform(info['abstract'])

In [10]:
# Materialise the TF-IDF result as a dense matrix (one column per vocabulary term).
abstract_vec = abstract_vectorized.todense()
# TfidfVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when the installed version has
# it and fall back to the old method otherwise.
# NOTE(review): the vocabulary terms become column labels, so a term such as
# 'title' or 'id' would duplicate an existing column once this frame is
# concatenated onto `info` — consider prefixing the labels.
vec_abs = pd.DataFrame(
    abstract_vec,
    columns=(getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names)(),
)

In [11]:
# Append the abstract TF-IDF columns to the node table; axis=1 aligns rows on the index.
info = pd.concat([info, vec_abs], axis=1)

In [13]:
# Remove the raw abstract text column, then preview the first rows.
info = info.drop(columns = ['abstract'])
info.head()

Unnamed: 0,id,title,authors,journal,year_1992,year_1993,year_1994,year_1995,year_1996,year_1997,...,zweibach,zweibein,zweibeine,zweibeins,zweibiens,zwiebach,zwirner,zynda,zyserman,zz
0,1001,compactification geometry and duality,Paul S. Aspinwall,,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1002,domain walls and massive gauged supergravity p...,"M. Cvetic, H. Lu, C.N. Pope",Class.Quant.Grav.,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1003,comment on metric fluctuations in brane worlds,"Y.S. Myung, Gungwon Kang",,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1004,moving mirrors and thermodynamic paradoxes,Adam D. Helfer,Phys.Rev.,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1005,bundles of chiral blocks and boundary conditio...,"J. Fuchs, C. Schweigert",,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Fit a separate TF-IDF vectorizer (English stop words removed) on the titles.
# Note that this rebinds `vect`, which previously held the abstract vectorizer.
vect = TfidfVectorizer(stop_words="english")
title_vectorized = vect.fit_transform(info['title'])
# Materialise the TF-IDF result as a dense matrix (one column per vocabulary term).
title_vec = title_vectorized.todense()
# TfidfVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when the installed version has
# it and fall back to the old method otherwise.
# NOTE(review): the title terms become column labels and can duplicate labels
# that already exist in `info` — consider prefixing them.
vec_title = pd.DataFrame(
    title_vec,
    columns=(getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names)(),
)

In [17]:
# Append the title TF-IDF columns to the node table; axis=1 aligns rows on the index.
info = pd.concat([info, vec_title], axis=1)

In [18]:
# Remove the raw title text column, then preview the first rows.
info = info.drop(columns = ['title'])
info.head()

Unnamed: 0,id,authors,journal,year_1992,year_1993,year_1994,year_1995,year_1996,year_1997,year_1998,...,zweibach,zweibein,zweibeine,zweibeins,zweibiens,zwiebach,zwirner,zynda,zyserman,zz
0,1001,Paul S. Aspinwall,,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1002,"M. Cvetic, H. Lu, C.N. Pope",Class.Quant.Grav.,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1003,"Y.S. Myung, Gungwon Kang",,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1004,Adam D. Helfer,Phys.Rev.,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1005,"J. Fuchs, C. Schweigert",,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27770 entries, 0 to 27769
Columns: 25056 entries, id to zz
dtypes: float64(25041), int64(1), object(2), uint8(12)
memory usage: 5.2+ GB
