In [45]:
import numpy as np
import os
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split

In [46]:
zip_file = keras.utils.get_file(
    fname="cora.tgz",
    origin="https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz",
    extract=True,
)
data_dir = os.path.join(os.path.dirname(zip_file), "cora")

In [47]:
citations = pd.read_csv(
    os.path.join(data_dir, "cora.cites"),
    sep="\t",
    header=None,
    names=["target", "source"],
)
print("Citations shape:", citations.shape)

Citations shape: (5429, 2)


In [49]:
column_names = ["paper_id"] + [f"term_{idx}" for idx in range(1433)] + ["subject"]
papers = pd.read_csv(
    os.path.join(data_dir, "cora.content"), sep="\t", header=None, names=column_names,
)
print("Papers shape:", papers.shape)
papers = papers.sort_values(by = 'paper_id')

Papers shape: (2708, 1435)


In [18]:
G = nx.DiGraph()
G.add_nodes_from(np.unique(papers['paper_id']))
for i in range(citations.shape[0]):
  G.add_edge(citations.iloc[i]['source'], citations.iloc[i]['target'])

In [19]:
features = pd.DataFrame(columns={"node"})
# degree_centrality = nx.degree_centrality(G)
# features['node'] = degree_centrality.keys()
features['node'] = np.union1d(np.unique(citations['source']), np.unique(citations['target']))
features.insert(features.shape[1],"degree_centrality", nx.degree_centrality(G).values())
features.insert(features.shape[1],"in_degree_centrality", nx.in_degree_centrality(G).values())
features.insert(features.shape[1],"out_degree_centrality", nx.out_degree_centrality(G).values())
features.insert(features.shape[1],"average_neighbor_degree", nx.average_neighbor_degree(G).values())
features.insert(features.shape[1],"eigenvector_centrality", nx.eigenvector_centrality(G).values())
features.insert(features.shape[1],"closeness_centrality", nx.closeness_centrality(G).values())
# features.insert(features.shape[1],"current_flow_closeness_centrality", nx.current_flow_closeness_centrality(G).values())
features.insert(features.shape[1],"betweenness_centrality", nx.betweenness_centrality(G).values())
# features.insert(features.shape[1],"current_flow_betweenness_centrality", nx.current_flow_betweenness_centrality(G).values())
features.insert(features.shape[1],"load_centrality", nx.load_centrality(G).values())
features.insert(features.shape[1],"harmonic_centrality", nx.harmonic_centrality(G).values())
# features.insert(features.shape[1],"second_order_centrality", nx.second_order_centrality(G).values())
features.insert(features.shape[1],"pagerank", nx.pagerank(G, alpha = 0.8).values())
features.insert(features.shape[1],"subject", papers['subject'])

In [None]:
# from networkx.algorithms import node_classification
# node_classification.harmonic_function(G)

In [35]:
# G1 = nx.Graph()
# for i in range(citations.shape[0]):
#   G1.add_edge(citations.iloc[i]['source'], citations.iloc[i]['target'])

In [20]:
class_values = sorted(papers["subject"].unique())
class_idx = {name: id for id, name in enumerate(class_values)}
paper_idx = {name: idx for idx, name in enumerate(sorted(papers["paper_id"].unique()))}

papers["paper_id"] = papers["paper_id"].apply(lambda name: paper_idx[name])
citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])
papers["subject"] = papers["subject"].apply(lambda value: class_idx[value])

In [98]:
X = features.to_numpy()[:, 1:11]
Y = features.to_numpy()[:, 11]
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

In [99]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(max_depth=10, random_state=0)
model.fit(x_train, y_train)

# make prediction
test_prediction = model.predict(x_test)

# calculate the accuracy
true_predicted = 0
for i in range(len(test_prediction)):
    if test_prediction[i] == list(y_test)[i]:
        true_predicted +=1

numberOfTestNodes = len(y_test)

print('Accuracy: ', true_predicted/numberOfTestNodes)

Accuracy:  0.3042505592841163


In [88]:
X_all = pd.concat([features, papers], axis=1).drop(['paper_id', 'subject', 'node'], axis=1).to_numpy()
Y_all = features.to_numpy()[:, 11]
x_train, x_test, y_train, y_test = train_test_split(X_all, Y_all, test_size=0.33, random_state=42)

In [94]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(max_depth=10, random_state=0)
model.fit(x_train, y_train)

# make prediction
test_prediction = model.predict(x_test)

# calculate the accuracy
true_predicted = 0
for i in range(len(test_prediction)):
    if test_prediction[i] == list(y_test)[i]:
        true_predicted +=1

numberOfTestNodes = len(y_test)

print('Accuracy: ', true_predicted/numberOfTestNodes)

Accuracy:  0.5894854586129754


In [103]:
x = papers.to_numpy()[:, 1:1432]
y = features.to_numpy()[:, 11]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [104]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(max_depth=10, random_state=0)
model.fit(x_train, y_train)

# make prediction
test_prediction = model.predict(x_test)

# calculate the accuracy
true_predicted = 0
for i in range(len(test_prediction)):
    if test_prediction[i] == list(y_test)[i]:
        true_predicted +=1

numberOfTestNodes = len(y_test)

print('Accuracy: ', true_predicted/numberOfTestNodes)

Accuracy:  0.31767337807606266
