# Setup

In [2]:
import random
import numpy as np
import igraph
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn import preprocessing
import nltk
import csv
import networkx as nx
import tqdm

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(root, file_name))

In [3]:
# Fetch the NLTK resources used below (tokenizer models and stop-word lists).
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)
stemmer = nltk.stem.PorterStemmer()
stpwds = set(nltk.corpus.stopwords.words("english"))

# Every CSV row holds a single space-separated field; split it into its parts
# (the first two parts are later used as the source and target paper ids).
with open("testing_set.txt", "r") as f:
    testing_set = [row[0].split(" ") for row in csv.reader(f)]

[nltk_data] Downloading package punkt to /home/geraud/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/geraud/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Please choose if you want to generate features or use the already generated ones (it takes some hours to recreate them).

You can also choose the model you want to test.

In [4]:
# Recreation of features
# Regeneration switches: False means "load the cached version from disk".
recreate_graphs = False
recreate_training_features = False
recreate_testing_features = False
# Persist regenerated features to disk.
save_recreated_features = False
# Classifier selection; any value outside the allowed list falls back to XGBoost.
allowed_algorithms = ['SVM', 'RandomForest', 'XGBoost']
algorithm = 'XGBoost'
algorithm = algorithm if algorithm in allowed_algorithms else 'XGBoost'

# Creating training features

In [5]:
# Every CSV row holds a single space-separated field; split it into its parts
# (source id, target id, label as used further down).
with open("training_set.txt", "r") as f:
    training_set = [row[0].split(" ") for row in csv.reader(f)]

# One comma-separated row of metadata per paper.
with open("node_information.csv", "r", newline='\n') as f:
    node_info = list(csv.reader(f, delimiter=','))

In [6]:
# Paper identifiers, in node_info row order.
IDs = [row[0] for row in node_info]
# TF-IDF over column 5 of every node_info row; row i of the resulting matrix
# corresponds to row i of node_info.
vectorizer = TfidfVectorizer(stop_words="english")
corpus = [row[5] for row in node_info]
features_TFIDF = vectorizer.fit_transform(corpus)

## Creating graph features between publications

In [7]:
if recreate_graphs:
    # Undirected graph with one node per paper and one edge per positive
    # (label != '0') training pair.
    G_article = nx.Graph()
    article_list = IDs
    G_article.add_nodes_from(article_list)
    for edge in tqdm.tqdm(training_set):
        if edge[2] == '0':
            continue
        # Keep the ids as strings: the nodes above are the string ids from
        # node_info and the feature extractor looks nodes up by the string ids
        # of each sample. Casting to int here would create a second, disjoint
        # set of nodes and leave every looked-up node isolated.
        G_article.add_edge(edge[0], edge[1])

In [8]:
if not recreate_graphs:
    # Reuse the article graph stored on disk instead of rebuilding it.
    G_article = nx.read_graphml("G_article.graphml")

In [9]:
def feature_extractor_article(graph, samples):
    """Compute link-prediction features on `graph` for every sample.

    Each sample is indexed as (source node, target node, ...). Returns a list
    of five parallel lists with one entry per sample: source degree
    centrality, target degree centrality, preferential attachment,
    Adamic-Adar index and Jaccard coefficient.
    """
    centrality = nx.degree_centrality(graph)
    src_centrality, tgt_centrality, pref_scores, aa_scores, jaccard_scores = [], [], [], [], []
    for sample in tqdm.tqdm(samples):
        src, tgt = sample[0], sample[1]
        pair = [(src, tgt)]
        src_value = centrality[src]
        tgt_value = centrality[tgt]
        pref_value = next(iter(nx.preferential_attachment(graph, pair)))[2]
        # A pair made of the same node twice gets a fixed score of 1.
        aa_value = 1 if src == tgt else next(iter(nx.adamic_adar_index(graph, pair)))[2]
        jaccard_value = next(iter(nx.jaccard_coefficient(graph, pair)))[2]
        src_centrality.append(src_value)
        tgt_centrality.append(tgt_value)
        pref_scores.append(pref_value)
        aa_scores.append(aa_value)
        jaccard_scores.append(jaccard_value)
    return [src_centrality, tgt_centrality, pref_scores, aa_scores, jaccard_scores]

if recreate_training_features:
    # Article-graph features for every training pair, unpacked per feature.
    graph_features_article = feature_extractor_article(G_article, training_set)
    (source_degree_centrality_article,
     target_degree_centrality_article,
     pref_attach_article,
     aai_article,
     jacard_coeff_article) = graph_features_article
    if save_recreated_features:
        article_file_names = (
            'source_degree_centrality_article',
            'target_degree_centrality_article',
            'pref_attach_article',
            'aai_article',
            'jacard_coeff_article',
        )
        for file_name, values in zip(article_file_names, graph_features_article):
            np.save(file_name, np.array(values))

In [10]:
if recreate_graphs:
    # Undirected graph with one node per distinct journal (column 4 of
    # node_info); two journals are linked when a positive training pair
    # connects papers published in them.
    G_journal = nx.Graph()
    journal_list = list(set([element[4] for element in node_info]))
    G_journal.add_nodes_from(journal_list)

    # Build the paper-id -> journal lookup once instead of rescanning IDs for
    # every edge. Later duplicates overwrite earlier ones, matching the
    # "last match wins" behaviour of the former linear scan.
    journal_by_id = {int(node_id): info[4] for node_id, info in zip(IDs, node_info)}
    # The former scan defaulted to row 0 when an id was not found; keep that.
    fallback_journal = node_info[0][4] if node_info else None

    for edge in tqdm.tqdm(training_set):
        if edge[2] == '0':
            continue
        G_journal.add_edge(
            journal_by_id.get(int(edge[0]), fallback_journal),
            journal_by_id.get(int(edge[1]), fallback_journal),
        )

In [11]:
if not recreate_graphs:
    # Reuse the journal graph stored on disk instead of rebuilding it.
    G_journal = nx.read_graphml("G_journal.graphml")

In [12]:
def feature_extractor_journal(graph, samples):
    """Compute link-prediction features between the journals of each sample.

    Each sample is indexed as (source paper id, target paper id, ...). Both
    papers are mapped to their journal (column 4 of the module-level
    `node_info`) and the features are computed between those two journal
    nodes of `graph`.

    Returns a list of five parallel lists with one entry per sample: source
    degree centrality, target degree centrality, preferential attachment,
    Adamic-Adar index and Jaccard coefficient.
    """
    # Build the paper-id -> journal lookup once; the former per-sample scan of
    # IDs made the whole extraction O(len(samples) * len(IDs)).
    journal_by_id = {int(node_id): info[4] for node_id, info in zip(IDs, node_info)}
    # The former scan defaulted to row 0 when an id was not found; keep that.
    fallback_journal = node_info[0][4] if node_info else None

    feature_vector = [[], [], [], [], []]
    deg_centrality = nx.degree_centrality(graph)
    for edge in tqdm.tqdm(samples):
        source_node = journal_by_id.get(int(edge[0]), fallback_journal)
        target_node = journal_by_id.get(int(edge[1]), fallback_journal)
        pair = [(source_node, target_node)]
        source_degree_centrality = deg_centrality[source_node]
        target_degree_centrality = deg_centrality[target_node]
        pref_attach = list(nx.preferential_attachment(graph, pair))[0][2]
        if source_node == target_node:
            # Both papers appeared in the same journal: fixed score of 1.
            aai = 1
        else:
            aai = list(nx.adamic_adar_index(graph, pair))[0][2]
        jacard_coeff = list(nx.jaccard_coefficient(graph, pair))[0][2]
        feature_vector[0].append(source_degree_centrality)
        feature_vector[1].append(target_degree_centrality)
        feature_vector[2].append(pref_attach)
        feature_vector[3].append(aai)
        feature_vector[4].append(jacard_coeff)
    return feature_vector

if recreate_training_features:
    # Journal-graph features for every training pair, unpacked per feature.
    graph_features_journal = feature_extractor_journal(G_journal, training_set)
    (source_degree_centrality_journal,
     target_degree_centrality_journal,
     pref_attach_journal,
     aai_journal,
     jacard_coeff_journal) = graph_features_journal
    if save_recreated_features:
        journal_file_names = (
            'source_degree_centrality_journal',
            'target_degree_centrality_journal',
            'pref_attach_journal',
            'aai_journal',
            'jacard_coeff_journal',
        )
        for file_name, values in zip(journal_file_names, graph_features_journal):
            np.save(file_name, np.array(values))

In [13]:
if recreate_graphs:
    # Undirected graph with one node per distinct author string (column 3 of
    # node_info split on ','); every author of one paper is linked to every
    # author of the other paper for each positive training pair.
    G_authors = nx.Graph()
    authors_list = list({
        author
        for element in node_info
        for author in element[3].split(sep=',')
    })
    G_authors.add_nodes_from(authors_list)

    # Build the paper-id -> author-list lookup once instead of rescanning IDs
    # for every edge. Later duplicates overwrite earlier ones, matching the
    # "last match wins" behaviour of the former linear scan.
    authors_by_id = {
        int(node_id): info[3].split(sep=',')
        for node_id, info in zip(IDs, node_info)
    }

    for edge in tqdm.tqdm(training_set):
        if edge[2] == '0':
            continue
        # Unknown ids contribute no authors, as before.
        authors1 = authors_by_id.get(int(edge[0]), [])
        authors2 = authors_by_id.get(int(edge[1]), [])
        G_authors.add_edges_from(
            (author1, author2) for author1 in authors1 for author2 in authors2
        )

In [14]:
if not recreate_graphs:
    # Reuse the author graph stored on disk instead of rebuilding it.
    G_authors = nx.read_graphml("G_authors.graphml")

In [14]:
def _author_pair_features(graph, samples, reduce):
    """Shared implementation of the author-graph feature extractors.

    For every sample (source paper id, target paper id, ...), the author
    strings of both papers (column 3 of the module-level `node_info`, split
    on ',') are looked up and five link-prediction scores are computed on
    `graph`, each collapsed to one number with `reduce` (e.g. np.amax or
    np.mean):

    0. degree centrality over the source authors
    1. degree centrality over the target authors
    2. preferential attachment over all (source, target) author pairs
    3. Adamic-Adar index over all pairs (a pair made of the same author
       twice scores 1)
    4. Jaccard coefficient over all pairs

    Returns a list of five parallel lists with one entry per sample.
    Raises KeyError when a sample refers to a paper id missing from `IDs`.
    """
    # Build the paper-id -> author-string lookup once; the former per-sample
    # scan of IDs made the extraction O(len(samples) * len(IDs)).
    authors_by_id = {int(node_id): info[3] for node_id, info in zip(IDs, node_info)}

    feature_vector = [[], [], [], [], []]
    deg_centrality = nx.degree_centrality(graph)
    for edge in tqdm.tqdm(samples):
        source_authors = authors_by_id[int(edge[0])].split(sep=',')
        target_authors = authors_by_id[int(edge[1])].split(sep=',')

        source_centralities = [deg_centrality[author] for author in source_authors]
        target_centralities = [deg_centrality[author] for author in target_authors]

        pref_attachs = [
            [list(nx.preferential_attachment(graph, [(source_author, target_author)]))[0][2]
             for source_author in source_authors]
            for target_author in target_authors
        ]

        aais = []
        for source_author in source_authors:
            for target_author in target_authors:
                if target_author == source_author:
                    aais.append(1)
                else:
                    aais.append(
                        list(nx.adamic_adar_index(graph, [(source_author, target_author)]))[0][2]
                    )

        jacard_coeffs = [
            [list(nx.jaccard_coefficient(graph, [(source_author, target_author)]))[0][2]
             for source_author in source_authors]
            for target_author in target_authors
        ]

        feature_vector[0].append(reduce(source_centralities))
        feature_vector[1].append(reduce(target_centralities))
        feature_vector[2].append(reduce(pref_attachs))
        feature_vector[3].append(reduce(aais))
        feature_vector[4].append(reduce(jacard_coeffs))
    return feature_vector


def feature_extractor_author(graph, samples):
    """Author-graph features per sample, keeping the maximum over author pairs.

    See `_author_pair_features` for the layout of the returned five lists.
    """
    return _author_pair_features(graph, samples, np.amax)


def feature_extractor_author_mean(graph, samples):
    """Author-graph features per sample, keeping the mean over author pairs.

    See `_author_pair_features` for the layout of the returned five lists.
    """
    return _author_pair_features(graph, samples, np.mean)

if recreate_training_features:
    # Max-aggregated author features for every training pair.
    graph_features_author = feature_extractor_author(G_authors, training_set)
    (source_degree_centrality_author,
     target_degree_centrality_author,
     pref_attach_author,
     aai_author,
     jacard_coeff_author) = graph_features_author
    # Mean-aggregated author features; note that the same variable is reused
    # for the raw result, so it ends up holding the mean version.
    graph_features_author = feature_extractor_author_mean(G_authors, training_set)
    (source_degree_centrality_author_mean,
     target_degree_centrality_author_mean,
     pref_attach_author_mean,
     aai_author_mean,
     jacard_coeff_author_mean) = graph_features_author
    if save_recreated_features:
        author_arrays_by_file = (
            ('source_degree_centrality_author', source_degree_centrality_author),
            ('target_degree_centrality_author', target_degree_centrality_author),
            ('pref_attach_author', pref_attach_author),
            ('aai_author', aai_author),
            ('jacard_coeff_author', jacard_coeff_author),
            ('source_degree_centrality_author_mean', source_degree_centrality_author_mean),
            ('target_degree_centrality_author_mean', target_degree_centrality_author_mean),
            ('pref_attach_author_mean', pref_attach_author_mean),
            ('aai_author_mean', aai_author_mean),
            ('jacard_coeff_author_mean', jacard_coeff_author_mean),
        )
        for file_name, values in author_arrays_by_file:
            np.save(file_name, values)

In [15]:
if recreate_training_features:
    # number of overlapping words in title
    overlap_title = []
    # temporal distance between the papers
    temp_diff = []
    # number of common authors
    comm_auth = []
    # same journal
    same_journal = []
    # number of overlapping words in abstract
    overlap_abstract = []

    # Paper id -> node_info row, built once. setdefault keeps the first row
    # seen for an id, which is the row the former per-sample scan selected.
    info_by_id = {}
    for element in node_info:
        info_by_id.setdefault(element[0], element)

    # Paper id -> (title stems, abstract stems). Each paper occurs in many
    # pairs, so its text is lower-cased, split on spaces, stripped of stop
    # words and stemmed only once.
    stems_by_id = {}

    print(len(training_set))
    for counter, edge in enumerate(training_set, start=1):
        source_info = info_by_id[edge[0]]
        target_info = info_by_id[edge[1]]

        for paper_info in (source_info, target_info):
            if paper_info[0] not in stems_by_id:
                # column 2 is the title, column 5 the abstract
                stems_by_id[paper_info[0]] = tuple(
                    {stemmer.stem(token)
                     for token in paper_info[column].lower().split(" ")
                     if token not in stpwds}
                    for column in (2, 5)
                )
        source_title, source_abstract = stems_by_id[source_info[0]]
        target_title, target_abstract = stems_by_id[target_info[0]]

        source_auth = source_info[3].split(",")
        target_auth = target_info[3].split(",")

        overlap_title.append(len(source_title & target_title))
        temp_diff.append(int(source_info[1]) - int(target_info[1]))
        comm_auth.append(len(set(source_auth).intersection(set(target_auth))))
        same_journal.append(int(source_info[4] == target_info[4]))
        # The abstract overlap is computed from the abstract tokens. It used
        # to be derived from the title tokens by mistake, so feature files
        # cached before this fix must be regenerated, and the test-set
        # features must be computed the same way.
        overlap_abstract.append(len(source_abstract & target_abstract))

        if counter % 10000 == 0:
            print(counter, "training examples processsed\r")

    overlap_title_sqrt = np.sqrt(overlap_title)
    temp_diff_sqrt = np.sqrt(np.abs(temp_diff))
    overlap_abstract_sqrt = np.sqrt(overlap_abstract)

    if save_recreated_features:
        np.save('overlap_title', np.array(overlap_title))
        np.save('temp_diff', np.array(temp_diff))
        np.save('comm_auth', np.array(comm_auth))
        np.save('same_journal', np.array(same_journal))
        np.save('overlap_abstract', np.array(overlap_abstract))
        np.save('overlap_title_sqrt', np.array(overlap_title_sqrt))
        np.save('temp_diff_sqrt', np.array(temp_diff_sqrt))
        np.save('overlap_abstract_sqrt', np.array(overlap_abstract_sqrt))

In [16]:
if not recreate_training_features:
    # Load every training feature from its cached .npy file instead of
    # recomputing it.
    # Article-graph features.
    source_degree_centrality_article = np.load('source_degree_centrality_article.npy')
    target_degree_centrality_article = np.load('target_degree_centrality_article.npy')
    pref_attach_article = np.load('pref_attach_article.npy')
    aai_article = np.load('aai_article.npy')
    jacard_coeff_article = np.load('jacard_coeff_article.npy')
    # Journal-graph features.
    source_degree_centrality_journal = np.load('source_degree_centrality_journal.npy')
    target_degree_centrality_journal = np.load('target_degree_centrality_journal.npy')
    pref_attach_journal = np.load('pref_attach_journal.npy')
    aai_journal = np.load('aai_journal.npy')
    jacard_coeff_journal = np.load('jacard_coeff_journal.npy')
    # Author-graph features (max over author pairs).
    source_degree_centrality_author = np.load('source_degree_centrality_author.npy')
    target_degree_centrality_author = np.load('target_degree_centrality_author.npy')
    pref_attach_author = np.load('pref_attach_author.npy')
    aai_author = np.load('aai_author.npy')
    jacard_coeff_author = np.load('jacard_coeff_author.npy')
    # Author-graph features (mean over author pairs).
    source_degree_centrality_author_mean = np.load('source_degree_centrality_author_mean.npy')
    target_degree_centrality_author_mean = np.load('target_degree_centrality_author_mean.npy')
    pref_attach_author_mean = np.load('pref_attach_author_mean.npy')
    aai_author_mean = np.load('aai_author_mean.npy')
    jacard_coeff_author_mean = np.load('jacard_coeff_author_mean.npy')
    # Text / metadata features and their square-root variants.
    overlap_title = np.load('overlap_title.npy')
    overlap_abstract = np.load('overlap_abstract.npy')
    temp_diff = np.load('temp_diff.npy')
    comm_auth = np.load('comm_auth.npy')
    same_journal = np.load('same_journal.npy')
    overlap_title_sqrt = np.load('overlap_title_sqrt.npy')
    temp_diff_sqrt = np.load('temp_diff_sqrt.npy')
    overlap_abstract_sqrt = np.load('overlap_abstract_sqrt.npy')

In [17]:
# convert list of lists into array
# documents as rows, unique words as columns (i.e., example as rows, features as columns)
# One entry per feature, each holding one value per training pair; the column
# order here defines the column order of the design matrix.
training_feature_columns = [
    # article graph
    source_degree_centrality_article,
    target_degree_centrality_article,
    pref_attach_article,
    aai_article,
    jacard_coeff_article,
    # text / metadata
    overlap_title,
    temp_diff,
    comm_auth,
    same_journal,
    overlap_abstract,
    overlap_title_sqrt,
    temp_diff_sqrt,
    overlap_abstract_sqrt,
    # journal graph
    source_degree_centrality_journal,
    target_degree_centrality_journal,
    pref_attach_journal,
    aai_journal,
    jacard_coeff_journal,
    # author graph (max over author pairs)
    source_degree_centrality_author,
    target_degree_centrality_author,
    pref_attach_author,
    aai_author,
    jacard_coeff_author,
    # author graph (mean over author pairs)
    source_degree_centrality_author_mean,
    target_degree_centrality_author_mean,
    pref_attach_author_mean,
    aai_author_mean,
    jacard_coeff_author_mean,
]
# Transpose so that examples are rows and features are columns.
training_features = np.array(training_feature_columns).T

In [18]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the raw features with all polynomial terms up to degree 2.
poly = PolynomialFeatures(2)
training_features = poly.fit_transform(training_features)

# Standardise the expanded columns; the fitted scaler is kept for later reuse.
scaler = preprocessing.StandardScaler()
training_features = scaler.fit_transform(training_features)

In [19]:
# Sanity check: (number of training pairs, number of expanded features).
print(training_features.shape)

(615512, 435)


In [20]:
# Column 2 of every training row is the label: cast to int, then to an array.
labels = [int(row[2]) for row in training_set]
labels_array = np.array(labels)

# Using a neural network for inference with 2 classes

In [None]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

class NeuralNetwork(nn.Module):
    """Fully connected classifier with four hidden ReLU layers.

    Args:
        in_features: number of input features per example. Defaults to 435,
            the width the network was originally hard-coded for.
        n_classes: number of output logits. Defaults to 2.
    """

    def __init__(self, in_features=435, n_classes=2):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, 512),
            nn.ReLU(),
            nn.Linear(512, 1024),
            nn.ReLU(),
            nn.Linear(1024, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, n_classes),
        )

    def forward(self, x):
        """Return the raw class logits for a batch `x`."""
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

    def predict(self, x):
        """Alias of `forward`: returns logits, not class indices."""
        return self.forward(x)


In [None]:
def test(model, criterion, testloader, epoch):
    """Evaluate `model` on `testloader` and return its accuracy.

    Args:
        model: network to evaluate; it is switched to eval mode.
        criterion: accepted for signature compatibility; not used here.
        testloader: iterable of (inputs, targets) batches.
        epoch: zero-based epoch index, used only in the printed message.

    Returns:
        Fraction of correctly classified examples (0.0 for an empty loader).
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in testloader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            # index of the largest logit is the predicted class
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
    # Guard against an empty loader instead of dividing by zero.
    accuracy = correct / total if total else 0.0
    print(f"Epoch {epoch+1}\t\t Test accuracy:{accuracy}")
    return accuracy
    
def train(model, optimizer, criterion, train_loader, testloader, epoch, log_step):
    """Run one training epoch and, every `log_step` epochs, evaluate the model.

    Args:
        model: network to train; it is switched to train mode.
        optimizer: optimizer stepping the model parameters.
        criterion: loss called as criterion(outputs, targets).
        train_loader: iterable of (inputs, targets) training batches.
        testloader: iterable of (inputs, targets) evaluation batches.
        epoch: zero-based epoch index.
        log_step: evaluate and log when (epoch + 1) is a multiple of this.

    Returns:
        True when the latest test accuracy is lower than both of the two
        previous recorded ones (the early-stopping signal), False otherwise.
        Callers that ignore the return value behave as before.
    """
    model.train()
    correct = 0
    total = 0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

    should_stop = False
    if (epoch+1) % log_step == 0:
        print(f"Epoch {epoch+1}\t\t Training accuracy:{correct/total}")
        test_acc = test(model, criterion, testloader, epoch)
        # The accuracy history must outlive a single call, otherwise the check
        # below always compares against the initial zeros and never fires.
        # It is kept on the model so each model has its own history.
        hist_test = getattr(model, "_test_acc_history", None)
        if hist_test is None:
            hist_test = [0, 0]
            model._test_acc_history = hist_test
        hist_test.append(test_acc)
        if hist_test[-3] > hist_test[-1] and hist_test[-2] > hist_test[-1]:
            print("Early stopping")
            should_stop = True
    return should_stop

In [None]:
# Model first, then the full feature matrix and labels as tensors on `device`.
classifier = NeuralNetwork().to(device)
tensor_features = torch.Tensor(training_features).to(device)
tensor_y = torch.tensor(labels_array, dtype=torch.long).to(device)

# The first 50000 examples are used for training, all the others for evaluation.
dataset_train = torch.utils.data.TensorDataset(
    tensor_features[:50000], tensor_y[:50000]
)
dataset_val = torch.utils.data.TensorDataset(
    tensor_features[50000:], tensor_y[50000:]
)

# Shuffled mini-batches of 100 examples for both splits.
train_data_loader = DataLoader(dataset_train, batch_size=100, shuffle=True)
test_data_loader = DataLoader(dataset_val, batch_size=100, shuffle=True)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(classifier.parameters())

In [None]:
for epoch in range(100):
    # Stop as soon as train() returns a truthy value; a None/False return
    # simply keeps the loop running for all 100 epochs.
    if train(classifier, optimizer, criterion, train_data_loader, test_data_loader, epoch, 1):
        break

# SVM

In [21]:
if algorithm == "SVM":
    # Linear SVM with a raised iteration cap; fit() returns the fitted
    # estimator, so construction and training are chained.
    classifier = svm.LinearSVC(max_iter=20000).fit(training_features, labels_array)

# Random forest

In [22]:
from sklearn.ensemble import RandomForestClassifier

if algorithm == "RandomForest":
    # 200 trees, trained with n_jobs=-1; fit() returns the fitted estimator.
    classifier = RandomForestClassifier(
        n_estimators=200,
        n_jobs=-1,
    ).fit(training_features, labels_array)

# XGBoost

In [23]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

if algorithm == "XGBoost":
    # The estimator is fitted directly on the feature array, so no DMatrix is
    # needed; the one previously built here was never used and only duplicated
    # the whole training matrix in memory.
    # NOTE(review): 'reg:squarederror' is a regression objective used with a
    # classifier; the prediction cell thresholds the outputs at 0.5. Confirm
    # this is intended before switching to a classification objective.
    classifier = xgb.XGBClassifier(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                    max_depth = 8, alpha = 10, n_estimators = 75)#, tree_method='gpu_hist')

    classifier.fit(training_features, labels_array)

  from pandas import MultiIndex, Int64Index


# Predictions on test set

In [15]:
if recreate_testing_features:
    # Article-graph features for every test pair, unpacked per feature.
    graph_features_article = feature_extractor_article(G_article, testing_set)
    (source_degree_centrality_article,
     target_degree_centrality_article,
     pref_attach_article,
     aai_article,
     jacard_coeff_article) = graph_features_article
    if save_recreated_features:
        # Save under "_test_" names, like the journal and author test
        # features. The plain "*_article" names are the training caches and
        # must not be overwritten with test-set values.
        np.save('source_degree_centrality_test_article', np.array(source_degree_centrality_article))
        np.save('target_degree_centrality_test_article', np.array(target_degree_centrality_article))
        np.save('pref_attach_test_article', np.array(pref_attach_article))
        np.save('aai_test_article', np.array(aai_article))
        np.save('jacard_coeff_test_article', np.array(jacard_coeff_article))

100%|██████████| 32648/32648 [00:05<00:00, 5641.82it/s]


In [17]:
if not recreate_testing_features:
    def _load_test_article_feature(feature):
        """Load the cached test-set article feature named `feature`.

        Prefers "<feature>_test_article.npy", the naming used by the other
        test-set caches, and falls back to the legacy "<feature>_article.npy"
        when no test-specific file exists. The legacy name is also the one
        used for the training cache, so a fallback load may not be test data.
        """
        test_path = f"{feature}_test_article.npy"
        legacy_path = f"{feature}_article.npy"
        return np.load(test_path if os.path.exists(test_path) else legacy_path)

    source_degree_centrality_article = _load_test_article_feature("source_degree_centrality")
    target_degree_centrality_article = _load_test_article_feature("target_degree_centrality")
    pref_attach_article = _load_test_article_feature("pref_attach")
    aai_article = _load_test_article_feature("aai")
    jacard_coeff_article = _load_test_article_feature("jacard_coeff")

In [25]:
if recreate_testing_features:
    # Journal-graph features for every test pair, unpacked per feature.
    graph_features_journal = feature_extractor_journal(G_journal, testing_set)
    (source_degree_centrality_journal,
     target_degree_centrality_journal,
     pref_attach_journal,
     aai_journal,
     jacard_coeff_journal) = graph_features_journal
    if save_recreated_features:
        journal_test_file_names = (
            'source_degree_centrality_test_journal',
            'target_degree_centrality_test_journal',
            'pref_attach_test_journal',
            'aai_test_journal',
            'jacard_coeff_test_journal',
        )
        for file_name, values in zip(journal_test_file_names, graph_features_journal):
            np.save(file_name, np.array(values))

100%|██████████| 32648/32648 [05:00<00:00, 108.57it/s]


In [18]:
if not recreate_testing_features:
    # Load the cached journal-graph features of the test pairs.
    source_degree_centrality_journal = np.load("source_degree_centrality_test_journal.npy")
    target_degree_centrality_journal = np.load("target_degree_centrality_test_journal.npy")
    pref_attach_journal = np.load("pref_attach_test_journal.npy")
    aai_journal = np.load("aai_test_journal.npy")
    jacard_coeff_journal = np.load("jacard_coeff_test_journal.npy")

In [27]:
if recreate_testing_features:
    # Max-aggregated author features for every test pair.
    graph_features_author = feature_extractor_author(G_authors, testing_set)
    (source_degree_centrality_author,
     target_degree_centrality_author,
     pref_attach_author,
     aai_author,
     jacard_coeff_author) = graph_features_author
    if save_recreated_features:
        author_test_file_names = (
            'source_degree_centrality_test_author',
            'target_degree_centrality_test_author',
            'pref_attach_test_author',
            'aai_test_author',
            'jacard_coeff_test_author',
        )
        for file_name, values in zip(author_test_file_names, graph_features_author):
            np.save(file_name, np.array(values))

    # Mean-aggregated author features for every test pair.
    graph_features_author_mean = feature_extractor_author_mean(G_authors, testing_set)
    (source_degree_centrality_author_mean,
     target_degree_centrality_author_mean,
     pref_attach_author_mean,
     aai_author_mean,
     jacard_coeff_author_mean) = graph_features_author_mean
    if save_recreated_features:
        author_mean_test_file_names = (
            'source_degree_centrality_test_author_mean',
            'target_degree_centrality_test_author_mean',
            'pref_attach_test_author_mean',
            'aai_test_author_mean',
            'jacard_coeff_test_author_mean',
        )
        for file_name, values in zip(author_mean_test_file_names, graph_features_author_mean):
            np.save(file_name, np.array(values))

In [29]:
if not recreate_testing_features:
    # Load the cached author-graph features of the test pairs
    # (max over author pairs).
    source_degree_centrality_author = np.load("source_degree_centrality_test_author.npy")
    target_degree_centrality_author = np.load("target_degree_centrality_test_author.npy")
    pref_attach_author = np.load("pref_attach_test_author.npy")
    aai_author = np.load("aai_test_author.npy")
    jacard_coeff_author = np.load("jacard_coeff_test_author.npy")

    # Same features, mean over author pairs.
    source_degree_centrality_author_mean = np.load("source_degree_centrality_test_author_mean.npy")
    target_degree_centrality_author_mean = np.load("target_degree_centrality_test_author_mean.npy")
    pref_attach_author_mean = np.load("pref_attach_test_author_mean.npy")
    aai_author_mean = np.load("aai_test_author_mean.npy")
    jacard_coeff_author_mean = np.load("jacard_coeff_test_author_mean.npy")

In [30]:
# These features are quite fast to recreate, we didn't save them

overlap_title_test = []
temp_diff_test = []
comm_auth_test = []
same_journal_test = []
overlap_abstract_test = []

# Paper id -> node_info row, built once. setdefault keeps the first row seen
# for an id, which is the row the former per-sample scan selected.
info_by_id = {}
for element in node_info:
    info_by_id.setdefault(element[0], element)

# Paper id -> (title stems, abstract stems). Each paper occurs in many pairs,
# so its text is lower-cased, split on spaces, stripped of stop words and
# stemmed only once.
stems_by_id = {}

for counter, edge in enumerate(testing_set, start=1):
    source_info = info_by_id[edge[0]]
    target_info = info_by_id[edge[1]]

    for paper_info in (source_info, target_info):
        if paper_info[0] not in stems_by_id:
            # column 2 is the title, column 5 the abstract
            stems_by_id[paper_info[0]] = tuple(
                {stemmer.stem(token)
                 for token in paper_info[column].lower().split(" ")
                 if token not in stpwds}
                for column in (2, 5)
            )
    source_title, source_abstract = stems_by_id[source_info[0]]
    target_title, target_abstract = stems_by_id[target_info[0]]

    source_auth = source_info[3].split(",")
    target_auth = target_info[3].split(",")

    overlap_title_test.append(len(source_title & target_title))
    temp_diff_test.append(int(source_info[1]) - int(target_info[1]))
    comm_auth_test.append(len(set(source_auth).intersection(set(target_auth))))
    same_journal_test.append(int(source_info[4] == target_info[4]))
    # The abstract overlap is computed from the abstract tokens. It used to be
    # derived from the title tokens by mistake; the training features must be
    # computed the same way, so training caches created before this fix have
    # to be regenerated.
    overlap_abstract_test.append(len(source_abstract & target_abstract))

    if counter % 10000 == 0:
        print(counter, "testing examples processsed\r")


overlap_title_test_sqrt = np.sqrt(overlap_title_test)
temp_diff_test_sqrt = np.sqrt(np.abs(temp_diff_test))
overlap_abstract_test_sqrt = np.sqrt(overlap_abstract_test)

# One entry per feature, in exactly the column order of the training matrix:
# article graph, text / metadata, journal graph, author graph (max), author
# graph (mean). The first five columns used to repeat the journal features,
# which fed the model different columns than it was trained on.
testing_feature_columns = [
    source_degree_centrality_article,
    target_degree_centrality_article,
    pref_attach_article,
    aai_article,
    jacard_coeff_article,
    overlap_title_test,
    temp_diff_test,
    comm_auth_test,
    same_journal_test,
    overlap_abstract_test,
    overlap_title_test_sqrt,
    temp_diff_test_sqrt,
    overlap_abstract_test_sqrt,
    source_degree_centrality_journal,
    target_degree_centrality_journal,
    pref_attach_journal,
    aai_journal,
    jacard_coeff_journal,
    source_degree_centrality_author,
    target_degree_centrality_author,
    pref_attach_author,
    aai_author,
    jacard_coeff_author,
    source_degree_centrality_author_mean,
    target_degree_centrality_author_mean,
    pref_attach_author_mean,
    aai_author_mean,
    jacard_coeff_author_mean,
]

# Every column must hold one value per test pair; a cached array of another
# length (e.g. a training-set cache loaded by mistake) is reported explicitly
# instead of silently producing a malformed matrix.
mismatched_columns = [
    position
    for position, column in enumerate(testing_feature_columns)
    if len(column) != len(testing_set)
]
if mismatched_columns:
    raise ValueError(
        f"test feature columns {mismatched_columns} do not have "
        f"{len(testing_set)} entries; regenerate the cached test features"
    )

# convert list of lists into array
# documents as rows, unique words as columns (i.e., example as rows, features as columns)
testing_features = np.array(testing_feature_columns).T

1 testing examples processsed
10001 testing examples processsed
20001 testing examples processsed
30001 testing examples processsed


In [31]:
# Apply the polynomial expansion and the scaling exactly as fitted on the
# training data; preprocessing must never be refitted on the test set.
testing_features = poly.transform(testing_features)
testing_features = scaler.transform(testing_features)

In [32]:
# Sanity check: (number of test pairs, number of expanded features).
print(testing_features.shape)

(32648, 300)


In [34]:
# issue predictions
# Threshold the classifier outputs at 0.5 to obtain hard 0/1 labels.
predictions_rand_for = [
    0 if score <= 0.5 else 1
    for score in classifier.predict(testing_features)
]
# (row id, label) pairs in the layout expected by Kaggle. They are materialised
# as a list so the cell that writes them can be re-run; a bare zip object would
# be exhausted after the first pass and produce an empty file afterwards.
predictions = list(zip(range(len(testing_set)), predictions_rand_for))

In [35]:
name = "improved_predictions_20000_300_features_svm.csv"

# newline="" lets the csv module control line endings itself; without it an
# extra blank line is written after every row on platforms that translate
# "\n" on output.
with open(name, "w", newline="") as pred1:
    csv_out = csv.writer(pred1)
    csv_out.writerow(("id","category"))
    csv_out.writerows(predictions)