In [None]:
%conda install numpy 
%conda install pandas
%conda install -c anaconda scikit-learn 
%conda install networkx
%conda install -c conda-forge tpot
%conda install -c conda-forge ipywidgets

In [None]:
%conda install pytorch torchvision -c pytorch

In [2]:
import pytorch

ModuleNotFoundError: No module named 'pytorch'

In [None]:
import numpy as np 
import pandas as pd
import networkx as nx

In [None]:
links = pd.read_csv('../data/raw/training.txt', header = None, sep = ' ', names = ['node', 'target', 'edge'])

In [None]:
X = links[['node', 'target']]
y = links['edge']

# Networkx Link Prediction Features

1. Split the dataset 
2. Create graph connections with the training set 
3. Predict new coefficients for training and test set (feature generation)


In [None]:
def CreateGraph (X, y):
    
    G = nx.Graph()
    G.add_nodes_from(range(1,33227))
    G.add_edges_from( list(X[y == 1][['node','target']].itertuples(index=False, name=None)) )
    
    return G

In [None]:
def AppendNextworkxFeature (function, G, X):
    
    column_name = str(function).split()[1]
    
    tuple_list = list(X[['node','target']].itertuples(index=False, name=None))
    coef_generator = function(G, tuple_list)
    coef_df = pd.DataFrame(coef_generator)
    X.insert(2, column_name, list(coef_df[2]), allow_duplicates = True)


In [None]:
def nxGenerateFeatures (X_train, X_test, y_train):
    
    # 2. Create Graph
    # Total number of nodes=pages: 33.226
    G = CreateGraph (X_train, y_train)
    
    # 3. Predict new coefficient/feature for defined link prediction function
    linkPredictionFunctions = [nx.resource_allocation_index, nx.jaccard_coefficient, nx.adamic_adar_index, nx.preferential_attachment]
    for function in linkPredictionFunctions:
        AppendNextworkxFeature(function, G, X_train)
        AppendNextworkxFeature(function, G, X_test)
        
    return X_train, X_test

In [None]:
# 1. Stratified Split 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
# 2/3. Create Graph and generate features 
X_train, X_test = nxGenerateFeatures (X_train, X_test, y_train)

# Save Intermediate Results
X_train.to_csv('../data/intermediate/X_train_nx.csv', sep=',', index=False)
X_test.to_csv('../data/intermediate/X_test_nx.csv', sep=',', index=False)
y_train.to_csv('../data/intermediate/y_train_nx.csv', sep=',', index=False)
y_test.to_csv('../data/intermediate/y_test_nx.csv', sep=',', index=False)

# Load Networkx Intermediate Sets

In [None]:
X_train = pd.read_csv('../data/intermediate/X_train_nx.csv', sep=',')
X_test = pd.read_csv('../data/intermediate/X_test_nx.csv', sep=',')

y_train = np.ravel(pd.read_csv('../data/intermediate/y_train_nx.csv', sep=','))
y_test = np.ravel(pd.read_csv('../data/intermediate/y_test_nx.csv', sep=','))

In [None]:
if 'G' not in locals():
    G = CreateGraph (X_train, y_train)

# First classification model evaluation

In [None]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

##################################################################################################

##Classifiers
from sklearn.ensemble import AdaBoostClassifier #begins by fitting a classifier on the original dataset and then fits additional copies of the classifier on the same dataset but where the weights of incorrectly classified instances are adjusted
from sklearn.ensemble import BaggingClassifier #Bagging classifier fits base classifiers each on random subsets of the original dataset and aggregate their individual predictions
from sklearn.ensemble import ExtraTreesClassifier #Extremely Random Trees: This class implements a meta estimator that fits a number of randomized decision trees (a.k.a. extra-trees) on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting
from sklearn.ensemble import GradientBoostingClassifier #GB builds an additive model in a forward stage-wise fashion; it allows for the optimization of arbitrary differentiable loss functions
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier #Classifier implementing the k-nearest neighbors vote.
from sklearn.ensemble import VotingClassifier

In [None]:
def Scale(X):
    scaled_features = StandardScaler().fit_transform(X.values)
    return pd.DataFrame(scaled_features, index = X.index, columns = X.columns)

In [None]:
# Data preparation parameters
'''Better performance withou scaling'''
scale = False 

In [None]:
# Data preparation
if 'node' and 'target' in X_train.columns:
    X_train = X_train.drop(columns = ['node', 'target'])
if 'node' and 'target' in X_test.columns:
    X_test = X_test.drop(columns = ['node', 'target'])
    

if scale:  
    X_train = Scale(X_train)
    X_test = Scale(X_test)

In [None]:
# Classification Models
print("Summary for classifiers:")

clf = [
            [AdaBoostClassifier(), "AdaBoostClassifier"],
            [BaggingClassifier(), "BaggingClassifier"],
            [ExtraTreesClassifier(), "ExtraTreesClassifier"],
            [GradientBoostingClassifier(), "GradientBoostClassifier"],
            [DecisionTreeClassifier(), "DecisionTreeClassifier"],
            [RandomForestClassifier(), "RandomForestClassifier"]
        ]

performance_train = {}
performance_test = {}
    
for classifier, clf_name in clf: performance_train[clf_name] = []
for classifier, clf_name in clf: performance_test[clf_name] = []

    
for elem in clf: #Use each classifier in clf
    classifier = elem[0]
    classifier_name = elem[1]
    print(classifier_name)
        
    try:    
        classifier.fit(X_train, y_train)
            
        y_hat = classifier.predict(X_train)
        #Train Scores:
        f1_train = f1_score(y_train, y_hat)
        accuracy_train = accuracy_score(y_train, y_hat)
        precision_train = precision_score(y_train, y_hat)
        recall_train = recall_score(y_train, y_hat)
        roc_auc_train = roc_auc_score(y_train, y_hat)
        #Print train Scores
        print(f"Train scores: \nf1-score: {round(f1_train,3)}\tAccuracy: {round(accuracy_train, 3)}\tPrecision: {round(precision_train,3)}\tRecall: {round(recall_train,3)}\tROC-AUC: {round(roc_auc_train,3)}")
        #Sava train scors for comparison
        performance_train[classifier_name].append(f1_train)
        performance_train[classifier_name].append(accuracy_train)
        performance_train[classifier_name].append(precision_train)
        performance_train[classifier_name].append(recall_train)
        performance_train[classifier_name].append(roc_auc_train)
           
        y_pred = classifier.predict(X_test)
        #Test scores
        f1_test = f1_score(y_test, y_pred)
        accuracy_test = accuracy_score(y_test, y_pred)
        precision_test = precision_score(y_test, y_pred)
        recall_test = recall_score(y_test, y_pred)
        roc_auc_test = roc_auc_score(y_test, y_pred)
        #Print test scores          
        print(f"Test scores: \nf1-score: {round(f1_test,3)}\tAccuracy: {round(accuracy_test,3)}\tPrecision: {round(precision_test,3)}\tRecall: {round(recall_test,3)}\tROC-AUC: {round(roc_auc_test,3)}")
          #Save test scores
        performance_test[classifier_name].append(f1_test)
        performance_test[classifier_name].append(accuracy_test)
        performance_test[classifier_name].append(precision_test)
        performance_test[classifier_name].append(recall_test)
        performance_test[classifier_name].append(roc_auc_test)

        print("\n**********************************************************************")
    except ImportError:
        print("Classifier \"" + classifier_name + "failed.")
print("End")

# tpot auto-ml tool for hyper-parameter selection

In [None]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

if 'node' and 'target' in X_train.columns:
    X_train = X_train.drop(columns = ['node', 'target'])
if 'node' and 'target' in X_test.columns:
    X_test = X_test.drop(columns = ['node', 'target'])
    

tpot = TPOTClassifier(generations = 5, population_size = 20, cv = 3, verbosity=2, scoring = 'f1')

tpot.fit(X_train, y_train)
tpot.score(X_test, y_test)

# Submission Prediction

In [None]:
# Testing File
X_submission = pd.read_csv('../data/raw/testing.txt', header = None, sep = ' ', names = ['node', 'target'])

# Feature Generation
X, X_submission = nxGenerateFeatures(X, X_submission, y)

# Save Processed Results
X.to_csv('../data/processed/X_nx.csv', sep=',', index=False)
y.to_csv('../data/processed/y_nx.csv', sep=',', index=False)
X_submission.to_csv('../data/processed/X_submission.csv', sep=',', index=False)

In [None]:
# Load Processed Results Networkx
X = pd.read_csv('../data/processed/X_nx.csv', sep = ',')
y = pd.read_csv('../data/processed/y_nx.csv', sep = ',')
X_submission = pd.read_csv('../data/processed/X_submission_nx.csv', sep = ',')

In [None]:
# Data preparation

# could be 'or' since these columns are always removed together
if 'node' and 'target' in X.columns:
    X = X.drop(columns = ['node', 'target'])
    
if 'node' and 'target' in X_submission.columns:
    X_submission = X_submission.drop(columns = ['node', 'target'])

In [None]:
# Prediction according to Ada Boost Clssifier  

elem = [AdaBoostClassifier(), "AdaBoostClassifier"]
classifier = elem[0]
classifier_name = elem[1]
print(classifier_name)

try:    
    classifier.fit(X, y)
    y_hat = classifier.predict(X)
        
    #Train Scores:
    f1_train = f1_score(y, y_hat)
    accuracy_train = accuracy_score(y, y_hat)
    precision_train = precision_score(y, y_hat)
    recall_train = recall_score(y, y_hat)
    roc_auc_train = roc_auc_score(y, y_hat)
    #Print train Scores
    print(f"Train scores: \nf1-score: {round(f1_train,3)}\tAccuracy: {round(accuracy_train, 3)}\tPrecision: {round(precision_train,3)}\tRecall: {round(recall_train,3)}\tROC-AUC: {round(roc_auc_train,3)}")
    
    y_pred = classifier.predict(X_submission)
    print("\n**********************************************************************")
except ImportError:
    print("Classifier \"" + classifier_name + "failed.")
print("End")

In [None]:
# Save Results
pd.DataFrame(y_pred, columns = ['predicted']).to_csv('../results/outputs/nxAdaBoost.csv', sep=',', index=True, index_label='id')

In [None]:
submission = pd.read_csv('../results/outputs/nxAdaBoost.csv', sep=',')

In [None]:
submission