In [None]:
# %conda install numpy 
# %conda install pandas
# %conda install -c anaconda scikit-learn 
# %conda install networkx
# %conda install -c conda-forge node2vec 
# %conda install -c conda-forge tpot
# %conda install -c conda-forge ipywidgets
# %conda install pytorch torchvision -c pytorch
# %conda install -c anaconda cudatoolkit=10.0
# %pip install xgboost (GPU not supported in conda environment)

In [1]:
import os
import numpy as np 
import pandas as pd
import networkx as nx

# Summary

This notebook consists of an explorative analysis of the problem of predicting links in web pages. Input data:  
* pairs of pages (two nodes in a graph) and a boolean variable indicating if there is a link (an edge) between them.  
* text of all pages

This way the strategy to approach this problem is first to extract as much relevant information as possible from the inputs, that is, to engineer features for the graph and for the text of the pages, and then to train and tune a classification model.  The problem is thus divided:

##### Feature Engineering
1. Networkx Link Prediction Features
2. Node embedding Features
3. Text Features

##### Classification Models
4. Miscellaneous Classifiers
5. XGBoost

##### Prediction
6. Submission prediction



#### Load Initial Data 

In [None]:
links = pd.read_csv('../data/raw/training.txt', header = None, sep = ' ', names = ['node', 'target', 'edge'])
X = links[['node', 'target']]
y = links['edge']

## 1. Networkx Link Prediction Features

### Feature Engineering
1.1 Split the dataset <br/>
1.2 Create graph connections with the training set <br/>
1.3 Predict new coefficients for training and test set (feature generation)



In [None]:
def CreateGraph (X, y, directed = False):
    
    if directed:
        G = nx.from_pandas_edgelist(X, 'node', 'target', create_using=nx.DiGraph())
    else:    
        G = nx.from_pandas_edgelist(X[y == 1], 'node', 'target', create_using=nx.Graph())
        
    G.add_nodes_from(range(33226))
    
    return G

In [None]:
def AppendNextworkxFeature (function, G, X):
    
    column_name = str(function).split()[1]
    
    tuple_list = list(X[['node','target']].itertuples(index=False, name=None))
    coef_generator = function(G, tuple_list)
    coef_df = pd.DataFrame(coef_generator)
    X.insert(2, column_name, list(coef_df[2]), allow_duplicates = True)


In [None]:
def nxGenerateFeatures (X_train, X_test, y_train):
    
    # 1.2 Create Graph
    # Total number of nodes=pages: 33.226
    G = CreateGraph (X_train, y_train)
    
    # 1.3 Predict new coefficient/feature for defined link prediction function
    linkPredictionFunctions = [nx.resource_allocation_index, nx.jaccard_coefficient, nx.adamic_adar_index, nx.preferential_attachment]
    for function in linkPredictionFunctions:
        AppendNextworkxFeature(function, G, X_train)
        AppendNextworkxFeature(function, G, X_test)
        
    return X_train, X_test

In [2]:
def SaveData (path, X_train, X_test, y_train, y_test=[]):
    
    parent_folder = '/'.join([x for x in path.split('/')[:-2]])
    
    X_train.to_csv(os.path.join(path, 'X_train.csv'), sep=',', index=False)
    X_test.to_csv(os.path.join(path, 'X_test.csv'), sep=',', index=False)
    
    pd.DataFrame(y_train).to_csv(os.path.join(parent_folder,'y_train.csv'), sep=',', index=False, header=False)
    
    if len(y_test) > 0:
        pd.DataFrame(y_test).to_csv(os.path.join(parent_folder,'y_test.csv'), sep=',', index=False, header=False)   

In [3]:
def LoadData (path):
    
    parent_folder = '/'.join([x for x in path.split('/')[:-2]])
    
    X_train = pd.read_csv(os.path.join(path, 'X_train.csv'), sep=',')
    X_test = pd.read_csv(os.path.join(path, 'X_test.csv'), sep=',')
    y_train = np.genfromtxt(os.path.join(parent_folder,'y_train.csv'), delimiter=',', skip_header=0)
    
    try:
        y_test = np.genfromtxt(os.path.join(parent_folder,'y_test.csv'), delimiter=',', skip_header=0)
        return X_train, X_test, y_train, y_test
    
    except Exception:
        pass
        
    return X_train, X_test, y_train

In [None]:
# 1.1 Stratified Split 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
# 1.2 and 1.3 Generate features
X_train, X_test = nxGenerateFeatures (X_train, X_test, y_train)

# Save Networkx Intermediate Results
SaveData('../data/intermediate/networkx/', X_train, X_test, y_train, y_test)

In [None]:
# Load Networkx Intermediate Results
X_train, X_test, y_train, y_test = LoadData('../data/intermediate/networkx/')

# 2. Node embedding features 

### Feature Engineering
2.1. Fit node2vec model to the graph, and embed it's edges <br/>
2.2. Apply the model to train and test set (feature generation)

In [35]:
from gensim.models import Word2Vec
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder
from numpy import linalg as LA

In [None]:
def node2VecGenerateFeatures(X_train, X_test, y_train, savePath = None, loadPath = None, workers = 1):
    
    DG = CreateGraph(X_train, y_train, directed = True)

    # 2.1 Fit or Load model
    if loadPath:
        model = Word2Vec.load(loadPath)
    else:
        node2vec = Node2Vec(DG, dimensions=20, walk_length=16, num_walks=100, workers=workers)
        model = node2vec.fit(window=4, min_count=1)
        if savePath:
            model.save(savePath)

            
    # Embed edges using Hadamard Embedder
    edges_embs = HadamardEmbedder(keyed_vectors=model.wv)


    # 2.2 Apply embedding to each element
    emb_train = [ edges_embs[(str(i), str(j))] for i,j in zip(X_train['node'], X_train['target'])]
    emb_test = [ edges_embs[(str(i), str(j))] for i,j in zip(X_test['node'], X_test['target'])]
    
    X_train['edge_norm'] = [LA.norm(v) for v in emb_train]
    X_test['edge_norm'] = [LA.norm(v) for v in emb_test]

    return X_train, X_test

In [None]:
try:
    X_train, X_test = node2VecGenerateFeatures(X_train, X_test, y_train, loadPath = '../results/models/node2vec/emb1_train.model')
except Exception:
    X_train, X_test = node2VecGenerateFeatures(X_train, X_test, y_train, savePath = '../results/models/node2vec/emb1_train.model', workers = 12)
    pass

# Save Node2Vec Intermediate Results
SaveData('../data/intermediate/node2vec/', X_train, X_test, y_train, y_test)

In [4]:
# Load Node2Vec Intermediate Data
X_train, X_test, y_train, y_test = LoadData('../data/intermediate/node2vec/')

# 3. Text Features

### Feature Engineering

2.1. Load doc2vec embedding model trained in 'TextFeatures' notebook  <br/>
2.2. Apply the model to train and test set (feature generation)

In [6]:
# Vectorization Options: TF-IDF or doc2vec 
# Format Options: concatenating nodes or cosine similarity 

In [9]:
from gensim.models.doc2vec import Doc2Vec

def doc2vecGenerateFeatures (X_train, X_test, loadPath, difference = False):
    
    # 3.1 Load Model 
    model = Doc2Vec.load(loadPath) 
    
    # 3.2 Apply embedding to each element
    if difference:
        vec_train = [ model.infer_vector([str(i)]) -  model.infer_vector([str(j)]) for i, j in zip(X_train['node'], X_train['target']) ]
        vec_test = [ model.infer_vector([str(i)]) -  model.infer_vector([str(j)]) for i, j in zip(X_test['node'], X_test['target']) ]
        
    else:
        vec_train = [ model.infer_vector([str(i), str(j)]) for i, j in zip(X_train['node'], X_train['target']) ]
        vec_test = [ model.infer_vector([str(i), str(j)]) for i, j in zip(X_test['node'], X_test['target']) ]
    
    X_train = X_train.join(pd.DataFrame(vec_train))
    X_test = X_test.join(pd.DataFrame(vec_test))
    
    return X_train, X_test

In [10]:
X_train, X_test = doc2vecGenerateFeatures(X_train, X_test, loadPath="../results/models/doc2vec/vec10.model", difference=True)

# Save Node2Vec Intermediate Results
SaveData('../data/intermediate/doc2vec/', X_train, X_test, y_train, y_test)

In [11]:
# Load Node2Vec Intermediate Data
X_train, X_test, y_train, y_test = LoadData('../data/intermediate/doc2vec/')

# 4. Miscellaneous Classifiers

#### Data Preparation

In [39]:
def Scale(X):
    scaled_features = StandardScaler().fit_transform(X.values)
    return pd.DataFrame(scaled_features, index = X.index, columns = X.columns)

In [40]:
# Data preparation parameters
'''Better performance withou scaling'''
scale = False 

In [41]:
# Data preparation
if 'node' and 'target' in X_train.columns:
    X_train = X_train.drop(columns = ['node', 'target'])
if 'node' and 'target' in X_test.columns:
    X_test = X_test.drop(columns = ['node', 'target'])
    

if scale:  
    X_train = Scale(X_train)
    X_test = Scale(X_test)

#### Ensemble Classifiers

In [42]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

##################################################################################################

##Classifiers
from sklearn.ensemble import AdaBoostClassifier #begins by fitting a classifier on the original dataset and then fits additional copies of the classifier on the same dataset but where the weights of incorrectly classified instances are adjusted
from sklearn.ensemble import BaggingClassifier #Bagging classifier fits base classifiers each on random subsets of the original dataset and aggregate their individual predictions
from sklearn.ensemble import ExtraTreesClassifier #Extremely Random Trees: This class implements a meta estimator that fits a number of randomized decision trees (a.k.a. extra-trees) on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting
from sklearn.ensemble import GradientBoostingClassifier #GB builds an additive model in a forward stage-wise fashion; it allows for the optimization of arbitrary differentiable loss functions
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier #Classifier implementing the k-nearest neighbors vote.
from sklearn.ensemble import VotingClassifier

In [43]:
# Classification Models
print("Summary for classifiers:")

clf = [
            [AdaBoostClassifier(), "AdaBoostClassifier"],
            [BaggingClassifier(), "BaggingClassifier"],
            [ExtraTreesClassifier(), "ExtraTreesClassifier"],
            [GradientBoostingClassifier(), "GradientBoostClassifier"],
            [DecisionTreeClassifier(), "DecisionTreeClassifier"],
            [RandomForestClassifier(), "RandomForestClassifier"]
        ]

performance_train = {}
performance_test = {}
    
for classifier, clf_name in clf: performance_train[clf_name] = []
for classifier, clf_name in clf: performance_test[clf_name] = []

    
for elem in clf: #Use each classifier in clf
    classifier = elem[0]
    classifier_name = elem[1]
    print(classifier_name)
        
    try:    
        classifier.fit(X_train, y_train)
            
        y_hat = classifier.predict(X_train)
        #Train Scores:
        f1_train = f1_score(y_train, y_hat)
        accuracy_train = accuracy_score(y_train, y_hat)
        precision_train = precision_score(y_train, y_hat)
        recall_train = recall_score(y_train, y_hat)
        roc_auc_train = roc_auc_score(y_train, y_hat)
        #Print train Scores
        print(f"Train scores: \nf1-score: {round(f1_train,3)}\tAccuracy: {round(accuracy_train, 3)}\tPrecision: {round(precision_train,3)}\tRecall: {round(recall_train,3)}\tROC-AUC: {round(roc_auc_train,3)}")
        #Sava train scors for comparison
        performance_train[classifier_name].append(f1_train)
        performance_train[classifier_name].append(accuracy_train)
        performance_train[classifier_name].append(precision_train)
        performance_train[classifier_name].append(recall_train)
        performance_train[classifier_name].append(roc_auc_train)
           
        y_pred = classifier.predict(X_test)
        #Test scores
        f1_test = f1_score(y_test, y_pred)
        accuracy_test = accuracy_score(y_test, y_pred)
        precision_test = precision_score(y_test, y_pred)
        recall_test = recall_score(y_test, y_pred)
        roc_auc_test = roc_auc_score(y_test, y_pred)
        #Print test scores          
        print(f"Test scores: \nf1-score: {round(f1_test,3)}\tAccuracy: {round(accuracy_test,3)}\tPrecision: {round(precision_test,3)}\tRecall: {round(recall_test,3)}\tROC-AUC: {round(roc_auc_test,3)}")
          #Save test scores
        performance_test[classifier_name].append(f1_test)
        performance_test[classifier_name].append(accuracy_test)
        performance_test[classifier_name].append(precision_test)
        performance_test[classifier_name].append(recall_test)
        performance_test[classifier_name].append(roc_auc_test)

        print("\n**********************************************************************")
    except ImportError:
        print("Classifier \"" + classifier_name + "failed.")
print("End")

Summary for classifiers:
AdaBoostClassifier
Train scores: 
f1-score: 0.912	Accuracy: 0.891	Precision: 0.915	Recall: 0.909	ROC-AUC: 0.885
Test scores: 
f1-score: 0.899	Accuracy: 0.876	Precision: 0.913	Recall: 0.886	ROC-AUC: 0.872

**********************************************************************
BaggingClassifier


KeyboardInterrupt: 

### tpot auto-ml tool for hyper-parameter selection

In [None]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

def tpot (X_train, y_train, X_test = None, y_test = None,
          export_file = '../results/models/tpot/exported_pipeline.py', n_jobs = 1):
    
    if 'node' and 'target' in X_train.columns:
        X_train = X_train.drop(columns = ['node', 'target'])
    if 'node' and 'target' in X_test.columns:
        X_test = X_test.drop(columns = ['node', 'target'])

    tpot = TPOTClassifier(generations = 5, population_size = 40, cv=3, verbosity=2, scoring = 'f1', n_jobs=6)

    tpot.fit(X_train, y_train)
    tpot.export(export_file)
    print(tpot.score(X_test, y_test))

In [None]:
# 1) networkx
tpot(X_train, y_train, X_test, y_test, export_file = '../results/models/tpot/nx_exported_pipeline.py',  n_jobs=6)

In [None]:
# 2) node2vec
tpot(X_train, y_train, X_test, y_test, export_file = '../results/models/tpot/n2v_exported_pipeline.py',  n_jobs=6)

In [None]:
# 3) doc2vec
tpot(X_train, y_train, X_test, y_test, export_file = '../results/models/tpot/d2v_exported_pipeline.py',  n_jobs=6)

# 5) XGBoost

In [44]:
import xgboost

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

def XGB(LR = 0.1, n_est = 1000, max_d = 5, min_c = 1, gm = 0, colsample = 0.8, subs = 1, lambd = 0, alpha = 0):
   
    xgb_model = XGBClassifier( learning_rate=LR, 
                                            n_estimators=n_est,
                                            max_depth=max_d,
                                            min_child_weight=min_c,                         
                                            gamma=min_c,
                                            colsample_bytree=colsample,
                                            subsample=subs,
                                            objective ='binary:logistic',                   
                                            reg_lambda=lambd,
                                            reg_alpha=alpha,
                                            scale_pos_weight = 1,
                                            seed=42)


    xgb_model.fit(X_train,y_train)
    y_pred_train = xgb_model.predict(X_train)
    y_pred = xgb_model.predict(X_test)


    print('performance over the training set: ' + str(f1_score(y_train, y_pred_train)))
    print('performance over the test set: ' + str(f1_score(y_test, y_pred)) + '\n')
    print(classification_report(y_test, y_pred))
    
    
    
def XGB_tuning (LR = 0.1, n_est = 1000, max_d = 5, min_c = 1, gm = 0, colsample = 0.8, subs = 1,
                lambd = 0, alpha = 0, param_test = {'learning_rate':[i/100.0 for i in range(5,20,2)]} ):
    
    gsearch = GridSearchCV(estimator = XGBClassifier( learning_rate=LR, 
                                                    n_estimators=n_est,
                                                    max_depth=max_d,
                                                    min_child_weight=min_c,                         
                                                    gamma=min_c,
                                                    colsample_bytree=colsample,
                                                    subsample = subs,
                                                    objective ='binary:logistic',
                                                    reg_lambda=lambd,
                                                    reg_alpha=alpha,
                                                    scale_pos_weight = 1,
                                                    seed=42), 
                            param_grid = param_test, 
                            scoring='f1',
                            n_jobs=4, 
                            cv=3)


    gsearch.fit(X_train, y_train)
    return gsearch

In [45]:
# STEP 1 - First XGB - A little bit overfitted, but could improve more the training set
LearningRate = 0.1
n_estimators = 1000

# Fix Learning Rate and n_estimators
XGB(LearningRate, n_estimators)

performance over the training set: 0.9218679460737016
performance over the test set: 0.9007730122542035

              precision    recall  f1-score   support

         0.0       0.82      0.85      0.84     34008
         1.0       0.91      0.89      0.90     56752

    accuracy                           0.88     90760
   macro avg       0.87      0.87      0.87     90760
weighted avg       0.88      0.88      0.88     90760



In [46]:
# STEP 2 - Tuning max_depth and min_child_weight

parameters_test = {
    'max_depth':range(3,7,2),
    'min_child_weight':range(1,7,2)
}

gsearch2 = XGB_tuning (LR = LearningRate, n_est = n_estimators, param_test=parameters_test)
best_max_depth, best_min_child_weight = gsearch2.best_params_['max_depth'], gsearch2.best_params_['min_child_weight']
print(f"{gsearch2.best_params_} CV_Score: {gsearch2.best_score_}")

# Evaluation
XGB(LR = LearningRate, n_est = n_estimators, max_d=best_max_depth, min_c= best_min_child_weight)

{'max_depth': 3, 'min_child_weight': 5} CV_Score: 0.9138881713274564
performance over the training set: 0.9145638801610834
performance over the test set: 0.9011386840486147

              precision    recall  f1-score   support

         0.0       0.82      0.86      0.84     34008
         1.0       0.91      0.89      0.90     56752

    accuracy                           0.88     90760
   macro avg       0.87      0.87      0.87     90760
weighted avg       0.88      0.88      0.88     90760



In [47]:
# STEP 3 - Tuning Gamma 
parameters_test = { 
    'gamma':[i/10.0 for i in range(0,5)] 
}

gsearch3 = XGB_tuning (LR = LearningRate, n_est = n_estimators, max_d=best_max_depth, min_c= best_min_child_weight,
                      param_test=parameters_test)
best_gamma = gsearch3.best_params_['gamma']
print(f"{gsearch3.best_params_} CV_Score: {gsearch3.best_score_}")

# Evaluation
XGB(LR = LearningRate, n_est = n_estimators, max_d = best_max_depth, min_c = best_min_child_weight, gm = best_gamma)

{'gamma': 0.0} CV_Score: 0.9139307459555347
performance over the training set: 0.9145638801610834
performance over the test set: 0.9011386840486147

              precision    recall  f1-score   support

         0.0       0.82      0.86      0.84     34008
         1.0       0.91      0.89      0.90     56752

    accuracy                           0.88     90760
   macro avg       0.87      0.87      0.87     90760
weighted avg       0.88      0.88      0.88     90760



In [48]:
# STEP 4 - Tuning colsample_bytree and subsaample
parameters_test = {
    'colsample_bytree':[i/10.0 for i in range(7,11)],
    'subsample':[i/10.0 for i in range(7,11)]
}

gsearch4 = XGB_tuning (LR = LearningRate, n_est = n_estimators, max_d = best_max_depth, min_c = best_min_child_weight,
                      gm = best_gamma, param_test=parameters_test)
best_colsample_bytree, best_subsample = gsearch4.best_params_['colsample_bytree'], gsearch4.best_params_['subsample']
print(f"{gsearch4.best_params_} CV_Score: {gsearch4.best_score_}")

# Evaluation
XGB(LR = LearningRate, n_est = n_estimators, max_d = best_max_depth, 
                       min_c = best_min_child_weight, gm = best_gamma,  colsample = best_colsample_bytree, 
                       subs = best_subsample)

{'colsample_bytree': 0.9, 'subsample': 0.8} CV_Score: 0.9140435480710711
performance over the training set: 0.9162213415295286
performance over the test set: 0.9010383706938812

              precision    recall  f1-score   support

         0.0       0.82      0.86      0.84     34008
         1.0       0.91      0.89      0.90     56752

    accuracy                           0.88     90760
   macro avg       0.87      0.87      0.87     90760
weighted avg       0.88      0.88      0.88     90760



In [None]:
# STEP 5 - Tuning Regularization Parameters
# Lambda L2 Regularization
# Ampha L1 Regularization

parameters_test = {
    'reg_lambda':[1e-2, 0.1, 0.5, 1, 2, 10],
    'reg_alpha':[1e-2, 0.1, 0.5, 1, 2, 10],
}

gsearch5 = XGB_tuning (LR = LearningRate, n_est = n_estimators, max_d = best_max_depth, 
                       min_c = best_min_child_weight, gm = best_gamma,  colsample = best_colsample_bytree, 
                       subs = best_subsample, param_test=parameters_test)

best_reg_lambda, best_reg_alpha = gsearch5.best_params_['reg_lambda'], gsearch5.best_params_['reg_alpha']
print(f"{gsearch5.best_params_} CV_Score: {gsearch5.best_score_}")

In [None]:
# Closer Look
parameters_test = {
    'reg_lambda':[best_reg_lambda*0.8, best_reg_lambda, best_reg_lambda*1.2, best_reg_lambda*1.5],
    'reg_alpha':[best_reg_alpha*0.8, best_reg_alpha, best_reg_alpha*1.2, best_reg_alpha*1.5]
}

gsearch5 = XGB_tuning (LR = LearningRate, n_est = n_estimators, max_d = best_max_depth, min_c = best_min_child_weight,
                       gm = best_gamma,  colsample = best_colsample_bytree, subs = best_subsample, 
                       param_test=parameters_test)

best_reg_lambda, best_red_alpha = gsearch5.best_params_['reg_lambda'], gsearch5.best_params_['reg_alpha']
print(f"{gsearch5.best_params_} CV_Score: {gsearch5.best_score_}")

# Evaluation
XGB(LR = LearningRate, n_est = n_estimators, max_d = best_max_depth, min_c = best_min_child_weight,
                       gm = best_gamma,  colsample = best_colsample_bytree, subs = best_subsample, 
                       lambd = best_reg_lambda, alpha = best_reg_alpha)

In [None]:
 # STEP 6 - Reducing Learning Rate and Adding More Trees
n_estimators *= 5
parameters_test = {
    'learning_rate':[i/100.0 for i in range(2,10)]
}

gsearch6 = XGB_tuning (LR = LearningRate, n_est = n_estimators, max_d = best_max_depth, min_c = best_min_child_weight,
                       gm = best_gamma,  colsample = best_colsample_bytree, subs = best_subsample, 
                       lambd = best_reg_lambda, alpha = best_reg_alpha, param_test=parameters_test)

LearningRate = gsearch6.best_params_['learning_rate']
print(f"{gsearch6.best_params_} CV_Score: {gsearch6.best_score_}")


# Final Evaluation
XGB(LR = LearningRate, n_est = n_estimators, max_d = best_max_depth, min_c = best_min_child_weight,
                       gm = best_gamma,  colsample = best_colsample_bytree, subs = best_subsample, 
                       lambd = best_reg_lambda, alpha = best_reg_alpha)

# 6. Submission Prediction

In [None]:
# Load Testing File
X_submission = pd.read_csv('../data/raw/testing.txt', header = None, sep = ' ', names = ['node', 'target'])

### Networkx Feature Generation 

In [None]:
# 1) Networkx Features
X, X_submission = nxGenerateFeatures(X, X_submission, y)

# Save Processed Results
SaveData('../data/processed/networkx/', X, X_submission, y)

In [None]:
# Load Networkx Processed Data
X, X_submission, y = LoadData ('../data/processed/networkx/')

------
### Node2Vec Feature Generation 

In [None]:
# 2) Node2Vec Features
try:
    X, X_submission = node2VecGenerateFeatures(X, X_submission, y, loadPath = '../results/models/node2vec/emb1.model')
except Exception:
    X, X_submission = node2VecGenerateFeatures(X, X_submission, y, savePath = '../results/models/node2vec/emb1.model', workers = 12)
    pass

# Save Processed Results
SaveData('../data/processed/node2vec/', X, X_submission, y)

In [None]:
# Load Node2Vec Processed Data
X, X_submission, y = LoadData ('../data/processed/node2vec/')

-----
### Doc2Vec Feature Generation

-------------------------
###  Classification Model

In [None]:
# Data preparation

# could be 'or' since these columns are always removed together
if 'node' and 'target' in X.columns:
    X = X.drop(columns = ['node', 'target'])
    
if 'node' and 'target' in X_submission.columns:
    X_submission = X_submission.drop(columns = ['node', 'target'])

In [None]:
# Save Results
result_file_name = '../results/predictions/predictions.csv'
pd.DataFrame(y_pred, columns = ['predicted']).to_csv(result_file_name, sep=',', index=True, index_label='id')

In [None]:
# Verify Results
submission = pd.read_csv(result_file_name, sep=',')
submission

In [None]:
##### REFERENCES ####
# https://cs.stanford.edu/~jure/pubs/node2vec-kdd16.pdf