In [None]:
%pip install xgboost # (GPU supported)
%conda install numpy 
%conda install pandas
%conda install -c anaconda scikit-learn 
%conda install networkx
%conda install -c conda-forge node2vec 
%conda install -c conda-forge tpot
%conda install -c conda-forge ipywidgets
%conda install pytorch torchvision -c pytorch

In [1]:
import os
import sys
import numpy as np 
import pandas as pd
import networkx as nx
import scipy

sys.path.insert(0, '../src')
import file_io

# Summary

This notebook consists of an explorative analysis of the problem of predicting links in web pages. Input data:  
* pairs of pages (two nodes in a graph) and a boolean variable indicating if there is a link (an edge) between them.  
* text of all pages

This way the strategy to approach this problem is first to extract as much relevant information as possible from the inputs, that is, to engineer features for the graph and for the text of the pages, and then to train and tune a classification model.  The problem is thus divided:

##### Feature Engineering
1. Networkx Link Prediction features
2. TF-IDF text features
3. Doc2Vec embedding features


##### Classification Models
4. Miscellaneous Classifiers
5. XGBoost

##### Prediction
6. Submission prediction



#### Load Initial Data 

In [None]:
links = pd.read_csv('../data/raw/training.txt', header = None, sep = ' ', names = ['node', 'target', 'edge'])
X = links[['node', 'target']]
y = links['edge']

## 1. Networkx Link Prediction Features

### Feature Engineering
1.1 Split the dataset <br/>
1.2 Create graph connections with the training set <br/>
1.3 Predict new coefficients for training and test set (feature generation)



In [11]:
def CreateGraph (X, y, directed = False):
    
    if directed:
        G = nx.from_pandas_edgelist(X, 'node', 'target', create_using=nx.DiGraph())
    else:    
        G = nx.from_pandas_edgelist(X[y == 1], 'node', 'target', create_using=nx.Graph())
        
    G.add_nodes_from(range(33226))
    
    return G

In [None]:
def AppendNextworkxFeature (function, G, X):
    
    column_name = str(function).split()[1]
    
    tuple_list = list(X[['node','target']].itertuples(index=False, name=None))
    coef_generator = function(G, tuple_list)
    coef_df = pd.DataFrame(coef_generator)
    X.insert(2, column_name, list(coef_df[2]), allow_duplicates = True)


In [None]:
def nxGenerateFeatures (X_train, X_test, y_train):
    
    # 1.2 Create Graph
    # Total number of nodes=pages: 33.226
    G = CreateGraph (X_train, y_train)
    
    # 1.3 Predict new coefficient/feature for defined link prediction function
    linkPredictionFunctions = [nx.resource_allocation_index, nx.jaccard_coefficient, nx.adamic_adar_index, nx.preferential_attachment]
    for function in linkPredictionFunctions:
        AppendNextworkxFeature(function, G, X_train)
        AppendNextworkxFeature(function, G, X_test)
        
    return X_train, X_test

In [None]:
# 1.1 Stratified Split 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
# 1.2 and 1.3 Generate features
X_train, X_test = nxGenerateFeatures (X_train, X_test, y_train)

# Save Networkx Intermediate Results
file_io.SaveData('../data/intermediate/networkx/', X_train, X_test, y_train, y_test)

In [167]:
# Load Networkx Intermediate Results
X_train, X_test, y_train, y_test = file_io.LoadData('../data/intermediate/networkx/')

In [168]:
X_train

Unnamed: 0,node,target,preferential_attachment,adamic_adar_index,jaccard_coefficient,resource_allocation_index
0,9154,9156,176,0.276938,0.038462,0.027027
1,29190,8765,7596,0.291149,0.003110,0.002704
2,3239,16566,468,0.402430,0.006329,0.083333
3,1294,3248,1666,0.000000,0.000000,0.000000
4,21519,32895,1,0.000000,0.000000,0.000000
...,...,...,...,...,...,...
363032,1297,326,540,0.000000,0.000000,0.000000
363033,4688,24489,549,0.264754,0.029412,0.001621
363034,6839,15644,21,0.000000,0.000000,0.000000
363035,2344,12969,35,0.000000,0.000000,0.000000


# 2) TF-IDF

### Applying Truncated SVD on tf-idf embedding

In [169]:
from sklearn.decomposition import TruncatedSVD 

In [170]:
def saveTruncatedSVD(sparseMatrixPath = "../results/models/tf-idf/emb_matrix.npz"):
    
    try:
        tf_idf = scipy.sparse.load_npz(sparseMatrixPath)  
    except Exception:
        print("TF-IDF model not saved, please run TextFeatures.ipynb")
        raise
        
    svd = TruncatedSVD(n_components=100, n_iter=100, random_state=42)
    svd.fit(tf_idf)

    svd_matrix = svd.transform(tf_idf)
    pd.DataFrame(svd_matrix).to_csv("../results/models/tf-idf/svd.csv", sep=',', index=False, header=False)

In [None]:
saveTruncatedSVD()

### Feature Generation

In [171]:
from sklearn.metrics.pairwise import cosine_similarity as CS
import similarity

In [172]:
def appendCosineSimilarity (X_train, X_test, embedding, column_name = ''):
    
    cosine_similarity_train = [ CS(embedding[i].reshape(1, -1), embedding[j].reshape(1, -1))[0][0] for i, j in zip(X_train['node'], X_train['target']) ]
    cosine_similarity_test  = [ CS(embedding[i].reshape(1, -1), embedding[j].reshape(1, -1))[0][0] for i, j in zip(X_test['node'], X_test['target']) ]
    
    X_train[column_name] = cosine_similarity_train
    X_test[column_name] = cosine_similarity_test
    
    return X_train, X_test

In [173]:
def appendTS_SS(X_train, X_test, embedding, column_name = '', mult_factor = 1):
    
    ts_ss = similarity.TS_SS()
    ts_score_train = [ ts_ss(embedding[i], embedding[j])*mult_factor for i, j in zip(X_train['node'], X_train['target']) ]
    ts_score_test = [ ts_ss(embedding[i], embedding[j])*mult_factor for i, j in zip(X_test['node'], X_test['target']) ]
    
    X_train[column_name] = ts_score_train
    X_test[column_name] = ts_score_test
    
    X_train = X_train.fillna(value=0)
    X_test = X_test.fillna(value=0)
        
    return X_train, X_test

In [174]:
# dimension with most variance by a large amount (17%)
def appendDimensionZero(X_train, X_test, embedding, column_name = ''):
    
    dim0_train = [ [embedding[i][0], embedding[j][0]] for i, j in zip(X_train['node'], X_train['target']) ]
    dim0_test = [ [embedding[i][0], embedding[j][0]] for i, j in zip(X_test['node'], X_test['target']) ]
    
    X_train = X_train.join(pd.DataFrame(dim0_train)).rename(columns={0: column_name+'dim0_node', 1: column_name+'dim0_target'})
    X_test = X_test.join(pd.DataFrame(dim0_test)).rename(columns={0: column_name+'dim0_node', 1: column_name+'dim0_target'})
        
    return X_train, X_test

In [177]:
def tfidfGenerateFeatures (X_train, X_test):
    
    
    try:
        tf_idf = scipy.sparse.load_npz("../results/models/tf-idf/emb_matrix.npz")  
    except Exception:
        print("TF-IDF model not saved, please run TextFeatures.ipynb")
        raise
    
    X_train, X_test = appendCosineSimilarity(X_train, X_test, embedding=tf_idf, column_name='cosine_sim')
    
    print('generated cosine similarity')
    
    
    try:
        svd_matrix = np.genfromtxt("../results/models/tf-idf/svd.csv", delimiter=',')
    except Exception:
        saveTruncatedSVD()
        svd_matrix = np.genfromtxt("../results/models/tf-idf/svd.csv", delimiter=',')
        pass
    
    X_train, X_test = appendTS_SS(X_train, X_test, embedding=svd_matrix, column_name='ts-ss_sim', mult_factor = 1e5)
    print('generated ts ss')
    X_train, X_test = appendDimensionZero(X_train, X_test, embedding=svd_matrix)
    
    
    
    return X_train, X_test

In [186]:
X_train, X_test = tfidfGenerateFeatures(X_train, X_test)

# Save tf-idf Intermediate Results
file_io.SaveData('../data/intermediate/tf-idf/', X_train, X_test, y_train, y_test)

In [69]:
# Load tf-idf Intermediate Data
X_train, X_test, y_train, y_test = file_io.LoadData('../data/intermediate/tf-idf/')

# 3. Doc2Vec 

### Feature Engineering

2.1. Load doc2vec embedding model trained in 'TextFeatures' notebook  <br/>
2.2. Apply the model to train and test set (feature generation)

In [None]:
from gensim.models.doc2vec import Doc2Vec
import similarity

In [None]:
def doc2vecGenerateFeatures (X_train, X_test, loadPath):
    
    # 3.1 Load Model 
    model = Doc2Vec.load(loadPath) 
    
    # 3.2 Apply embedding to each element
    ts_ss = similarity.TS_SS()
    vec_train = [ ts_ss( model.infer_vector([str(i)]), model.infer_vector([str(j)]) ) for i, j in zip(X_train['node'], X_train['target']) ]
    vec_test = [ ts_ss( model.infer_vector([str(i)]), model.infer_vector([str(j)]) ) for i, j in zip(X_test['node'], X_test['target']) ]
        
    
    X_train = X_train.join(pd.DataFrame(vec_train))
    X_test = X_test.join(pd.DataFrame(vec_test))
    
    return X_train, X_test

In [None]:
X_train, X_test = doc2vecGenerateFeatures(X_train, X_test, loadPath="../results/models/doc2vec/vec100.model")

# Save Doc2Vec Intermediate Results
file_io.SaveData('../data/intermediate/doc2vec/', X_train, X_test, y_train, y_test)

In [None]:
# Load Doc2Vec Intermediate Data
X_train, X_test, y_train, y_test = file_io.LoadData('../data/intermediate/doc2vec/')

# 4. Miscellaneous Classifiers

#### Data Preparation

In [None]:
def Scale(X):
    scaled_features = StandardScaler().fit_transform(X.values)
    return pd.DataFrame(scaled_features, index = X.index, columns = X.columns)

In [None]:
# Data preparation

'''After some tests I realized that keeping node and target index number increases performance by a little amount'''
# if 'node' and 'target' in X_train.columns:
#     X_train = X_train.drop(columns = ['node', 'target'])
# if 'node' and 'target' in X_test.columns:
#     X_test = X_test.drop(columns = ['node', 'target'])
    

'''Better performance withou scaling'''
scale = False 
if scale:  
    X_train = Scale(X_train)
    X_test = Scale(X_test)

#### Ensemble Classifiers

In [None]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

##################################################################################################

##Classifiers
from sklearn.ensemble import AdaBoostClassifier #begins by fitting a classifier on the original dataset and then fits additional copies of the classifier on the same dataset but where the weights of incorrectly classified instances are adjusted
from sklearn.ensemble import BaggingClassifier #Bagging classifier fits base classifiers each on random subsets of the original dataset and aggregate their individual predictions
from sklearn.ensemble import ExtraTreesClassifier #Extremely Random Trees: This class implements a meta estimator that fits a number of randomized decision trees (a.k.a. extra-trees) on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting
from sklearn.ensemble import GradientBoostingClassifier #GB builds an additive model in a forward stage-wise fashion; it allows for the optimization of arbitrary differentiable loss functions
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier #Classifier implementing the k-nearest neighbors vote.
from sklearn.ensemble import VotingClassifier

In [None]:
# Classification Models
print("Summary for classifiers:")

clf = [
            [AdaBoostClassifier(), "AdaBoostClassifier"],
            [BaggingClassifier(), "BaggingClassifier"],
            [ExtraTreesClassifier(), "ExtraTreesClassifier"],
            [GradientBoostingClassifier(), "GradientBoostClassifier"],
            [DecisionTreeClassifier(), "DecisionTreeClassifier"],
            [RandomForestClassifier(), "RandomForestClassifier"]
        ]

performance_train = {}
performance_test = {}
    
for classifier, clf_name in clf: performance_train[clf_name] = []
for classifier, clf_name in clf: performance_test[clf_name] = []

    
for elem in clf: #Use each classifier in clf
    classifier = elem[0]
    classifier_name = elem[1]
    print(classifier_name)
        
    try:    
        classifier.fit(X_train, y_train)
            
        y_hat = classifier.predict(X_train)
        #Train Scores:
        f1_train = f1_score(y_train, y_hat)
        accuracy_train = accuracy_score(y_train, y_hat)
        precision_train = precision_score(y_train, y_hat)
        recall_train = recall_score(y_train, y_hat)
        roc_auc_train = roc_auc_score(y_train, y_hat)
        #Print train Scores
        print(f"Train scores: \nf1-score: {round(f1_train,3)}\tAccuracy: {round(accuracy_train, 3)}\tPrecision: {round(precision_train,3)}\tRecall: {round(recall_train,3)}\tROC-AUC: {round(roc_auc_train,3)}")
        #Sava train scors for comparison
        performance_train[classifier_name].append(f1_train)
        performance_train[classifier_name].append(accuracy_train)
        performance_train[classifier_name].append(precision_train)
        performance_train[classifier_name].append(recall_train)
        performance_train[classifier_name].append(roc_auc_train)
           
        y_pred = classifier.predict(X_test)
        #Test scores
        f1_test = f1_score(y_test, y_pred)
        accuracy_test = accuracy_score(y_test, y_pred)
        precision_test = precision_score(y_test, y_pred)
        recall_test = recall_score(y_test, y_pred)
        roc_auc_test = roc_auc_score(y_test, y_pred)
        #Print test scores          
        print(f"Test scores: \nf1-score: {round(f1_test,3)}\tAccuracy: {round(accuracy_test,3)}\tPrecision: {round(precision_test,3)}\tRecall: {round(recall_test,3)}\tROC-AUC: {round(roc_auc_test,3)}")
          #Save test scores
        performance_test[classifier_name].append(f1_test)
        performance_test[classifier_name].append(accuracy_test)
        performance_test[classifier_name].append(precision_test)
        performance_test[classifier_name].append(recall_test)
        performance_test[classifier_name].append(roc_auc_test)

        print("\n**********************************************************************")
    except ImportError:
        print("Classifier \"" + classifier_name + "failed.")
print("End")

# 5) XGBoost

In [183]:
import xgboost

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

def XGB(LR = 0.1, n_est = 1000, max_d = 5, min_c = 1, gm = 0, colsample = 0.8, subs = 1, lambd = 0, alpha = 0):
   
    xgb_model = XGBClassifier( learning_rate=LR, 
                                            n_estimators=n_est,
                                            max_depth=max_d,
                                            min_child_weight=min_c,                         
                                            gamma=gm,
                                            colsample_bytree=colsample,
                                            subsample=subs,
                                            objective ='binary:logistic',                   
                                            reg_lambda=lambd,
                                            reg_alpha=alpha,
                                            scale_pos_weight = 1,
                                            tree_method='gpu_hist',
                                            seed=42)


    xgb_model.fit(X_train,y_train)
    y_pred_train = xgb_model.predict(X_train)
    y_pred = xgb_model.predict(X_test)


    print('performance over the training set: ' + str(f1_score(y_train, y_pred_train)))
    print('performance over the test set: ' + str(f1_score(y_test, y_pred)) + '\n')
    print(classification_report(y_test, y_pred))
    
    
    
def XGB_tuning (LR = 0.1, n_est = 1000, max_d = 5, min_c = 1, gm = 0, colsample = 0.8, subs = 1,
                lambd = 0, alpha = 0, param_test = {'learning_rate':[i/100.0 for i in range(5,20,2)]} ):
    
    gsearch = GridSearchCV(estimator = XGBClassifier( learning_rate=LR, 
                                                    n_estimators=n_est,
                                                    max_depth=max_d,
                                                    min_child_weight=min_c,                         
                                                    gamma=gm,
                                                    colsample_bytree=colsample,
                                                    subsample = subs,
                                                    objective ='binary:logistic',
                                                    reg_lambda=lambd,
                                                    reg_alpha=alpha,
                                                    scale_pos_weight = 1,
                                                    tree_method='gpu_hist',
                                                    seed=42), 
                            param_grid = param_test, 
                            scoring='f1',
                            n_jobs=4, 
                            cv=3)


    gsearch.fit(X_train, y_train)
    return gsearch

In [184]:
# STEP 1 - First XGB - A little bit overfitted, but could improve more the training set
LearningRate = 0.1
n_estimators = 1000

# Fix Learning Rate and n_estimators
XGB(LearningRate, n_estimators)

performance over the training set: 0.949697675391195
performance over the test set: 0.9323741772960316

              precision    recall  f1-score   support

         0.0       0.88      0.90      0.89     34008
         1.0       0.94      0.93      0.93     56752

    accuracy                           0.92     90760
   macro avg       0.91      0.91      0.91     90760
weighted avg       0.92      0.92      0.92     90760



In [187]:
# STEP 2 - Tuning max_depth and min_child_weight

parameters_test = {
    'max_depth':range(3,7,2),
    'min_child_weight':range(1,7,2)
}

gsearch2 = XGB_tuning (LR = LearningRate, n_est = n_estimators, param_test=parameters_test)
best_max_depth, best_min_child_weight = gsearch2.best_params_['max_depth'], gsearch2.best_params_['min_child_weight']
print(f"{gsearch2.best_params_} CV_Score: {gsearch2.best_score_}")

# Evaluation
XGB(LR = LearningRate, n_est = n_estimators, max_d=best_max_depth, min_c= best_min_child_weight)

In [159]:
# STEP 3 - Tuning Gamma 
parameters_test = { 
    'gamma':[i/10.0 for i in range(0,5)] 
}

gsearch3 = XGB_tuning (LR = LearningRate, n_est = n_estimators, max_d=best_max_depth, min_c= best_min_child_weight,
                      param_test=parameters_test)
best_gamma = gsearch3.best_params_['gamma']
print(f"{gsearch3.best_params_} CV_Score: {gsearch3.best_score_}")

# Evaluation
XGB(LR = LearningRate, n_est = n_estimators, max_d = best_max_depth, min_c = best_min_child_weight, gm = best_gamma)

{'gamma': 0.1} CV_Score: 0.9453991370659339
performance over the training set: 0.953789735362048
performance over the test set: 0.9263614290669442

              precision    recall  f1-score   support

         0.0       0.90      0.85      0.87     34008
         1.0       0.91      0.94      0.93     56752

    accuracy                           0.91     90760
   macro avg       0.90      0.89      0.90     90760
weighted avg       0.91      0.91      0.91     90760



In [160]:
# STEP 4 - Tuning colsample_bytree and subsaample
parameters_test = {
    'colsample_bytree':[i/10.0 for i in range(7,11)],
    'subsample':[i/10.0 for i in range(7,11)]
}

gsearch4 = XGB_tuning (LR = LearningRate, n_est = n_estimators, max_d = best_max_depth, min_c = best_min_child_weight,
                      gm = best_gamma, param_test=parameters_test)
best_colsample_bytree, best_subsample = gsearch4.best_params_['colsample_bytree'], gsearch4.best_params_['subsample']
print(f"{gsearch4.best_params_} CV_Score: {gsearch4.best_score_}")

# Evaluation
XGB(LR = LearningRate, n_est = n_estimators, max_d = best_max_depth, 
                       min_c = best_min_child_weight, gm = best_gamma,  colsample = best_colsample_bytree, 
                       subs = best_subsample)

{'colsample_bytree': 0.7, 'subsample': 0.9} CV_Score: 0.9456485859903755
performance over the training set: 0.9538318465498397
performance over the test set: 0.9256015040851159

              precision    recall  f1-score   support

         0.0       0.90      0.85      0.87     34008
         1.0       0.91      0.94      0.93     56752

    accuracy                           0.91     90760
   macro avg       0.90      0.89      0.90     90760
weighted avg       0.91      0.91      0.90     90760



In [161]:
# STEP 5 - Tuning Regularization Parameters
# Lambda L2 Regularization
# Ampha L1 Regularization

parameters_test = {
    'reg_lambda':[1e-2, 0.1, 0.5, 1, 2, 10],
    'reg_alpha':[1e-2, 0.1, 0.5, 1, 2, 10],
}

gsearch5 = XGB_tuning (LR = LearningRate, n_est = n_estimators, max_d = best_max_depth, 
                       min_c = best_min_child_weight, gm = best_gamma,  colsample = best_colsample_bytree, 
                       subs = best_subsample, param_test=parameters_test)

best_reg_lambda, best_reg_alpha = gsearch5.best_params_['reg_lambda'], gsearch5.best_params_['reg_alpha']
print(f"{gsearch5.best_params_} CV_Score: {gsearch5.best_score_}")

{'reg_alpha': 0.5, 'reg_lambda': 0.5} CV_Score: 0.9454935526695749


In [162]:
# Closer Look
parameters_test = {
    'reg_lambda':[best_reg_lambda*0.8, best_reg_lambda, best_reg_lambda*1.2, best_reg_lambda*1.5],
    'reg_alpha':[best_reg_alpha*0.8, best_reg_alpha, best_reg_alpha*1.2, best_reg_alpha*1.5]
}

gsearch5 = XGB_tuning (LR = LearningRate, n_est = n_estimators, max_d = best_max_depth, min_c = best_min_child_weight,
                       gm = best_gamma,  colsample = best_colsample_bytree, subs = best_subsample, 
                       param_test=parameters_test)

best_reg_lambda, best_red_alpha = gsearch5.best_params_['reg_lambda'], gsearch5.best_params_['reg_alpha']
print(f"{gsearch5.best_params_} CV_Score: {gsearch5.best_score_}")

# Evaluation
XGB(LR = LearningRate, n_est = n_estimators, max_d = best_max_depth, min_c = best_min_child_weight,
                       gm = best_gamma,  colsample = best_colsample_bytree, subs = best_subsample, 
                       lambd = best_reg_lambda, alpha = best_reg_alpha)

{'reg_alpha': 0.4, 'reg_lambda': 0.4} CV_Score: 0.9454991058137155
performance over the training set: 0.9532942998992946
performance over the test set: 0.9257615475788801

              precision    recall  f1-score   support

         0.0       0.90      0.84      0.87     34008
         1.0       0.91      0.94      0.93     56752

    accuracy                           0.91     90760
   macro avg       0.90      0.89      0.90     90760
weighted avg       0.91      0.91      0.90     90760



In [163]:
 # STEP 6 - Reducing Learning Rate and Adding More Trees
n_estimators = 10000
parameters_test = {
    'learning_rate':[i/100.0 for i in range(1,10)]
}

gsearch6 = XGB_tuning (LR = LearningRate, n_est = n_estimators, max_d = best_max_depth, min_c = best_min_child_weight,
                       gm = best_gamma,  colsample = best_colsample_bytree, subs = best_subsample, 
                       lambd = best_reg_lambda, alpha = best_reg_alpha, param_test=parameters_test)

LearningRate = gsearch6.best_params_['learning_rate']
print(f"{gsearch6.best_params_} CV_Score: {gsearch6.best_score_}")


# Final Evaluation
XGB(LR = LearningRate, n_est = n_estimators, max_d = best_max_depth, min_c = best_min_child_weight,
                       gm = best_gamma,  colsample = best_colsample_bytree, subs = best_subsample, 
                       lambd = best_reg_lambda, alpha = best_reg_alpha)

{'learning_rate': 0.03} CV_Score: 0.946415742107483
performance over the training set: 0.9659243311922072
performance over the test set: 0.9262006690122537

              precision    recall  f1-score   support

         0.0       0.90      0.85      0.87     34008
         1.0       0.91      0.94      0.93     56752

    accuracy                           0.91     90760
   macro avg       0.90      0.89      0.90     90760
weighted avg       0.91      0.91      0.91     90760



# 6. Submission Prediction

In [None]:
# Load Testing File
X_submission = pd.read_csv('../data/raw/testing.txt', header = None, sep = ' ', names = ['node', 'target'])

### Networkx Feature Generation 

In [None]:
# 1) Networkx Features
X, X_submission = nxGenerateFeatures(X, X_submission, y)

# Save Processed Results
SaveData('../data/processed/networkx/', X, X_submission, y)

In [3]:
# Load Networkx Processed Data
X, X_submission, y = file_io.LoadData('../data/processed/networkx/')

---
### TF-IDF Feature Generation

In [10]:
# 2) TF-IDF Features
X, X_submission = tfidfGenerateFeatures(X, X_submission)

# Save tf-idf Intermediate Results
file_io.SaveData('../data/processed/tf-idf/', X, X_submission, y)

  return np.dot(vec1, vec2.T)/(np.linalg.norm(vec1) * np.linalg.norm(vec2))
  return np.arccos(self.Cosine(vec1, vec2)) + np.radians(10)


In [32]:
# Load TF-IDF Intermediate Data
X, X_submission, y = file_io.LoadData('../data/processed/tf-idf/')

-----
### Doc2Vec Feature Generation

In [None]:
# 3) Doc2Vec Features
 X, X_submission = doc2vecGenerateFeatures(X, X_submission, loadPath="../results/models/doc2vec/vec100.model", similarity=True)

# Save Doc2Vec Intermediate Results
file_io.SaveData('../data/processed/doc2vec/',  X, X_submission, y)

In [None]:
# Load Doc2Vec Intermediate Data
X, X_submission, y = file_io.LoadData('../data/processed/doc2vec/')

-------------------------
###  Classification Model

In [19]:
param = { 'learning_rate': 0.04, 
                'n_estimators': 10000,
                'max_depth': 5,
                'min_child_weight': 1,                         
                'gamma': 0.1,
                'colsample_bytree': 1,
                'subsample': 0.8,
                'objective': 'binary:logistic',                   
                'reg_lambda': 1,
                'reg_alpha': 0.6,
                'scale_pos_weight': 1,
                'tree_method': 'gpu_hist',
                'seed': 42}

xgb_final_model = XGBClassifier(**param)


xgb_final_model.fit(X,y)
y_pred = xgb_final_model.predict(X_submission)
y_pred = y_pred.astype(int)

In [30]:
# Save Results
result_file_name = '../results/predictions/tf-idf_predictions.csv'
pd.DataFrame(y_pred, columns = ['predicted']).to_csv(result_file_name, sep=',', index=True, index_label='id')

In [31]:
# Verify Results
submission = pd.read_csv(result_file_name, sep=',')
submission

Unnamed: 0,id,predicted
0,0,1
1,1,1
2,2,0
3,3,0
4,4,1
...,...,...
113445,113445,0
113446,113446,0
113447,113447,0
113448,113448,1


In [None]:
##### REFERENCES ####
# https://cs.stanford.edu/~jure/pubs/node2vec-kdd16.pdf
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
# https://github.com/taki0112/Vector_Similarity