In [6]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
from collections import Counter
import pandas as pd
import sys
import datetime
import time
import csv
import pathlib
import os

In [7]:

def KFold_Validation(X, y, nSplits = 5):
    
    # Initialize the K-fold cross-validator
    kf = KFold(n_splits = nSplits, shuffle = True, random_state = 42)
    
    Testing_Results = {}

    for i, (train_index, test_index) in enumerate(kf.split(X)):
        
        # Split the data into training and testing sets for this fold
        print("KFlod : ", i)
        X_t, X_test = X.iloc[train_index], X.iloc[test_index]
        y_t, y_test = y.iloc[train_index], y.iloc[test_index]

        smo = SMOTE(random_state=42)
        X_train, y_train = smo.fit_resample(X_t, y_t)
        
        # Train the model 
        model = train(X_train, y_train)
    
        # Test the model
        precision, recall, f1 = test(X_test, y_test, model)
        
        # Store the results
        Testing_Results[i] = (precision, recall, f1)
        
    return Testing_Results

In [8]:
'''
    Change the train method according to classifier used
'''

def train(X_train, y_train):
    """
    :param X_train:
    :param y_train:
    :return: None
    """
    tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3], 'C': [100, 1000]},
                        {'kernel': ['linear'], 'C': [1, 10]}]
    print("Start training : " + "\n")
    grid = GridSearchCV(SVC(), tuned_parameters, cv=5, scoring='roc_auc', verbose=2, n_jobs=4)
    model = grid.fit(X_train, y_train)
    return model

In [9]:
def test(X_test, y_test, estimator):
    
    y_pre = estimator.predict(X_test)
    precision = precision_score(y_test, y_pre, labels=None, pos_label=1, average='binary', sample_weight=None)
    recall = recall_score(y_test, y_pre, labels=None, pos_label=1, average='binary', sample_weight=None)
    f1 = f1_score(y_test, y_pre, labels=None, pos_label=1, average='binary', sample_weight=None)

    # precision = precision_score(y_test, y_pre, labels=None, average='macro', sample_weight=None)
    # recall = recall_score(y_test, y_pre, labels=None, average='macro', sample_weight=None)
    # f1 = f1_score(y_test, y_pre, labels=None, average='macro', sample_weight=None)


    print("precision: {:.3}, recall: {:.3}, f1:{:.3}, \n".format(precision, recall, f1))
          
    return precision, recall, f1

## Automation

In [16]:
''' 
    Change the input path accordingly
'''

train_dir = r"C:\Users\manoh\Documents\Project_DSCI\rems_modified\Training_CSV"

Final_Results = {}
# for Code_Emb in os.listdir(train_dir):
for Code_Emb in ['PLBART']:
    
    print(Code_Emb)
        
    Final_Results[Code_Emb] = {}
    
    for Tree_Emb in os.listdir(train_dir+"\\"+Code_Emb):
        print("\t"+Tree_Emb.split(".")[0])
        
        if Tree_Emb == "deepwalk_cg.csv":
            continue
        
        curr_dir = train_dir+"\\"+Code_Emb+"\\"+Tree_Emb
        
        data = pd.read_csv(curr_dir, header=None)
        
        X = data.iloc[:, :-1]
        y = data.iloc[:, -1]

        # Get the current time
        start_time = time.time()

        # Execute some code you want to time
        Results = KFold_Validation(X, y, nSplits = 10)

        # Get the elapsed time
        elapsed_time = round((time.time() - start_time)/60, 3)

        print("Elapsed time: ", elapsed_time, " Minutes")
        
        results_ = pd.DataFrame(Results, index=["Precision", "Recall", "F1"]).T
        
    
        Final_Results[Code_Emb][Tree_Emb.split(".")[0]] = dict(results_.describe().loc["mean"])
        
        


PLBART
	deepwalk_cg
	grarep_cg
KFlod :  0
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits
precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  1
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits
precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  2
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  _warn_prf(average, modifier, msg_start, len(result))


precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  3
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  _warn_prf(average, modifier, msg_start, len(result))


precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  4
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  _warn_prf(average, modifier, msg_start, len(result))


precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  5
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  _warn_prf(average, modifier, msg_start, len(result))


precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  6
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  _warn_prf(average, modifier, msg_start, len(result))


precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  7
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits
precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  8
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  _warn_prf(average, modifier, msg_start, len(result))


precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  9
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits
precision: 0.0, recall: 0.0, f1:0.0, 

Elapsed time:  1.106  Minutes
	line_cg
KFlod :  0
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  _warn_prf(average, modifier, msg_start, len(result))


precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  1
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  _warn_prf(average, modifier, msg_start, len(result))


precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  2
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  _warn_prf(average, modifier, msg_start, len(result))


precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  3
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  _warn_prf(average, modifier, msg_start, len(result))


precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  4
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  _warn_prf(average, modifier, msg_start, len(result))


precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  5
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  _warn_prf(average, modifier, msg_start, len(result))


precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  6
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  _warn_prf(average, modifier, msg_start, len(result))


precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  7
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  _warn_prf(average, modifier, msg_start, len(result))


precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  8
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  _warn_prf(average, modifier, msg_start, len(result))


precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  9
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  _warn_prf(average, modifier, msg_start, len(result))


precision: 0.0, recall: 0.0, f1:0.0, 

Elapsed time:  1.354  Minutes
	node2vec_cg
KFlod :  0
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits
precision: 0.75, recall: 0.3, f1:0.429, 

KFlod :  1
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits
precision: 0.5, recall: 0.1, f1:0.167, 

KFlod :  2
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits
precision: 0.4, recall: 0.222, f1:0.286, 

KFlod :  3
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits
precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  4
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits
precision: 0.5, recall: 0.182, f1:0.267, 

KFlod :  5
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits
precision: 0.333, recall: 0.333, f1:0.333, 

KFlod :  6
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits
precision: 0.0, recall: 0.0, f1:0.0

  _warn_prf(average, modifier, msg_start, len(result))


precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  1
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits
precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  2
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits
precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  3
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  _warn_prf(average, modifier, msg_start, len(result))


precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  4
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  _warn_prf(average, modifier, msg_start, len(result))


precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  5
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits
precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  6
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits
precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  7
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits
precision: 0.5, recall: 0.1, f1:0.167, 

KFlod :  8
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  _warn_prf(average, modifier, msg_start, len(result))


precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  9
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits
precision: 0.0, recall: 0.0, f1:0.0, 

Elapsed time:  1.011  Minutes
	sdne_cg
KFlod :  0
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits
precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  1
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits
precision: 0.25, recall: 0.125, f1:0.167, 

KFlod :  2
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits
precision: 0.6, recall: 0.231, f1:0.333, 

KFlod :  3
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits
precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  4
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits
precision: 0.333, recall: 0.2, f1:0.25, 

KFlod :  5
Start training : 

Fitting 5 folds for each of 4 candidates, totalling 20 fits
precision: 0.2, recall: 0.125, f1:0.154, 

KFl

## Final Output File

In [17]:
df = pd.DataFrame(Final_Results)

dic = {}
i = 0
for code_emb in df.columns:
    for tree_emb in df.index:
        dic[i] = (code_emb, tree_emb, df.loc[tree_emb, code_emb]['Precision'], df.loc[tree_emb, code_emb]['Recall'], df.loc[tree_emb, code_emb]['F1'])
        i+=1
output = pd.DataFrame(dic, index = ["CodeGraph", "TreeGraph", "Prec", "Recall", "F1"]).T

output.to_csv("SVM_Results_2.csv")

In [18]:
output

Unnamed: 0,CodeGraph,TreeGraph,Prec,Recall,F1
0,PLBART,grarep_cg,0.0,0.0,0.0
1,PLBART,line_cg,0.0,0.0,0.0
2,PLBART,node2vec_cg,0.288333,0.142309,0.181429
3,PLBART,prone_cg,0.05,0.01,0.016667
4,PLBART,sdne_cg,0.183333,0.102168,0.12594
5,PLBART,walklets_cg,0.26,0.190833,0.209586
