In [105]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
from collections import Counter
import pandas as pd
import sys
import datetime
import time
import csv
import pathlib
import os

In [107]:

def KFold_Validation(X, y, nSplits = 5):
    
    # Initialize the K-fold cross-validator
    kf = KFold(n_splits = nSplits, shuffle = True, random_state = 42)
    
    Testing_Results = {}

    for i, (train_index, test_index) in enumerate(kf.split(X)):
        
        # Split the data into training and testing sets for this fold
        print("KFlod : ", i)
        X_t, X_test = X.iloc[train_index], X.iloc[test_index]
        y_t, y_test = y.iloc[train_index], y.iloc[test_index]

        smo = SMOTE(random_state=42)
        X_train, y_train = smo.fit_resample(X_t, y_t)
        
        # Train the model 
        model = train(X_train, y_train)
    
        # Test the model
        precision, recall, f1 = test(X_test, y_test, model)
        
        # Store the results
        Testing_Results[i] = (precision, recall, f1)
        
    return Testing_Results

In [108]:
'''
    Change the train method according to classifier used
'''

def train(X_train, y_train):
    """
    Use grid search to find optimal parameters and then test the optimal model
    :param X_train:
    :param y_train:
    :return: None
    """
    # grid search parameter list
    tuned_parameters = {
            "n_neighbors": range(1, 20),
            "weights": ['uniform', 'distance']
    }
    # generate model
    print("Start training : " + "\n")
    grid = GridSearchCV(KNeighborsClassifier(), tuned_parameters, cv=5, scoring='roc_auc', verbose=2, n_jobs=4)
    # Send data to model training
    model = grid.fit(X_train, y_train)
    return model

In [109]:
def test(X_test, y_test, estimator):
    
    y_pre = estimator.predict(X_test)
    precision = precision_score(y_test, y_pre, labels=None, pos_label=1, average='binary', sample_weight=None)
    recall = recall_score(y_test, y_pre, labels=None, pos_label=1, average='binary', sample_weight=None)
    f1 = f1_score(y_test, y_pre, labels=None, pos_label=1, average='binary', sample_weight=None)

    # precision = precision_score(y_test, y_pre, labels=None, average='macro', sample_weight=None)
    # recall = recall_score(y_test, y_pre, labels=None, average='macro', sample_weight=None)
    # f1 = f1_score(y_test, y_pre, labels=None, average='macro', sample_weight=None)


    print("precision: {:.3}, recall: {:.3}, f1:{:.3}, \n".format(precision, recall, f1))
          
    return precision, recall, f1

## Automation

In [110]:
''' 
    Change the input path accordingly
'''

train_dir = r"C:\Users\manoh\Documents\Project_DSCI\rems_modified\Training_CSV"

Final_Results = {}
for Code_Emb in os.listdir(train_dir):
    print(Code_Emb)
    Final_Results[Code_Emb] = {}
    
    for Tree_Emb in os.listdir(train_dir+"\\"+Code_Emb):
        print("\t"+Tree_Emb.split(".")[0])
        
        curr_dir = train_dir+"\\"+Code_Emb+"\\"+Tree_Emb
        
        data = pd.read_csv(curr_dir, header=None)
        
        X = data.iloc[:, :-1]
        y = data.iloc[:, -1]

        # Get the current time
        start_time = time.time()

        # Execute some code you want to time
        Results = KFold_Validation(X, y, nSplits = 10)

        # Get the elapsed time
        elapsed_time = round((time.time() - start_time)/60, 3)

        print("Elapsed time: ", elapsed_time, " Minutes")
        
        results_ = pd.DataFrame(Results, index=["Precision", "Recall", "F1"]).T
        
    
        Final_Results[Code_Emb][Tree_Emb.split(".")[0]] = dict(results_.describe().loc["mean"])
        
        


CodeBERT
	deepwalk_cg
KFlod :  0
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.176, recall: 0.6, f1:0.273, 

KFlod :  1
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.333, recall: 0.8, f1:0.471, 

KFlod :  2
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.172, recall: 0.556, f1:0.263, 

KFlod :  3
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.04, recall: 1.0, f1:0.0769, 

KFlod :  4
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.25, recall: 0.636, f1:0.359, 

KFlod :  5
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.115, recall: 1.0, f1:0.207, 

KFlod :  6
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.1, recall: 0.5, f1:0.167, 

KFlod :  7
Start training : 

Fi

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.0667, recall: 0.2, f1:0.1, 

Elapsed time:  3.003  Minutes
	walklets_cg
KFlod :  0
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.0725, recall: 0.833, f1:0.133, 

KFlod :  1
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.106, recall: 0.875, f1:0.189, 

KFlod :  2
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.169, recall: 1.0, f1:0.29, 

KFlod :  3
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.134, recall: 1.0, f1:0.237, 

KFlod :  4
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.0492, recall: 0.75, f1:0.0923, 

KFlod :  5
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.0714, recall: 1.0, f1:0.133, 

KFlod :  6
Start training : 

Fitti

precision: 0.1, recall: 1.0, f1:0.182, 

KFlod :  8
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.05, recall: 1.0, f1:0.0952, 

KFlod :  9
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.0606, recall: 0.857, f1:0.113, 

Elapsed time:  2.463  Minutes
	sdne_cg
KFlod :  0
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.143, recall: 0.875, f1:0.246, 

KFlod :  1
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.125, recall: 0.625, f1:0.208, 

KFlod :  2
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.19, recall: 0.615, f1:0.291, 

KFlod :  3
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.0833, recall: 1.0, f1:0.154, 

KFlod :  4
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision

precision: 0.0465, recall: 0.5, f1:0.0851, 

KFlod :  7
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.179, recall: 1.0, f1:0.303, 

KFlod :  8
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.0526, recall: 0.6, f1:0.0968, 

KFlod :  9
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.102, recall: 0.714, f1:0.179, 

Elapsed time:  2.785  Minutes
	prone_cg
KFlod :  0
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.12, recall: 1.0, f1:0.215, 

KFlod :  1
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.115, recall: 0.9, f1:0.205, 

KFlod :  2
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.103, recall: 1.0, f1:0.188, 

KFlod :  3
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision:

precision: 0.0294, recall: 1.0, f1:0.0571, 

KFlod :  6
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.0392, recall: 1.0, f1:0.0755, 

KFlod :  7
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.098, recall: 1.0, f1:0.179, 

KFlod :  8
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.049, recall: 1.0, f1:0.0935, 

KFlod :  9
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.0686, recall: 1.0, f1:0.128, 

Elapsed time:  2.831  Minutes
	node2vec_cg
KFlod :  0
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.132, recall: 0.5, f1:0.208, 

KFlod :  1
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.25, recall: 0.8, f1:0.381, 

KFlod :  2
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precisi

precision: 0.357, recall: 0.625, f1:0.455, 

KFlod :  5
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.571, recall: 0.444, f1:0.5, 

KFlod :  6
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.2, recall: 0.5, f1:0.286, 

KFlod :  7
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.286, recall: 0.4, f1:0.333, 

KFlod :  8
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.15, recall: 0.75, f1:0.25, 

KFlod :  9
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.167, recall: 0.4, f1:0.235, 

Elapsed time:  2.678  Minutes
	line_cg
KFlod :  0
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.0971, recall: 1.0, f1:0.177, 

KFlod :  1
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.097

precision: 0.027, recall: 1.0, f1:0.0526, 

KFlod :  4
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.25, recall: 0.636, f1:0.359, 

KFlod :  5
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.0588, recall: 0.667, f1:0.108, 

KFlod :  6
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.0741, recall: 0.5, f1:0.129, 

KFlod :  7
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.211, recall: 0.8, f1:0.333, 

KFlod :  8
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.0667, recall: 0.4, f1:0.114, 

KFlod :  9
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.147, recall: 0.714, f1:0.244, 

Elapsed time:  3.123  Minutes
	grarep_cg
KFlod :  0
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
preci

precision: 0.13, recall: 1.0, f1:0.23, 

KFlod :  3
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.12, recall: 1.0, f1:0.214, 

KFlod :  4
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.0541, recall: 1.0, f1:0.103, 

KFlod :  5
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.0597, recall: 1.0, f1:0.113, 

KFlod :  6
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.0735, recall: 0.833, f1:0.135, 

KFlod :  7
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.0533, recall: 0.8, f1:0.1, 

KFlod :  8
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.0857, recall: 0.857, f1:0.156, 

KFlod :  9
Start training : 

Fitting 5 folds for each of 38 candidates, totalling 190 fits
precision: 0.027, recall: 1.0, f1:0.0526, 

Elapsed

## Final Output File

In [111]:
df = pd.DataFrame(Final_Results)

dic = {}
i = 0
for code_emb in df.columns:
    for tree_emb in df.index:
        dic[i] = (code_emb, tree_emb, df.loc[tree_emb, code_emb]['Precision'], df.loc[tree_emb, code_emb]['Recall'], df.loc[tree_emb, code_emb]['F1'])
        i+=1
output = pd.DataFrame(dic, index = ["CodeGraph", "TreeGraph", "Prec", "Recall", "F1"]).T

output.to_csv("KNN_Results.csv")

In [112]:
output

Unnamed: 0,CodeGraph,TreeGraph,Prec,Recall,F1
0,CodeBERT,deepwalk_cg,0.168555,0.71062,0.26086
1,CodeBERT,grarep_cg,0.325304,0.576587,0.388762
2,CodeBERT,line_cg,0.068342,1.0,0.12617
3,CodeBERT,node2vec_cg,0.155878,0.659509,0.240136
4,CodeBERT,prone_cg,0.104026,0.660931,0.17418
5,CodeBERT,sdne_cg,0.195731,0.402477,0.244336
6,CodeBERT,walklets_cg,0.08546,0.878214,0.153777
7,CodeGPT,deepwalk_cg,0.122248,0.679711,0.199179
8,CodeGPT,grarep_cg,0.073851,0.902222,0.13553
9,CodeGPT,line_cg,0.068592,1.0,0.12659
