In [7]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
from collections import Counter
import pandas as pd
import sys
import datetime
import time
import csv
import pathlib
import os

In [8]:

def KFold_Validation(X, y, nSplits = 5):
    """
    Score the classifier with shuffled K-fold cross-validation.

    For every fold the training rows are oversampled with SMOTE, a model is
    fitted through ``train`` and then scored on the held-out rows through
    ``test``.

    :param X: feature table; must support positional ``.iloc`` row selection
    :param y: labels aligned with ``X``; must support ``.iloc`` as well
    :param nSplits: number of folds handed to ``KFold``
    :return: dict mapping the fold number to ``(precision, recall, f1)``
    """
    splitter = KFold(n_splits=nSplits, shuffle=True, random_state=42)
    fold_scores = {}

    for fold_no, (train_rows, held_out_rows) in enumerate(splitter.split(X)):
        print("KFlod : ", fold_no)

        # Only the training rows go through SMOTE; the held-out rows are
        # passed to ``test`` exactly as they were selected.
        oversampler = SMOTE(random_state=42)
        X_balanced, y_balanced = oversampler.fit_resample(
            X.iloc[train_rows], y.iloc[train_rows]
        )

        fitted_model = train(X_balanced, y_balanced)

        fold_precision, fold_recall, fold_f1 = test(
            X.iloc[held_out_rows], y.iloc[held_out_rows], fitted_model
        )
        fold_scores[fold_no] = (fold_precision, fold_recall, fold_f1)

    return fold_scores

In [12]:
'''
    Change the train method according to classifier used
'''

def train(X_train, y_train):
    """
    Grid-search a logistic-regression classifier on the given training data.

    :param X_train: training features handed to ``GridSearchCV.fit``
    :param y_train: training labels handed to ``GridSearchCV.fit``
    :return: the value returned by ``GridSearchCV.fit`` (the fitted search
             object, which is what callers use as the model)
    """
    # Regularisation strengths 0.01 .. 0.09 plus one extra value, 0.105.
    c_values = np.append(np.arange(0.01, 0.1, 0.01), [0.105])
    search_space = dict(
        C=c_values,
        max_iter=range(120, 201, 40),
        tol=[0.001, 0.01],
    )

    print("Start training : " + "\n")

    searcher = GridSearchCV(
        estimator=LogisticRegression(),
        param_grid=search_space,
        scoring='roc_auc',
        cv=5,
        n_jobs=4,
        verbose=2,
    )
    return searcher.fit(X_train, y_train)

In [13]:
def test(X_test, y_test, estimator):
    
    y_pre = estimator.predict(X_test)
    precision = precision_score(y_test, y_pre, labels=None, pos_label=1, average='binary', sample_weight=None)
    recall = recall_score(y_test, y_pre, labels=None, pos_label=1, average='binary', sample_weight=None)
    f1 = f1_score(y_test, y_pre, labels=None, pos_label=1, average='binary', sample_weight=None)

    # precision = precision_score(y_test, y_pre, labels=None, average='macro', sample_weight=None)
    # recall = recall_score(y_test, y_pre, labels=None, average='macro', sample_weight=None)
    # f1 = f1_score(y_test, y_pre, labels=None, average='macro', sample_weight=None)


    print("precision: {:.3}, recall: {:.3}, f1:{:.3}, \n".format(precision, recall, f1))
          
    return precision, recall, f1

## Automation

In [None]:
''' 
    Change the input path accordingly
'''

# Root folder: every entry is treated as a code-embedding directory, and every
# entry inside it is read as a headerless CSV whose last column is the label.
train_dir = r"C:\Users\manoh\Documents\Project_DSCI\rems_modified\Training_CSV"

# Final_Results[code embedding][tree embedding] -> mean Precision/Recall/F1
Final_Results = {}
for Code_Emb in os.listdir(train_dir):
    print(Code_Emb)
    Final_Results[Code_Emb] = {}

    # os.path.join instead of a hard-coded "\\" so the loop is not tied to
    # one platform's path separator.
    code_emb_dir = os.path.join(train_dir, Code_Emb)

    for Tree_Emb in os.listdir(code_emb_dir):
        # Result key: file name up to its first dot.
        tree_emb_name = Tree_Emb.split(".")[0]
        print("\t"+tree_emb_name)

        curr_dir = os.path.join(code_emb_dir, Tree_Emb)

        data = pd.read_csv(curr_dir, header=None)

        # All columns but the last are features; the last one is the label.
        X = data.iloc[:, :-1]
        y = data.iloc[:, -1]

        # Time the full cross-validation run for this file.
        start_time = time.time()

        Results = KFold_Validation(X, y, nSplits = 10)

        # Elapsed wall-clock time in minutes.
        elapsed_time = round((time.time() - start_time)/60, 3)

        print("Elapsed time: ", elapsed_time, " Minutes")

        # One row per fold, one column per metric.
        results_ = pd.DataFrame(Results, index=["Precision", "Recall", "F1"]).T

        # Only the per-metric mean over the folds is kept.
        Final_Results[Code_Emb][tree_emb_name] = dict(results_.mean())
        
        


CodeBERT
	deepwalk_cg
KFlod :  0
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision: 0.161, recall: 0.5, f1:0.244, 

KFlod :  1
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision: 0.308, recall: 0.8, f1:0.444, 

KFlod :  2
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision: 0.118, recall: 0.444, f1:0.186, 

KFlod :  3
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision: 0.0345, recall: 1.0, f1:0.0667, 

KFlod :  4
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision: 0.226, recall: 0.636, f1:0.333, 

KFlod :  5
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision: 0.136, recall: 1.0, f1:0.24, 

KFlod :  6
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision: 0.0909, recall: 0.5, f1:0.154, 

KFlod :  7
Start training :



precision: 0.158, recall: 0.75, f1:0.261, 

KFlod :  7
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision: 0.222, recall: 0.182, f1:0.2, 

KFlod :  8
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision: 0.167, recall: 0.4, f1:0.235, 

KFlod :  9
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision: 0.182, recall: 0.4, f1:0.25, 

Elapsed time:  23.066  Minutes
	walklets_cg
KFlod :  0
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision: 0.273, recall: 0.5, f1:0.353, 

KFlod :  1
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision: 0.312, recall: 0.625, f1:0.417, 

KFlod :  2
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision: 0.412, recall: 0.7, f1:0.519, 

KFlod :  3
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision

precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  6
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  7
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision: 0.333, recall: 0.3, f1:0.316, 

KFlod :  8
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision: 0.143, recall: 0.2, f1:0.167, 

KFlod :  9
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision: 0.222, recall: 0.286, f1:0.25, 

Elapsed time:  4.074  Minutes
	sdne_cg
KFlod :  0
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision: 0.25, recall: 0.5, f1:0.333, 

KFlod :  1
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


precision: 0.2, recall: 0.25, f1:0.222, 

KFlod :  2
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision: 0.333, recall: 0.231, f1:0.273, 

KFlod :  3
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


precision: 0.143, recall: 0.333, f1:0.2, 

KFlod :  4
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision: 0.214, recall: 0.6, f1:0.316, 

KFlod :  5
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


precision: 0.5, recall: 0.5, f1:0.5, 

KFlod :  6
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision: 0.143, recall: 0.5, f1:0.222, 

KFlod :  7
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision: 0.429, recall: 0.273, f1:0.333, 

KFlod :  8
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision: 0.167, recall: 0.4, f1:0.235, 

KFlod :  9
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision: 0.0, recall: 0.0, f1:0.0, 

Elapsed time:  6.652  Minutes
	walklets_cg
KFlod :  0
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision: 0.231, recall: 0.5, f1:0.316, 

KFlod :  1
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision: 0.208, recall: 0.625, f1:0.312, 

KFlod :  2
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision: 0.238

  _warn_prf(average, modifier, msg_start, len(result))


precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  5
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
precision: 0.0, recall: 0.0, f1:0.0, 

KFlod :  6
Start training : 

Fitting 5 folds for each of 60 candidates, totalling 300 fits


## Final Output File

In [None]:
# Flatten Final_Results ({code embedding: {tree embedding: mean scores}}) into
# one row per (code embedding, tree embedding) pair.
# The nested dict is walked directly: going through an intermediate DataFrame
# turns any pair that is missing for one code embedding into NaN, and indexing
# that NaN with ['Precision'] raises TypeError.
dic = {}
for code_emb, per_tree_scores in Final_Results.items():
    for tree_emb, scores in per_tree_scores.items():
        dic[len(dic)] = (code_emb, tree_emb, scores['Precision'], scores['Recall'], scores['F1'])

# orient="index" makes each dict entry a row, so the metric columns keep a
# numeric dtype instead of the object dtype a transposed frame would have.
output = pd.DataFrame.from_dict(dic, orient="index", columns=["CodeGraph", "TreeGraph", "Prec", "Recall", "F1"])

output.to_csv("LR_Results.csv")

In [None]:
# Show the summary table as this cell's result.
output