In [23]:
from sklearn.naive_bayes import GaussianNB
from keras.models import Sequential
from keras.layers import Dense
import pandas as pd
from ipynb.fs.full.Pipeline import label_feature_data_and_merge
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
import time 
from imblearn.under_sampling import RandomUnderSampler

from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

In [6]:
def load_cache_data_and_preprocess_it(refactored_file_path, non_refactored_file_path):
    """Read two cached feather files, merge them, and split features from labels.

    Each file is loaded with ``pd.read_feather`` and re-indexed on its
    ``"index"`` column. The two frames are then handed to
    ``label_feature_data_and_merge`` (refactored frame first). As a side
    effect, the value counts of the ``"refactored"`` column are printed.

    Args:
        refactored_file_path: Feather file whose frame is passed as the first
            argument to ``label_feature_data_and_merge``.
        non_refactored_file_path: Feather file whose frame is passed as the
            second argument to ``label_feature_data_and_merge``.

    Returns:
        A ``(x, y)`` tuple where ``y`` is the ``"refactored"`` column of the
        merged frame and ``x`` is the merged frame without that column.
    """
    refactored_frame = pd.read_feather(refactored_file_path).set_index("index")
    non_refactored_frame = pd.read_feather(non_refactored_file_path).set_index("index")

    merged = label_feature_data_and_merge(refactored_frame, non_refactored_frame)
    labels = merged["refactored"]
    features = merged.drop(columns="refactored")

    # Show the class balance so the imbalance is visible right after loading.
    print(labels.value_counts())

    return (features, labels)

In [7]:
class GaussianNaiveBayesRefactoringModel:
    """Pairs a ``GaussianNB`` estimator factory with its tuning search space."""

    def feature_reduction(self) -> bool:
        """Return ``False`` unconditionally.

        Presumably signals that no feature reduction should be applied for
        this model -- confirm against the code that consumes this flag.
        """
        return False

    def params_to_tune(self):
        """Return the search space as a ``{parameter name: candidate values}`` dict."""
        var_smoothing_candidates = [1e-10, 1e-09, 1e-08, 1e-07, 1e-06, 1e-05]
        return {"var_smoothing": var_smoothing_candidates}

    def model(self, best_params=None):
        """Build a ``GaussianNB`` estimator.

        Args:
            best_params: Optional mapping; when given, its ``"var_smoothing"``
                entry is forwarded to ``GaussianNB``. When ``None``, the
                estimator is created with its default arguments.

        Returns:
            A new, unfitted ``GaussianNB`` instance.

        Raises:
            KeyError: If ``best_params`` is given without a
                ``"var_smoothing"`` key.
        """
        if best_params is None:
            return GaussianNB()
        return GaussianNB(var_smoothing=best_params["var_smoothing"])

In [131]:
# Method-level refactoring types. The cached feather file for a type is looked
# up under data/ by its lower-cased name with all whitespace removed.
# Each type is listed exactly once ("Extract And Move Method" used to appear twice).
METHOD_LEVEL_REFACTORINGS = [
    "Extract And Move Method",
    "Extract Method",
    "Inline Method",
    "Move Method",
    "Pull Up Method",
    "Push Down Method",
    "Rename Method",
    "Change Return Type",
    "Move And Inline Method",
    "Move And Rename Method",
    "Change Parameter Type",
    "Split Parameter",
    "Merge Parameter"]

# Refactoring type analysed in this run. Selected by name rather than by a
# positional index so the choice is readable and does not silently change
# when the list above is edited.
SELECTED_REFACTORING = "Merge Parameter"
if SELECTED_REFACTORING not in METHOD_LEVEL_REFACTORINGS:
    raise ValueError(
        f"Unknown refactoring type {SELECTED_REFACTORING!r}; "
        f"expected one of {METHOD_LEVEL_REFACTORINGS}"
    )

# e.g. "Merge Parameter" -> 'data/mergeparameter.ftr'
refactored_file_path = ''.join(['data/', ''.join(SELECTED_REFACTORING.lower().split()), '.ftr'])
# NOTE(review): machine-specific absolute path; it only resolves on the original
# author's machine. Consider pointing it at the same relative data/ directory
# used above once the file location relative to the notebook is confirmed.
non_refactored_file_path = '/Users/sanjaydutt/Documents/CSC503_project/data/nonrefactoreddata.ftr'

In [132]:
# Load both cached feature files and split the merged frame into
# features (x) and the "refactored" label column (y).
x, y = load_cache_data_and_preprocess_it(
    refactored_file_path,
    non_refactored_file_path,
)

0    66803
1     1213
Name: refactored, dtype: int64


In [133]:
# Balance the classes by randomly discarding majority-class rows.
# random_state is fixed so the same rows are kept on every run; without it the
# resampled set (and every metric computed downstream) changes between runs
# even though the later train/test split is seeded.
# NOTE: despite the "_over" suffix these variables hold the *under*-sampled
# data; the names are kept because later cells reference them.
undersample = RandomUnderSampler(sampling_strategy='majority', random_state=42)
X_over, y_over = undersample.fit_resample(x, y)

print(y_over.value_counts())

0    1213
1    1213
Name: refactored, dtype: int64


In [134]:
# Hold out 20% of the resampled data for evaluation; the fixed seed makes the
# split repeatable for a given input.
X_train, X_test, y_train, y_test = train_test_split(
    X_over,
    y_over,
    test_size=0.20,
    random_state=42,
)

In [135]:
def get_naive_bayes_random_cv(random_state=42):
    """Build an unfitted randomized hyper-parameter search for Gaussian Naive Bayes.

    The estimator and its search space come from
    ``GaussianNaiveBayesRefactoringModel``. Candidates are scored by accuracy
    using 7-fold stratified, shuffled cross-validation on all available cores.

    Args:
        random_state: Seed used both for shuffling the CV folds and for
            sampling candidates, so repeated runs give the same result.
            Pass ``None`` to get the previous unseeded behaviour.

    Returns:
        A configured ``RandomizedSearchCV`` instance; call ``fit`` on it to
        run the search.
    """
    gaussian_naive_bayes = GaussianNaiveBayesRefactoringModel()
    model = gaussian_naive_bayes.model()
    param_dist = gaussian_naive_bayes.params_to_tune()

    # Every entry of the search space is a finite list, so the number of
    # distinct candidates is the product of the list lengths.
    grid_size = 1
    for candidates in param_dist.values():
        grid_size *= len(candidates)

    search = RandomizedSearchCV(
                model,
                param_dist,
                # Requesting more iterations than there are candidates only
                # triggers a warning and falls back to the full grid, so cap it.
                n_iter=min(100, grid_size),
                cv=StratifiedKFold(
                    n_splits=7,
                    shuffle=True,
                    random_state=random_state),
                scoring="accuracy",
                n_jobs=-1,
                random_state=random_state,
                verbose=1)
    return search

In [136]:
# Create the (still unfitted) randomized search object; fitting happens in a
# later cell.
random_cv = get_naive_bayes_random_cv()



In [137]:
# Run the hyper-parameter search on the training split and report how long it took.
t0 = time.time()
random_cv.fit(X_train, y_train)
print("The training time:", time.time() - t0)
best_estimator = random_cv.best_estimator_

print(best_estimator)

# Predict once on the held-out split; the same predictions are reused for the
# recall and precision scores below instead of calling predict() again.
t1 = time.time()
y_pred = best_estimator.predict(X_test)
print("The validation time:", time.time() - t1)

print("The accuracy of the model", best_estimator.score(X_test, y_test))
print("Recall score is", recall_score(y_test, y_pred))
print("Precision score is", precision_score(y_test, y_pred))



Fitting 7 folds for each of 6 candidates, totalling 42 fits
The training time: 0.2460489273071289
GaussianNB(var_smoothing=1e-10)
The validation time: 0.0033028125762939453
The accuracy of the model 0.602880658436214
Recall score is 0.9240506329113924
Precision score is 0.5558375634517766
