In [50]:
from ipynb.fs.full.Pipeline import label_feature_data_and_merge
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd

In [51]:
CACHE_DIR = "/Users/sanjaydutt/Documents/CSC503_project/data/"
METHOD_LEVEL_REFACTORINGS = [
    "Extract And Move Method",
    "Extract Method",
    "Inline Method",
    "Move Method",
    "Pull Up Method",
    "Push Down Method",
    "Rename Method",
    "Extract And Move Method",
    "Change Return Type",
    "Move And Inline Method",
    "Move And Rename Method",
    "Change Parameter Type",
    "Split Parameter",
    "Merge Parameter"]



In [52]:
def load_cache_data_and_preprocess_it(refactored_file_path, non_refactored_file_path):
    
    refactored_data = pd.read_feather(refactored_file_path)
    refactored_data.set_index("index", inplace=True)
    
    non_refactored_data = pd.read_feather(non_refactored_file_path)
    non_refactored_data.set_index("index", inplace=True)
    
    data = label_feature_data_and_merge(refactored_data, non_refactored_data)
    y = data["refactored"]
    x = data.drop("refactored", axis=1)
    
    print(y.value_counts())
    
    return (x, y)

In [53]:
# PICK first refactoring metric
refactored_file_path = ''.join([CACHE_DIR, ''.join(METHOD_LEVEL_REFACTORINGS[0].lower().split()), '.ftr'])
non_refactored_file_path = '/Users/sanjaydutt/Documents/CSC503_project/data/nonrefactoreddata.ftr'

In [54]:
(x, y) = load_cache_data_and_preprocess_it(refactored_file_path, non_refactored_file_path)

0    66803
1    29946
Name: refactored, dtype: int64


In [144]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

In [180]:
(X_train, X_test, y_train, y_test) = train_test_split(x, y, test_size=0.20, random_state=42)

In [182]:
model = get_logistic_regression_model_random_cv().fit(X_train, y_train)
model

Fitting 5 folds for each of 30 candidates, totalling 150 fits




RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=True),
                   estimator=LogisticRegression(random_state=20, solver='saga'),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'C': [21.232131688717615,
                                              79.94012516306755,
                                              40.6990195703418,
                                              44.69252303240279,
                                              35.58661877888244],
                                        'max_iter': [100, 500, 1000, 2000, 5000,
                                                     10000]},
                   scoring='accuracy', verbose=1)

In [184]:
model.best_estimator_

LogisticRegression(C=21.232131688717615, max_iter=5000, random_state=20,
                   solver='saga')

In [203]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.inspection import permutation_importance
from random import uniform
from sklearn.ensemble import RandomForestClassifier
import time

class DecisionTreeRefactoringModel:

    def params_to_tune(self):
        return {"max_depth": [3, 6, 9, 12, 16],
                "max_features": ["sqrt", "log2", None],
                "min_samples_leaf": [1,10, 100],
                "min_samples_split": [2, 4, 10],
                "splitter": ["best", "random"],
                "criterion": ["gini", "entropy"]}

    def model(self, best_params=None):
        if best_params is not None:
            return DecisionTreeClassifier(
                random_state=SEED,
                max_depth=best_params["max_depth"],
                max_features=best_params["max_features"],
                min_samples_split=best_params["min_samples_split"],
                min_samples_leaf=best_params["min_samples_leaf"],
                splitter=best_params["splitter"],
                criterion=best_params["criterion"])

        return DecisionTreeClassifier(random_state=20)
    
class LogisticRegressionRefactoringModel():
    def feature_reduction(self) -> bool:
        return True

    def params_to_tune(self):
        return {
            "max_iter": [100, 500, 1000],
            "C": [uniform(0.01, 100) for i in range(0, 5)]}

    def model(self, best_params=None):
        if best_params is not None:
            return LogisticRegression(
                solver='saga',
                max_iter=best_params["max_iter"],
                C=best_params["C"],
                n_jobs=CORE_COUNT,
                random_state=SEED)
        return LogisticRegression(solver='saga', random_state=20)

class RandomForestRefactoringModel():
    def feature_reduction(self) -> bool:
        return False

    def params_to_tune(self):
        return {
            "max_depth": [3, 6, 12, 24, None],
            "max_features": ["auto", "log2", None],
            "min_samples_split": [2, 3, 4, 5, 10],
            "bootstrap": [True, False],
            "criterion": ["gini", "entropy"],
            "n_estimators": [10, 50, 100, 150, 200]
        }

    def model(self, best_params=None):
        if best_params is not None:
            return RandomForestClassifier(
                random_state=SEED,
                n_jobs=CORE_COUNT,
                max_depth=best_params["max_depth"],
                max_features=best_params["max_features"],
                min_samples_split=best_params["min_samples_split"],
                bootstrap=best_params["bootstrap"],
                criterion=best_params["criterion"],
                n_estimators=best_params["n_estimators"],
            )

        return RandomForestClassifier(random_state=20)

In [176]:
def get_decision_tree_model_random_cv():
    decisionTree = DecisionTreeRefactoringModel()
    model = decisionTree.model()
    param_dist = decisionTree.params_to_tune()
    search = RandomizedSearchCV(
                model,
                param_dist,
                n_iter=100,
                cv=StratifiedKFold(
                    n_splits=5,
                    shuffle=True),
                scoring="accuracy",
                n_jobs=-1,
                verbose=1)
    return search

def get_random_forest_tree_model_random_cv():
    randomForest = RandomForestRefactoringModel()
    model = randomForest.model()
    param_dist = randomForest.params_to_tune()
    search = RandomizedSearchCV(
                model,
                param_dist,
                n_iter=100,
                cv=StratifiedKFold(
                    n_splits=5,
                    shuffle=True),
                scoring="accuracy",
                n_jobs=-1,
                verbose=1)
    return search
def get_logistic_regression_model_random_cv():
    logisticReg = LogisticRegressionRefactoringModel()
    model = logisticReg.model()
    param_dist = logisticReg.params_to_tune()
    search = RandomizedSearchCV(
                model,
                param_dist,
                n_iter=100,
                cv=StratifiedKFold(
                    n_splits=5,
                    shuffle=True),
                scoring="accuracy",
                n_jobs=-1,
                verbose=1)
    return search

In [231]:
def train_model_method_refactoring_1():
    non_refactored_file_path = 'data/nonrefactoreddata.ftr'
    for method_refactoring in METHOD_LEVEL_REFACTORINGS:
        print("=================================")
        print("Best Model is finding for", method_refactoring)
        
        # PICK first refactoring metric
        refactored_file_path = ''.join(['data/', ''.join(method_refactoring.lower().split()), '.ftr'])
        print(refactored_file_path)
        (x, y) = load_cache_data_and_preprocess_it(refactored_file_path, non_refactored_file_path)
        
        print("The shape of sample is", x.shape)
        print("Number of sample in different classes before sampling")
        print(y.value_counts())
        
        undersample = RandomUnderSampler(sampling_strategy='majority')
        X_over, y_over = undersample.fit_resample(x, y)
        
        print("Number of sample in different classes after sampling")
        print(y_over.value_counts())
        
        (X_train, X_test, y_train, y_test) = train_test_split(X_over, y_over, test_size=0.20, random_state=42)
        
        randomizedCVModel = get_decision_tree_model_random_cv()
        
        t0 = time.time()
        randomizedCVModel.fit(X_train, y_train)
        print("The training time:", time.time() - t0)
        best_estimator = randomizedCVModel.best_estimator_
        
        t1 = time.time()
        y_pred = best_estimator.predict(X_test)
        print("The validation time:", time.time() - t1)
        
        print("The accuracy of the model", best_estimator.score(X_test, y_test))
        print("Recall score is", recall_score(y_test, best_estimator.predict(X_test)))
        print("Precision score is", precision_score(y_test, best_estimator.predict(X_test)))
              
        print(search.best_estimator_)
        print(search.best_params_)
        print(search.best_score_)
        
        print("Feature Importance")
        # result = permutation_importance(best_estimator, X_train, y_train, n_repeats=10,random_state=0)
        # print(result)
        print("=================================")
        
        r = permutation_importance(best_estimator, X_train, y_train, n_repeats=30,random_state=0)
        count = 0
        for i in r.importances_mean.argsort()[::-1]:
            if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
                count = count + 1
                print(f"{X_train.columns[i]:<8}"
                    f"{r.importances_mean[i]:.3f}"
                        f" +/- {r.importances_std[i]:.3f}")
                if count >= 10:
                    break
        
            
    

In [232]:
train_model_method_refactoring_1()


Best Model is finding for Extract And Move Method
data/extractandmovemethod.ftr
0    66803
1     8637
Name: refactored, dtype: int64
The shape of sample is (75440, 69)
Number of sample in different classes before sampling
0    66803
1     8637
Name: refactored, dtype: int64
Number of sample in different classes after sampling
0    8637
1    8637
Name: refactored, dtype: int64
Fitting 5 folds for each of 100 candidates, totalling 500 fits
The training time: 7.674571990966797
The validation time: 0.003090381622314453
The accuracy of the model 0.8819102749638206
Recall score is 0.89171974522293
Precision score is 0.8745031232254401
DecisionTreeClassifier(criterion='entropy', max_depth=16, max_features='log2',
                       min_samples_split=4, random_state=20)
{'splitter': 'best', 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 16, 'criterion': 'entropy'}
0.8991330656705351
Feature Importance
methodRfc0.150 +/- 0.002
startLine0.144 +/- 0.002
me

The training time: 23.401641368865967
The validation time: 0.00983119010925293
The accuracy of the model 0.9119956674790144
Recall score is 0.9089049502250103
Precision score is 0.9133890639989036
DecisionTreeClassifier(criterion='entropy', max_depth=16, max_features='log2',
                       min_samples_split=4, random_state=20)
{'splitter': 'best', 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 16, 'criterion': 'entropy'}
0.8991330656705351
Feature Importance
classWmc0.192 +/- 0.002
qtyOfCommits0.139 +/- 0.001
startLine0.116 +/- 0.001
classNumberOfMethods0.098 +/- 0.001
classLcom0.089 +/- 0.001
classUniqueWordsQty0.084 +/- 0.000
classCbo0.055 +/- 0.001
classLCC0.055 +/- 0.001
refactoringsInvolved0.053 +/- 0.001
classTCC0.049 +/- 0.000
Best Model is finding for Extract And Move Method
data/extractandmovemethod.ftr
0    66803
1     8637
Name: refactored, dtype: int64
The shape of sample is (75440, 69)
Number of sample in different classes befor

methodParametersQty0.202 +/- 0.013
startLine0.146 +/- 0.015
classNumberOfPublicMethods0.093 +/- 0.008
classLoc0.082 +/- 0.009
methodRfc0.073 +/- 0.007
classReturnQty0.067 +/- 0.008
authorOwnership0.051 +/- 0.005
classNumberOfDefaultFields0.048 +/- 0.006
methodLoc0.020 +/- 0.004
classMathOperationsQty0.019 +/- 0.005
Best Model is finding for Merge Parameter
data/mergeparameter.ftr
0    66803
1     1213
Name: refactored, dtype: int64
The shape of sample is (68016, 69)
Number of sample in different classes before sampling
0    66803
1     1213
Name: refactored, dtype: int64
Number of sample in different classes after sampling
0    1213
1    1213
Name: refactored, dtype: int64
Fitting 5 folds for each of 100 candidates, totalling 500 fits
The training time: 0.9277276992797852
The validation time: 0.002279043197631836
The accuracy of the model 0.911522633744856
Recall score is 0.9324894514767933
Precision score is 0.8911290322580645
DecisionTreeClassifier(criterion='entropy', max_depth=16, 

In [221]:
x.columns[12]

'classNumberOfAbstractMethods'

In [170]:
def train_model_method_refactoring():
    non_refactored_file_path = 'data/nonrefactoreddata.ftr'
    for method_refactoring in METHOD_LEVEL_REFACTORINGS:
        print("=================================")
        print("Best Model is finding for", method_refactoring)
        
        # PICK first refactoring metric
        refactored_file_path = ''.join(['data/', ''.join(method_refactoring.lower().split()), '.ftr'])
        print(refactored_file_path)
        (x, y) = load_cache_data_and_preprocess_it(refactored_file_path, non_refactored_file_path)
        
        print("The shape of sample is", x.shape)
        print("Number of sample in different classes before sampling")
        print(y.value_counts())
        
        undersample = RandomUnderSampler(sampling_strategy='majority')
        X_over, y_over = undersample.fit_resample(x, y)
        
        print("Number of sample in different classes after sampling")
        print(y_over.value_counts())
        
        (X_train, X_test, y_train, y_test) = train_test_split(X_over, y_over, test_size=0.20, random_state=42)
        
        randomizedCVModel = get_random_forest_tree_model_random_cv()
        
        randomizedCVModel.fit(X_train, y_train)
        
        best_estimator = randomizedCVModel.best_estimator_
        
        print("The accuracy of the model", best_estimator.score(X_test, y_test))
        print("Recall score is", recall_score(y_test, best_estimator.predict(X_test)))
        print("Precision score is", precision_score(y_test, best_estimator.predict(X_test)))
              
        print(search.best_estimator_)
        print(search.best_params_)
        print(search.best_score_)
        
        print("Feature Importance")
        result = permutation_importance(best_estimator, X_train, y_train, n_repeats=10,random_state=0)
        print(result)
        print("=================================")
        break 
        
train_model_method_refactoring()

Best Model is finding for Extract And Move Method
data/extractandmovemethod.ftr
0    66803
1     8637
Name: refactored, dtype: int64
The shape of sample is (75440, 69)
Number of sample in different classes before sampling
0    66803
1     8637
Name: refactored, dtype: int64
Number of sample in different classes after sampling
0    8637
1    8637
Name: refactored, dtype: int64
Fitting 5 folds for each of 100 candidates, totalling 500 fits
The accuracy of the model 0.9328509406657018
Recall score is 0.937463810075275
Precision score is 0.9288582903040734
DecisionTreeClassifier(criterion='entropy', max_depth=16, max_features='log2',
                       min_samples_split=4, random_state=20)
{'splitter': 'best', 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 16, 'criterion': 'entropy'}
0.8991330656705351
Feature Importance
{'importances_mean': array([0.00000000e+00, 0.00000000e+00, 3.69057095e-04, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00,

In [210]:
def train_model_method_refactoring_3():
    non_refactored_file_path = 'data/nonrefactoreddata.ftr'
    for method_refactoring in METHOD_LEVEL_REFACTORINGS:
        print("=================================")
        print("Best Model is finding for", method_refactoring)
        
        # PICK first refactoring metric
        refactored_file_path = ''.join(['data/', ''.join(method_refactoring.lower().split()), '.ftr'])
        print(refactored_file_path)
        (x, y) = load_cache_data_and_preprocess_it(refactored_file_path, non_refactored_file_path)
        
        print("The shape of sample is", x.shape)
        print("Number of sample in different classes before sampling")
        print(y.value_counts())
        
        undersample = RandomUnderSampler(sampling_strategy='majority')
        X_over, y_over = undersample.fit_resample(x, y)
        
        print("Number of sample in different classes after sampling")
        print(y_over.value_counts())
        
        (X_train, X_test, y_train, y_test) = train_test_split(X_over, y_over, test_size=0.20, random_state=42)
        
        logModel = get_logistic_regression_model_random_cv()
        
        t0 = time.time()
        logModel.fit(X_train, y_train)
        print("The training time:", time.time() - t0)
        
        best_estimator = logModel.best_estimator_
        
        print("The best estimator is", best_estimator)
        
        t1 = time.time()
        best_estimator.score(X_test, y_test)
        print("The validation time:", time.time() - t1)
        
        
        print("The accuracy of the model", best_estimator.score(X_test, y_test))
        print("Recall score is", recall_score(y_test, best_estimator.predict(X_test)))
        print("Precision score is", precision_score(y_test, best_estimator.predict(X_test)))
              
        
        # print("Feature Importance")
        # result = permutation_importance(best_estimator, X_train, y_train, n_repeats=10,random_state=0)
        # print(result)
        print("=================================")
         
        


In [211]:
train_model_method_refactoring_3()

Best Model is finding for Extract And Move Method
data/extractandmovemethod.ftr
0    66803
1     8637
Name: refactored, dtype: int64
The shape of sample is (75440, 69)
Number of sample in different classes before sampling
0    66803
1     8637
Name: refactored, dtype: int64
Number of sample in different classes after sampling
0    8637
1    8637
Name: refactored, dtype: int64
Fitting 5 folds for each of 15 candidates, totalling 75 fits




The training time: 52.104833126068115
The best estimator is LogisticRegression(C=43.642729943108215, max_iter=1000, random_state=20,
                   solver='saga')
The validation time: 0.0028569698333740234
The accuracy of the model 0.7539797395079595
Recall score is 0.7782281412854661
Precision score is 0.7421314191054665
Best Model is finding for Extract Method
data/extractmethod.ftr
0    66803
1    29946
Name: refactored, dtype: int64
The shape of sample is (96749, 69)
Number of sample in different classes before sampling
0    66803
1    29946
Name: refactored, dtype: int64
Number of sample in different classes after sampling
0    29946
1    29946
Name: refactored, dtype: int64
Fitting 5 folds for each of 15 candidates, totalling 75 fits




The training time: 222.261803150177
The best estimator is LogisticRegression(C=92.3142004705179, max_iter=1000, random_state=20,
                   solver='saga')
The validation time: 0.00617527961730957
The accuracy of the model 0.784957008097504
Recall score is 0.8417836498761354
Precision score is 0.7590469099032018
Best Model is finding for Inline Method
data/inlinemethod.ftr
0    66803
1     7098
Name: refactored, dtype: int64
The shape of sample is (73901, 69)
Number of sample in different classes before sampling
0    66803
1     7098
Name: refactored, dtype: int64
Number of sample in different classes after sampling
0    7098
1    7098
Name: refactored, dtype: int64
Fitting 5 folds for each of 15 candidates, totalling 75 fits




The training time: 43.351651668548584
The best estimator is LogisticRegression(C=11.747787191855886, max_iter=1000, random_state=20,
                   solver='saga')
The validation time: 0.0031321048736572266
The accuracy of the model 0.6704225352112676
Recall score is 0.6636107193229901
Precision score is 0.6721428571428572
Best Model is finding for Move Method
data/movemethod.ftr
0    66803
1    23835
Name: refactored, dtype: int64
The shape of sample is (90638, 69)
Number of sample in different classes before sampling
0    66803
1    23835
Name: refactored, dtype: int64
Number of sample in different classes after sampling
0    23835
1    23835
Name: refactored, dtype: int64
Fitting 5 folds for each of 15 candidates, totalling 75 fits




The training time: 196.11906504631042
The best estimator is LogisticRegression(C=96.11762373027274, max_iter=1000, random_state=20,
                   solver='saga')
The validation time: 0.005357980728149414
The accuracy of the model 0.6985525487728131
Recall score is 0.6463035825222614
Precision score is 0.7280149288546769
Best Model is finding for Pull Up Method
data/pullupmethod.ftr
0    66803
1    14062
Name: refactored, dtype: int64
The shape of sample is (80865, 69)
Number of sample in different classes before sampling
0    66803
1    14062
Name: refactored, dtype: int64
Number of sample in different classes after sampling
0    14062
1    14062
Name: refactored, dtype: int64
Fitting 5 folds for each of 15 candidates, totalling 75 fits




The training time: 109.31514096260071
The best estimator is LogisticRegression(C=80.42722269942684, max_iter=1000, random_state=20,
                   solver='saga')
The validation time: 0.004240989685058594
The accuracy of the model 0.7249777777777778
Recall score is 0.67889575009081
Precision score is 0.7381516587677726
Best Model is finding for Push Down Method
data/pushdownmethod.ftr
0    66803
1     8039
Name: refactored, dtype: int64
The shape of sample is (74842, 69)
Number of sample in different classes before sampling
0    66803
1     8039
Name: refactored, dtype: int64
Number of sample in different classes after sampling
0    8039
1    8039
Name: refactored, dtype: int64
Fitting 5 folds for each of 15 candidates, totalling 75 fits




The training time: 60.65658903121948
The best estimator is LogisticRegression(C=71.67244715342615, max_iter=1000, random_state=20,
                   solver='saga')
The validation time: 0.0032148361206054688
The accuracy of the model 0.7369402985074627
Recall score is 0.6858024691358025
Precision score is 0.7672651933701657
Best Model is finding for Rename Method
data/renamemethod.ftr
0    66803
1    36929
Name: refactored, dtype: int64
The shape of sample is (103732, 69)
Number of sample in different classes before sampling
0    66803
1    36929
Name: refactored, dtype: int64
Number of sample in different classes after sampling
0    36929
1    36929
Name: refactored, dtype: int64
Fitting 5 folds for each of 15 candidates, totalling 75 fits




The training time: 405.3032410144806
The best estimator is LogisticRegression(C=84.19273471358763, max_iter=1000, random_state=20,
                   solver='saga')
The validation time: 0.008512020111083984
The accuracy of the model 0.6809504467912266
Recall score is 0.583799263602891
Precision score is 0.7204644900706832
Best Model is finding for Extract And Move Method
data/extractandmovemethod.ftr
0    66803
1     8637
Name: refactored, dtype: int64
The shape of sample is (75440, 69)
Number of sample in different classes before sampling
0    66803
1     8637
Name: refactored, dtype: int64
Number of sample in different classes after sampling
0    8637
1    8637
Name: refactored, dtype: int64
Fitting 5 folds for each of 15 candidates, totalling 75 fits




The training time: 68.61100172996521
The best estimator is LogisticRegression(C=95.48512107683783, max_iter=1000, random_state=20,
                   solver='saga')
The validation time: 0.004611015319824219
The accuracy of the model 0.7452966714905933
Recall score is 0.7735958309206716
Precision score is 0.7320547945205479
Best Model is finding for Change Return Type
data/changereturntype.ftr
0    66803
1    58308
Name: refactored, dtype: int64
The shape of sample is (125111, 69)
Number of sample in different classes before sampling
0    66803
1    58308
Name: refactored, dtype: int64
Number of sample in different classes after sampling
0    58308
1    58308
Name: refactored, dtype: int64
Fitting 5 folds for each of 15 candidates, totalling 75 fits




The training time: 590.4755721092224
The best estimator is LogisticRegression(C=76.98847033211139, max_iter=1000, random_state=20,
                   solver='saga')
The validation time: 0.017905235290527344
The accuracy of the model 0.7457125707425828
Recall score is 0.7343314763231198
Precision score is 0.745558992487848
Best Model is finding for Move And Inline Method
data/moveandinlinemethod.ftr
0    66803
1     3751
Name: refactored, dtype: int64
The shape of sample is (70554, 69)
Number of sample in different classes before sampling
0    66803
1     3751
Name: refactored, dtype: int64
Number of sample in different classes after sampling
0    3751
1    3751
Name: refactored, dtype: int64
Fitting 5 folds for each of 15 candidates, totalling 75 fits




The training time: 22.86668372154236
The best estimator is LogisticRegression(C=96.39622825592856, max_iter=1000, random_state=20,
                   solver='saga')
The validation time: 0.004361867904663086
The accuracy of the model 0.6888740839440373
Recall score is 0.677762982689747
Precision score is 0.6934604904632152
Best Model is finding for Move And Rename Method
data/moveandrenamemethod.ftr
0    66803
1     4826
Name: refactored, dtype: int64
The shape of sample is (71629, 69)
Number of sample in different classes before sampling
0    66803
1     4826
Name: refactored, dtype: int64
Number of sample in different classes after sampling
0    4826
1    4826
Name: refactored, dtype: int64
Fitting 5 folds for each of 15 candidates, totalling 75 fits




The training time: 42.686026096343994
The best estimator is LogisticRegression(C=22.776828394194695, max_iter=1000, random_state=20,
                   solver='saga')
The validation time: 0.004266977310180664
The accuracy of the model 0.6462972553081305
Recall score is 0.48601036269430054
Precision score is 0.7149390243902439
Best Model is finding for Change Parameter Type
data/changeparametertype.ftr
1    84221
0    66803
Name: refactored, dtype: int64
The shape of sample is (151024, 69)
Number of sample in different classes before sampling
1    84221
0    66803
Name: refactored, dtype: int64
Number of sample in different classes after sampling
0    66803
1    66803
Name: refactored, dtype: int64
Fitting 5 folds for each of 15 candidates, totalling 75 fits




The training time: 704.1615979671478
The best estimator is LogisticRegression(C=80.59347212924007, max_iter=1000, random_state=20,
                   solver='saga')
The validation time: 0.014760971069335938
The accuracy of the model 0.7816031734151635
Recall score is 0.7886087348041423
Precision score is 0.776833234772324
Best Model is finding for Split Parameter
data/splitparameter.ftr
0    66803
1      390
Name: refactored, dtype: int64
The shape of sample is (67193, 69)
Number of sample in different classes before sampling
0    66803
1      390
Name: refactored, dtype: int64
Number of sample in different classes after sampling
0    390
1    390
Name: refactored, dtype: int64
Fitting 5 folds for each of 15 candidates, totalling 75 fits




The training time: 2.0720057487487793
The best estimator is LogisticRegression(C=95.76894157179753, max_iter=1000, random_state=20,
                   solver='saga')
The validation time: 0.003196239471435547
The accuracy of the model 0.6858974358974359
Recall score is 0.5641025641025641
Precision score is 0.7457627118644068
Best Model is finding for Merge Parameter
data/mergeparameter.ftr
0    66803
1     1213
Name: refactored, dtype: int64
The shape of sample is (68016, 69)
Number of sample in different classes before sampling
0    66803
1     1213
Name: refactored, dtype: int64
Number of sample in different classes after sampling
0    1213
1    1213
Name: refactored, dtype: int64
Fitting 5 folds for each of 15 candidates, totalling 75 fits




The training time: 6.373842000961304
The best estimator is LogisticRegression(C=55.6270148855751, max_iter=1000, random_state=20,
                   solver='saga')
The validation time: 0.0032351016998291016
The accuracy of the model 0.6152263374485597
Recall score is 0.5021097046413502
Precision score is 0.6329787234042553




In [178]:
print(get_logistic_regression_model_random_cv())

RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=True),
                   estimator=LogisticRegression(random_state=20, solver='saga'),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'C': [29.544703196961578,
                                              1.81665029382471,
                                              93.98997453592514,
                                              13.192479504371025,
                                              10.562249798760876],
                                        'max_iter': [100, 500, 1000, 2000, 5000,
                                                     10000]},
                   scoring='accuracy', verbose=1)


In [109]:
print(search.best_estimator_)
print(search.best_params_)
print(search.best_score_)

DecisionTreeClassifier(criterion='entropy', max_depth=16, max_features='log2',
                       min_samples_split=4, random_state=20)
{'splitter': 'best', 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 16, 'criterion': 'entropy'}
0.8991330656705351


In [111]:
print(search.best_estimator_.score(X_test, y_test))

0.9026356589147286


In [119]:
print(search.best_estimator_.feature_importances_)
x.shape

[2.10230930e-03 1.25422719e-02 1.87025782e-02 8.26570899e-03
 1.69277184e-03 1.08887474e-02 3.91618629e-02 8.96882293e-03
 9.14782029e-03 6.66712156e-03 4.33803501e-03 6.54343324e-03
 5.77972356e-04 5.59271337e-03 5.76159130e-03 7.96584203e-03
 9.37898424e-03 6.65229101e-03 8.94126397e-03 7.98527144e-03
 6.56953824e-03 3.86973463e-03 5.98182543e-03 2.35362368e-03
 1.49884631e-02 4.61586016e-03 4.69508665e-03 0.00000000e+00
 2.24338439e-03 8.40676841e-03 7.20806304e-03 1.65596338e-02
 1.49561414e-02 1.29034555e-02 2.87597234e-03 5.53578894e-03
 2.15611458e-02 1.40223034e-02 2.49129540e-02 1.14726178e-02
 3.42480570e-03 1.86560218e-03 4.88305539e-03 1.05240866e-01
 1.60346493e-03 6.97721227e-05 1.48882329e-01 1.13845939e-03
 1.21197920e-03 3.10391197e-03 1.09766747e-03 3.57871976e-03
 7.47084164e-04 2.24702025e-03 2.07470479e-02 2.48449745e-03
 0.00000000e+00 4.48335872e-04 8.78219552e-03 1.48258771e-02
 4.49207243e-02 6.18844973e-02 1.16854241e-02 1.93919313e-02
 1.67866096e-02 3.342361

(96749, 69)

In [137]:
from imblearn.under_sampling import RandomUnderSampler

In [139]:
undersample = RandomUnderSampler(sampling_strategy='majority')
X_over, y_over = undersample.fit_resample(x, y)

In [142]:
X_over.shape
y_over.value_counts()

0    29946
1    29946
Name: refactored, dtype: int64