In [81]:
import os
import sqlite3
from time import localtime, strftime

import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.metrics import make_scorer, fbeta_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM

# Utils

In [82]:
# Location of the SQLite metrics database read (and cleaned) by this notebook.
# Change this to point at your own copy, e.g. the metrics.db that a previous
# run took from an IntelliJ IDEA sandbox's bin/tmp directory.
DATA_PATH = "data/metrics.db"

In [83]:
def scale_data(data, refactoring):
    """Standardize ``data`` and persist the fitted scaler.

    A ``StandardScaler`` is fitted on ``data`` and written with joblib to
    ``'scaler' + refactoring + '.pkl'`` in the current working directory.

    Parameters
    ----------
    data : pandas.DataFrame
        Features to standardize; its columns and index are preserved.
    refactoring : str
        Suffix used in the scaler's file name (e.g. ``"EM"``).

    Returns
    -------
    pandas.DataFrame
        The transformed values with the same columns and index as ``data``.
    """
    fitted_scaler = StandardScaler().fit(data)
    # Saved before transforming so the file holds exactly the fitted state
    # that is used for the transformation below.
    dump(fitted_scaler, 'scaler' + refactoring + '.pkl')

    return pd.DataFrame(
        fitted_scaler.transform(data),
        columns=data.columns,
        index=data.index,
    )

In [84]:
def covariance(df, threshold=0.9):
    """Print the strongly correlated column pairs of ``df``.

    Despite its name, this function works on the Pearson correlation matrix
    (``df.corr()``), not on the covariance matrix. Each unordered pair of
    distinct columns is reported once, keyed as (earlier column, later column)
    in ``df.columns`` order.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose column correlations are inspected.
    threshold : float, optional
        Pairs with correlation above ``threshold`` are printed first, then
        pairs with correlation below ``-threshold``. Defaults to 0.9.

    Returns
    -------
    None
        The two filtered Series are printed, not returned.
    """
    # Flatten the matrix into a Series indexed by (column, column) pairs,
    # ordered from the most positive to the most negative correlation.
    pairs = df.corr().unstack().sort_values(ascending=False)

    # Self-pairs and mirrored duplicates carry no extra information. Build the
    # set of redundant keys once and filter with a single mask instead of
    # dropping entries one at a time (each drop rebuilt the whole Series).
    columns = list(df.columns)
    redundant = {
        (columns[i], columns[j])
        for i in range(len(columns))
        for j in range(i + 1)
    }
    keep = np.array([pair not in redundant for pair in pairs.index], dtype=bool)
    pairs = pairs[keep]

    print(pairs[pairs > threshold])
    print(pairs[pairs < -threshold])

In [85]:
def grid_search(data):
    """Grid-search three outlier detectors on ``data`` and summarise the winners.

    An 80 % split of ``data.values`` (``random_state=42``) is used to tune
    ``OneClassSVM``, ``IsolationForest`` and ``EllipticEnvelope`` with a
    5-fold cross-validation repeated 5 times. The remaining 20 % is not used
    by this function.

    Every training sample is labelled ``1`` (inlier) and candidates are ranked
    by the F2 score of that label.

    NOTE(review): with all-inlier labels the score only measures how many
    samples a candidate accepts as inliers, so it tends to favour the smallest
    ``nu`` / ``contamination`` in the grid. Confirm this is the intended
    selection criterion.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature matrix; only ``data.values`` is read.

    Returns
    -------
    pandas.DataFrame
        One row per model, in the order OneClassSVM, IsolationForest,
        EllipticEnvelope, with columns ``model``, ``best_score`` and
        ``best_params``.
    """
    X = data.values
    # Only the training part of the split takes part in the search.
    X_train, _ = train_test_split(X, test_size=0.2, random_state=42)

    # The stochastic estimators are seeded so that repeated runs of the search
    # are comparable, in line with the seeded split and CV below.
    model_params = {
        'OneClassSVM': {
            'model': OneClassSVM(),
            'params': {
                'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
                'nu': [0.1, 0.2, 0.3],
                'gamma': ['scale', 'auto']
            }
        },
        'IsolationForest': {
            'model': IsolationForest(random_state=42),
            'params': {
                'contamination': [0.1, 0.2],
                'n_estimators': [50, 100, 200, 300, 400, 500],
                'max_samples': [100, 200, 300, 400, 500],
                'max_features': [1, 2, 3, 4, 5]
            }
        },
        'EllipticEnvelope': {
            'model': EllipticEnvelope(random_state=42),
            'params': {
                'contamination': [0.01, 0.02, 0.03, 0.04, 0.05]
            }
        }
    }

    # All labels are identical, so stratification has nothing to balance here;
    # the splitter is kept as in the original experiment.
    kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=0)
    y_train = np.ones(len(X_train), dtype=int)  # 1 for inliers

    f2_score = make_scorer(fbeta_score, beta=2, pos_label=1)

    start_time = strftime("%H:%M:%S", localtime())

    scores = []
    for model_name, mp in model_params.items():
        # Named `search` so the local does not shadow this function's own name.
        search = GridSearchCV(mp['model'],
                              param_grid=mp['params'],
                              return_train_score=False,
                              cv=kf,
                              n_jobs=-1,
                              verbose=True,
                              scoring=f2_score)
        search.fit(X_train, y_train)
        scores.append({
            'model': model_name,
            'best_score': search.best_score_,
            'best_params': search.best_params_
        })

    print("Started at: " + start_time)
    print("Finished at: " + strftime("%H:%M:%S", localtime()))

    return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])

In [86]:
def save_models(df, data, refactoring):
    """Refit each detector with its best parameters and save it under ``models/``.

    Parameters
    ----------
    df : pandas.DataFrame
        Grid-search summary with a ``model`` column containing
        ``'OneClassSVM'``, ``'IsolationForest'`` and ``'EllipticEnvelope'``
        and a ``best_params`` column holding the matching parameter dicts
        (the frame returned by ``grid_search``).
    data : pandas.DataFrame
        Training data; the models are fitted on ``data.values``.
    refactoring : str
        Suffix used in the output file names (e.g. ``'EM'``).

    Raises
    ------
    KeyError
        If ``df`` lacks the ``model`` / ``best_params`` columns or one of the
        three model names.
    """
    # Look parameters up by model name rather than by row position, so the
    # result does not depend on the order of the rows in `df`.
    best_params = dict(zip(df['model'], df['best_params']))

    # Passing the whole dict forwards every tuned parameter. The stochastic
    # estimators are seeded so the saved artefacts are reproducible.
    one_class_svm = OneClassSVM(**best_params['OneClassSVM'])
    isolation_forest = IsolationForest(random_state=42, **best_params['IsolationForest'])
    elliptic_envelope = EllipticEnvelope(random_state=42, **best_params['EllipticEnvelope'])

    X = data.values
    one_class_svm.fit(X)
    isolation_forest.fit(X)
    elliptic_envelope.fit(X)

    # joblib does not create missing directories; make sure the target exists.
    os.makedirs('models', exist_ok=True)

    dump(one_class_svm, 'models/one_class_svm_' + refactoring + '.joblib')
    dump(isolation_forest, 'models/isolation_forest_' + refactoring + '.joblib')
    dump(elliptic_envelope, 'models/elliptic_envelope_' + refactoring + '.joblib')

# Extract Method

In [87]:
def load_em_data(data_path):
    """Load the "before" method metrics from the ``methodMetrics`` table.

    Parameters
    ----------
    data_path : str
        Path of the SQLite database file.

    Returns
    -------
    pandas.DataFrame
        One row per record of ``methodMetrics`` with the 18 selected
        ``*Bef`` metric columns.
    """
    query = '''
            SELECT numberLinesOfCodeBef, numberCommentsBef, numberBlankLinesBef, totalLinesBef, numParametersBef,
                numStatementsBef, halsteadLengthBef, halsteadVocabularyBef, halsteadVolumeBef, halsteadDifficultyBef,
                halsteadEffortBef, halsteadLevelBef, halsteadTimeBef, halsteadBugsDeliveredBef, halsteadMaintainabilityBef,
                cyclomaticComplexityBef, cognitiveComplexityBef, lackOfCohesionInMethodBef
            FROM methodMetrics;
    '''

    conn = sqlite3.connect(data_path)
    try:
        return pd.read_sql_query(query, conn)
    finally:
        # Close the connection even when the query fails (e.g. missing table).
        conn.close()

In [88]:
def delete_em_infinite_values(data_path):
    """Delete ``methodMetrics`` rows whose ``halsteadLevelBef`` is infinite.

    This permanently modifies the database at ``data_path``: rows whose
    ``halsteadLevelBef`` renders as the text ``'Inf'`` are removed and the
    change is committed.

    Parameters
    ----------
    data_path : str
        Path of the SQLite database file.
    """
    conn = sqlite3.connect(data_path)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute("DELETE FROM methodMetrics WHERE CAST(halsteadLevelBef AS CHARACTER) ='Inf';")
            conn.commit()
        finally:
            cursor.close()
    finally:
        # Release the database handle even if the statement fails.
        conn.close()
    
#delete_em_infinite_values(DATA_PATH)

In [89]:
# Load the method-level metrics used for the Extract Method models.
em_data = load_em_data(DATA_PATH)
em_data.head()

Unnamed: 0,numberLinesOfCodeBef,numberCommentsBef,numberBlankLinesBef,totalLinesBef,numParametersBef,numStatementsBef,halsteadLengthBef,halsteadVocabularyBef,halsteadVolumeBef,halsteadDifficultyBef,halsteadEffortBef,halsteadLevelBef,halsteadTimeBef,halsteadBugsDeliveredBef,halsteadMaintainabilityBef,cyclomaticComplexityBef,cognitiveComplexityBef,lackOfCohesionInMethodBef
0,57,0,5,62,2,55,77.0,321.0,482.542564,14.721973,7103.978639,0.067926,394.66548,0.000333,40.351702,19,122,1.0
1,4,0,2,6,2,2,45.0,91.0,247.133389,8.419355,2080.703697,0.118774,115.59465,0.000333,69.976845,1,1,1.0
2,3,0,2,5,2,1,21.0,36.0,92.238666,1.451613,133.894838,0.688889,7.438602,0.000333,75.699244,1,1,1.0
3,3,0,0,3,1,1,29.0,40.0,140.881449,5.537037,780.0658,0.180602,43.336989,0.000333,74.411289,1,1,0.333333
4,11,0,0,11,2,6,26.0,49.0,122.211433,3.405405,416.179473,0.293651,23.121082,0.000333,62.131101,4,4,1.0


In [97]:
# Number of method records (rows) that were loaded.
len(em_data)

23935

In [91]:
# Standardize the features; this also writes the fitted scaler to scalerEM.pkl.
scaled_em_data = scale_data(em_data, "EM")
scaled_em_data.head()

Unnamed: 0,numberLinesOfCodeBef,numberCommentsBef,numberBlankLinesBef,totalLinesBef,numParametersBef,numStatementsBef,halsteadLengthBef,halsteadVocabularyBef,halsteadVolumeBef,halsteadDifficultyBef,halsteadEffortBef,halsteadLevelBef,halsteadTimeBef,halsteadBugsDeliveredBef,halsteadMaintainabilityBef,cyclomaticComplexityBef,cognitiveComplexityBef,lackOfCohesionInMethodBef
0,0.403648,-0.41882,0.101325,0.298221,0.328182,0.650995,0.249069,0.0008,0.185985,0.116235,-0.077534,-0.160668,-0.077534,0.0,-0.947121,0.886061,0.086448,0.582478
1,-0.636126,-0.41882,-0.326478,-0.616459,0.328182,-0.650054,-0.48503,-0.537087,-0.480368,-0.43238,-0.278426,-0.079343,-0.278426,0.0,1.402077,-0.517675,-0.13331,0.582478
2,-0.655745,-0.41882,-0.326478,-0.632793,0.328182,-0.674602,-1.035605,-0.665712,-0.918816,-1.038891,-0.356283,0.832475,-0.356283,0.0,1.855848,-0.517675,-0.13331,0.582478
3,-0.655745,-0.41882,-0.611681,-0.66546,-0.247152,-0.674602,-0.85208,-0.656357,-0.781127,-0.683273,-0.330442,0.019542,-0.330442,0.0,1.753717,-0.517675,-0.13331,-1.009267
4,-0.498798,-0.41882,-0.611681,-0.534791,0.328182,-0.551862,-0.920902,-0.635309,-0.833974,-0.868822,-0.344994,0.200348,-0.344994,0.0,0.779929,-0.283719,-0.127861,0.582478


In [92]:
# Covariance matrix of the standardized features. The diagonal is ~1, so the
# off-diagonal entries are close to the pairwise correlations.
scaled_em_data.cov()

Unnamed: 0,numberLinesOfCodeBef,numberCommentsBef,numberBlankLinesBef,totalLinesBef,numParametersBef,numStatementsBef,halsteadLengthBef,halsteadVocabularyBef,halsteadVolumeBef,halsteadDifficultyBef,halsteadEffortBef,halsteadLevelBef,halsteadTimeBef,halsteadBugsDeliveredBef,halsteadMaintainabilityBef,cyclomaticComplexityBef,cognitiveComplexityBef,lackOfCohesionInMethodBef
numberLinesOfCodeBef,1.000042,0.65353,0.705351,0.990549,0.109991,0.967108,0.815876,0.898723,0.829638,0.693731,0.714521,0.006454,0.714521,0.0,-0.786879,0.882481,0.679321,0.088524
numberCommentsBef,0.65353,1.000042,0.620489,0.733246,0.094741,0.634936,0.542341,0.568725,0.547823,0.445984,0.426044,-0.003983,0.426044,0.0,-0.55373,0.627433,0.39604,0.096831
numberBlankLinesBef,0.705351,0.620489,1.000042,0.775053,0.041247,0.694164,0.629408,0.653089,0.626495,0.525578,0.450076,0.012479,0.450076,0.0,-0.645658,0.566044,0.392342,0.066479
totalLinesBef,0.990549,0.733246,0.775053,1.000042,0.107485,0.959654,0.815393,0.890196,0.827165,0.690431,0.696738,0.006332,0.696738,0.0,-0.794457,0.873635,0.657276,0.092749
numParametersBef,0.109991,0.094741,0.041247,0.107485,1.000042,0.114711,0.161571,0.114549,0.158327,0.089893,0.097314,-0.010975,0.097314,0.0,-0.139711,0.186868,0.069146,0.073566
numStatementsBef,0.967108,0.634936,0.694164,0.959654,0.114711,1.000042,0.81696,0.909956,0.829568,0.69439,0.708066,0.013857,0.708066,0.0,-0.78928,0.905565,0.708627,0.079651
halsteadLengthBef,0.815876,0.542341,0.629408,0.815393,0.161571,0.81696,1.000042,0.830237,0.996629,0.855779,0.740259,-0.050304,0.740259,0.0,-0.861594,0.691123,0.407869,0.10673
halsteadVocabularyBef,0.898723,0.568725,0.653089,0.890196,0.114549,0.909956,0.830237,1.000042,0.842625,0.729922,0.741789,-0.014375,0.741789,0.0,-0.747051,0.778781,0.620011,0.064655
halsteadVolumeBef,0.829638,0.547823,0.626495,0.827165,0.158327,0.829568,0.996629,0.842625,1.000042,0.853208,0.776986,-0.044112,0.776986,0.0,-0.832094,0.707648,0.438992,0.103455
halsteadDifficultyBef,0.693731,0.445984,0.525578,0.690431,0.089893,0.69439,0.855779,0.729922,0.853208,1.000042,0.812399,-0.174598,0.812399,0.0,-0.740635,0.565223,0.332073,0.079078


In [93]:
# Print the feature pairs whose correlation is above 0.9 or below -0.9.
covariance(scaled_em_data)

halsteadEffortBef     halsteadTimeBef            1.000000
halsteadLengthBef     halsteadVolumeBef          0.996587
numberLinesOfCodeBef  totalLinesBef              0.990508
                      numStatementsBef           0.967068
totalLinesBef         numStatementsBef           0.959614
numStatementsBef      halsteadVocabularyBef      0.909918
                      cyclomaticComplexityBef    0.905527
dtype: float64
Series([], dtype: float64)


In [94]:
# Hand the grid search its own copy of the scaled Extract Method data.
data = scaled_em_data.copy()

In [95]:
# Tune the three detectors on the Extract Method data. Long-running: the run
# recorded below took about 47 minutes.
df = grid_search(data)
df.head()

Fitting 25 folds for each of 24 candidates, totalling 600 fits
Fitting 25 folds for each of 300 candidates, totalling 7500 fits
Fitting 25 folds for each of 5 candidates, totalling 125 fits




Started at: 10:50:23
Finished at: 11:37:28


Unnamed: 0,model,best_score,best_params
0,OneClassSVM,0.91857,"{'gamma': 'auto', 'kernel': 'sigmoid', 'nu': 0.1}"
1,IsolationForest,0.918824,"{'contamination': 0.1, 'max_features': 5, 'max..."
2,EllipticEnvelope,0.991886,{'contamination': 0.01}


In [98]:
# Refit each detector with its best parameters on all scaled Extract Method
# data and save the fitted models under models/ with the 'EM' suffix.
save_models(df, scaled_em_data, 'EM')



# Extract Class

In [63]:
def load_ec_data(data_path):
    """Load the class metrics from the ``classMetrics`` table.

    Parameters
    ----------
    data_path : str
        Path of the SQLite database file.

    Returns
    -------
    pandas.DataFrame
        One row per record of ``classMetrics`` with the 21 selected metric
        columns.
    """
    query = '''
        SELECT numProperties, numPublicAttributes, numPublicMethods, numProtectedFields, numProtectedMethods,
            numLongMethods, numLinesCode, lackOfCohesion, cyclomaticComplexity, cognitiveComplexity, numMethods,
            numConstructors, halsteadLength, halsteadVocabulary, halsteadVolume, halsteadDifficulty, halsteadEffort,
            halsteadLevel, halsteadTime, halsteadBugsDelivered, halsteadMaintainability
        FROM classMetrics;
    '''

    conn = sqlite3.connect(data_path)
    try:
        return pd.read_sql_query(query, conn)
    finally:
        # Close the connection even when the query fails (e.g. missing table).
        conn.close()

In [69]:
def delete_ec_error_values(data_path):
    """Delete unusable rows from the ``classMetrics`` table.

    This permanently modifies the database at ``data_path``. Two clean-up
    statements are run and committed one after the other:

    1. rows whose ``halsteadLevel`` or ``halsteadMaintainability`` renders as
       the text ``'Inf'``;
    2. rows whose ``cyclomaticComplexity`` is NULL.

    Parameters
    ----------
    data_path : str
        Path of the SQLite database file.
    """
    conn = sqlite3.connect(data_path)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute("DELETE FROM classMetrics WHERE CAST(halsteadLevel AS CHARACTER) ='Inf' OR CAST(halsteadMaintainability AS CHARACTER) ='Inf';")
            conn.commit()

            cursor.execute("DELETE FROM classMetrics WHERE cyclomaticComplexity is NULL;")
            conn.commit()
        finally:
            cursor.close()
    finally:
        # Release the database handle even if one of the statements fails.
        conn.close()
    
# Destructive: permanently deletes the invalid classMetrics rows from the
# database at DATA_PATH every time this cell is run.
delete_ec_error_values(DATA_PATH)

In [70]:
# Load the class-level metrics used for the Extract Class models.
ec_data = load_ec_data(DATA_PATH)
ec_data.head()

Unnamed: 0,numProperties,numPublicAttributes,numPublicMethods,numProtectedFields,numProtectedMethods,numLongMethods,numLinesCode,lackOfCohesion,cyclomaticComplexity,cognitiveComplexity,...,numConstructors,halsteadLength,halsteadVocabulary,halsteadVolume,halsteadDifficulty,halsteadEffort,halsteadLevel,halsteadTime,halsteadBugsDelivered,halsteadMaintainability
0,9,0,6,0,1,2,408,1.0,6.090909,47.545455,...,2,52.363636,271.363636,314.722895,8.76669,4325.058421,0.243589,240.281023,0.000333,60.505985
1,6,0,10,0,0,1,516,1.0,3.448276,12.448276,...,4,32.551724,98.689655,169.250819,5.524539,1529.115347,0.311825,84.950853,0.000333,64.707328
2,0,0,26,0,0,11,877,0.0,1.333333,1.030303,...,0,52.424242,265.272727,310.011364,11.344954,4772.094495,0.209429,265.116361,0.000333,57.298861
3,6,0,8,0,0,5,476,1.0,3.333333,6.133333,...,1,70.733333,257.533333,455.5654,11.634445,8068.931954,0.190076,448.273997,0.000333,53.749081
4,12,0,96,0,0,0,541,1.0,1.0,1.0,...,1,28.706422,54.045872,140.820775,4.191078,732.682787,0.33658,40.704599,0.000333,72.752686


In [71]:
# Number of class records (rows) that were loaded.
len(ec_data)

2513

In [73]:
# Disabled sanity checks: per-column counts of infinite and of missing values.
# NOTE: `(np.inf or -np.inf)` evaluates to just `np.inf`, so the first check
# would not catch -inf; `np.isinf(ec_data).sum()` would cover both signs.
#ec_data[ec_data == (np.inf or -np.inf)].count()

#ec_data.isna().sum()

In [74]:
# Standardize the features; this also writes the fitted scaler to scalerEC.pkl.
scaled_ec_data = scale_data(ec_data, "EC")
scaled_ec_data.head()

Unnamed: 0,numProperties,numPublicAttributes,numPublicMethods,numProtectedFields,numProtectedMethods,numLongMethods,numLinesCode,lackOfCohesion,cyclomaticComplexity,cognitiveComplexity,...,numConstructors,halsteadLength,halsteadVocabulary,halsteadVolume,halsteadDifficulty,halsteadEffort,halsteadLevel,halsteadTime,halsteadBugsDelivered,halsteadMaintainability
0,-0.077269,-0.143305,-0.481923,-0.23456,-0.079668,-0.010988,-0.011218,0.410104,1.213342,0.347662,...,1.033267,1.147254,1.338405,1.106933,0.63399,0.67093,-0.44237,0.67093,5.421011e-20,-0.744836
1,-0.254872,-0.143305,-0.24101,-0.23456,-0.298952,-0.273961,0.165456,0.410104,0.246679,0.009862,...,2.934145,-0.162419,-0.158722,-0.189405,-0.260326,-0.253588,0.060127,-0.253588,-1.6263029999999999e-19,-0.124478
2,-0.610076,-0.143305,0.722646,-0.23456,-0.298952,2.355774,0.756006,-2.542984,-0.526956,-0.100033,...,-0.867611,1.15126,1.285595,1.064948,1.34518,0.818748,-0.693924,0.818748,-1.6263029999999999e-19,-1.21839
3,-0.254872,-0.143305,-0.361467,-0.23456,-0.298952,0.777933,0.100021,0.410104,0.204634,-0.050918,...,0.082828,2.361588,1.218492,2.362016,1.425033,1.908894,-0.836446,1.908894,-5.421011e-20,-1.742539
4,0.100333,-0.143305,4.938637,-0.23456,-0.298952,-0.536935,0.206353,0.410104,-0.648888,-0.100325,...,0.082828,-0.416613,-0.545795,-0.442752,-0.628149,-0.51694,0.242424,-0.51694,2.168404e-19,1.063474


In [75]:
# Covariance matrix of the standardized features. The diagonal is ~1, so the
# off-diagonal entries are close to the pairwise correlations.
scaled_ec_data.cov()

Unnamed: 0,numProperties,numPublicAttributes,numPublicMethods,numProtectedFields,numProtectedMethods,numLongMethods,numLinesCode,lackOfCohesion,cyclomaticComplexity,cognitiveComplexity,...,numConstructors,halsteadLength,halsteadVocabulary,halsteadVolume,halsteadDifficulty,halsteadEffort,halsteadLevel,halsteadTime,halsteadBugsDelivered,halsteadMaintainability
numProperties,1.000398,0.8162378,0.1804738,0.2533185,0.2102418,0.1944908,0.3546961,0.1646918,0.1293301,0.05577607,...,0.1198095,0.01554553,0.04277745,0.01860175,0.0204274,0.05351475,0.02606095,0.05351475,1.811044e-21,-0.02755178
numPublicAttributes,0.8162378,1.000398,0.04556043,0.01131135,0.1455773,0.03797176,0.1272528,0.02628637,0.02308698,0.0170051,...,0.04529902,-0.04151982,-0.02076877,-0.03631769,-0.04323545,-0.01856341,0.07317956,-0.01856341,-1.37212e-21,0.048215
numPublicMethods,0.1804738,0.04556043,1.000398,0.1103555,0.02358325,0.3864909,0.5560189,0.029328,-0.05073971,-0.02033884,...,0.1580884,-0.05042762,-0.01951543,-0.04927163,-0.0571481,-0.04930229,0.05877501,-0.04930229,4.154793e-21,0.05482025
numProtectedFields,0.2533185,0.01131135,0.1103555,1.000398,0.3183291,0.166769,0.2484274,0.0201595,0.09791951,0.02684528,...,0.04988972,0.01845604,0.01190579,0.01730858,0.01519138,0.01669557,-0.01263018,0.01669557,4.406368e-21,-0.04517904
numProtectedMethods,0.2102418,0.1455773,0.02358325,0.3183291,1.000398,0.1535042,0.2036082,-0.06700875,0.05749197,0.009430815,...,0.005522773,-0.01158274,-0.01317894,-0.01333187,-0.02842935,-0.02169439,0.01024466,-0.02169439,4.943436e-21,-0.01349914
numLongMethods,0.1944908,0.03797176,0.3864909,0.166769,0.1535042,1.000398,0.7730111,0.05131475,0.2670263,0.05835977,...,0.02926754,0.4703558,0.480598,0.4490458,0.455697,0.4069042,-0.285061,0.4069042,4.955839e-21,-0.4993783
numLinesCode,0.3546961,0.1272528,0.5560189,0.2484274,0.2036082,0.7730111,1.000398,0.08669405,0.2437911,0.07158615,...,0.1012641,0.2816404,0.3202484,0.265749,0.251656,0.221762,-0.191,0.221762,8.74048e-21,-0.3338937
lackOfCohesion,0.1646918,0.02628637,0.029328,0.0201595,-0.06700875,0.05131475,0.08669405,1.000398,0.1022294,0.02812987,...,0.164523,-0.02115994,-0.003220042,-0.02365359,-0.01350873,-0.003266075,-0.004842769,-0.003266075,2.590619e-22,-0.003014564
cyclomaticComplexity,0.1293301,0.02308698,-0.05073971,0.09791951,0.05749197,0.2670263,0.2437911,0.1022294,1.000398,0.8342603,...,0.01332868,0.3400421,0.3940005,0.3302798,0.3138058,0.3239795,-0.1993521,0.3239795,3.4931310000000003e-22,-0.4061637
cognitiveComplexity,0.05577607,0.0170051,-0.02033884,0.02684528,0.009430815,0.05835977,0.07158615,0.02812987,0.8342603,1.000398,...,0.00725352,0.0730648,0.1878501,0.07426137,0.06346323,0.08203167,-0.01062337,0.08203167,5.906033e-22,-0.1016168


In [76]:
# Print the feature pairs whose correlation is above 0.9 or below -0.9.
covariance(scaled_ec_data)

halsteadEffort  halsteadTime          1.000000
halsteadLength  halsteadVolume        0.988144
                halsteadDifficulty    0.927414
dtype: float64
Series([], dtype: float64)


In [77]:
# Tune the three detectors on the Extract Class data. The run recorded below
# took about 16 minutes.
# NOTE: this rebinds `df`, replacing the Extract Method search results.
df = grid_search(scaled_ec_data)
df.head()

Fitting 25 folds for each of 24 candidates, totalling 600 fits
Fitting 25 folds for each of 300 candidates, totalling 7500 fits
Fitting 25 folds for each of 5 candidates, totalling 125 fits




Started at: 09:36:21
Finished at: 09:52:13


Unnamed: 0,model,best_score,best_params
0,OneClassSVM,0.918951,"{'gamma': 'scale', 'kernel': 'sigmoid', 'nu': ..."
1,IsolationForest,0.919134,"{'contamination': 0.1, 'max_features': 3, 'max..."
2,EllipticEnvelope,0.99178,{'contamination': 0.01}


In [80]:
# Refit each detector with its best parameters on all scaled Extract Class
# data and save the fitted models under models/ with the 'EC' suffix.
save_models(df, scaled_ec_data, 'EC')

