In [1]:
import pandas as pd
import numpy as np
import sqlite3

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import make_scorer, fbeta_score
from joblib import dump, load
from sklearn.model_selection import cross_val_score

In [2]:
#CHANGE THIS
#DATA_PATH = "C:\\Users\\dluis\\.gradle\\caches\\modules-2\\files-2.1\\com.jetbrains.intellij.idea\\ideaIC\\2021.1.1\\e051d885e757b286781f50305504d7b8db3e1dba\\ideaIC-2021.1.1\\bin\\tmp\\metrics.db"
DATA_PATH = "data/metrics.db"

In [3]:
def load_data(data_path):
    conn = sqlite3.connect(data_path)
    cursor = conn.cursor()

    cursor.execute("SELECT numberLinesOfCodeBef, numberCommentsBef, " +
                   "numberBlankLinesBef, totalLinesBef, numParametersBef, " +
                   "numStatementsBef, halsteadLengthBef, halsteadVocabularyBef, " +
                   "halsteadVolumeBef, halsteadDifficultyBef, halsteadEffortBef, " +
                   "halsteadLevelBef, halsteadTimeBef, halsteadBugsDeliveredBef, " +
                   "halsteadMaintainabilityBef, cyclomaticComplexityBef, " +
                   "cognitiveComplexityBef, lackOfCohesionInMethodBef " +
                   "FROM metrics")
    rows = cursor.fetchall()

    df = pd.DataFrame(rows, columns=[
        'numberLinesOfCodeBef', 'numberCommentsBef',
        'numberBlankLinesBef', 'totalLinesBef', 'numParametersBef',
        'numStatementsBef', 'halsteadLengthBef', 'halsteadVocabularyBef',
        'halsteadVolumeBef', 'halsteadDifficultyBef', 'halsteadEffortBef',
        'halsteadLevelBef', 'halsteadTimeBef', 'halsteadBugsDeliveredBef',
        'halsteadMaintainabilityBef', 'cyclomaticComplexityBef',
        'cognitiveComplexityBef', 'lackOfCohesionInMethodBef'
    ])
    
    cursor.close()
    conn.close()
    
    return df

In [8]:
def delete_infinite_values(data_path):
    conn = sqlite3.connect(data_path)
    cursor = conn.cursor()

    cursor.execute("DELETE FROM metrics WHERE CAST(halsteadLevelBef AS CHARACTER) ='Inf';")
    conn.commit()
    
    cursor.close()
    conn.close()
    
delete_infinite_values(DATA_PATH)

"""
is_inf = data.isin([np.inf, -np.inf])
data[is_inf.any(axis=1)]
"""

In [9]:
data = load_data(DATA_PATH)
data.head()

Unnamed: 0,numberLinesOfCodeBef,numberCommentsBef,numberBlankLinesBef,totalLinesBef,numParametersBef,numStatementsBef,halsteadLengthBef,halsteadVocabularyBef,halsteadVolumeBef,halsteadDifficultyBef,halsteadEffortBef,halsteadLevelBef,halsteadTimeBef,halsteadBugsDeliveredBef,halsteadMaintainabilityBef,cyclomaticComplexityBef,cognitiveComplexityBef,lackOfCohesionInMethodBef
0,57,0,5,62,2,55,77.0,321.0,482.542564,14.721973,7103.978639,0.067926,394.66548,0.000333,40.351702,19,122,1.0
1,4,0,2,6,2,2,45.0,91.0,247.133389,8.419355,2080.703697,0.118774,115.59465,0.000333,69.976845,1,1,1.0
2,3,0,2,5,2,1,21.0,36.0,92.238666,1.451613,133.894838,0.688889,7.438602,0.000333,75.699244,1,1,1.0
3,3,0,0,3,1,1,29.0,40.0,140.881449,5.537037,780.0658,0.180602,43.336989,0.000333,74.411289,1,1,0.333333
4,11,0,0,11,2,6,26.0,49.0,122.211433,3.405405,416.179473,0.293651,23.121082,0.000333,62.131101,4,4,1.0


In [10]:
len(data)

23935

In [49]:
scaler = StandardScaler()
scaler.fit(data)
dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [50]:
temp = scaler.transform(data)
scaled_data = pd.DataFrame(temp, columns=data.columns, index=data.index)
scaled_data.head()

Unnamed: 0,numberLinesOfCodeBef,numberCommentsBef,numberBlankLinesBef,totalLinesBef,numParametersBef,numStatementsBef,halsteadLengthBef,halsteadVocabularyBef,halsteadVolumeBef,halsteadDifficultyBef,halsteadEffortBef,halsteadLevelBef,halsteadTimeBef,halsteadBugsDeliveredBef,halsteadMaintainabilityBef,cyclomaticComplexityBef,cognitiveComplexityBef,lackOfCohesionInMethodBef
0,0.403648,-0.41882,0.101325,0.298221,0.328182,0.650995,0.249069,0.0008,0.185985,0.116235,-0.077534,-0.160668,-0.077534,0.0,-0.947121,0.886061,0.086448,0.582478
1,-0.636126,-0.41882,-0.326478,-0.616459,0.328182,-0.650054,-0.48503,-0.537087,-0.480368,-0.43238,-0.278426,-0.079343,-0.278426,0.0,1.402077,-0.517675,-0.13331,0.582478
2,-0.655745,-0.41882,-0.326478,-0.632793,0.328182,-0.674602,-1.035605,-0.665712,-0.918816,-1.038891,-0.356283,0.832475,-0.356283,0.0,1.855848,-0.517675,-0.13331,0.582478
3,-0.655745,-0.41882,-0.611681,-0.66546,-0.247152,-0.674602,-0.85208,-0.656357,-0.781127,-0.683273,-0.330442,0.019542,-0.330442,0.0,1.753717,-0.517675,-0.13331,-1.009267
4,-0.498798,-0.41882,-0.611681,-0.534791,0.328182,-0.551862,-0.920902,-0.635309,-0.833974,-0.868822,-0.344994,0.200348,-0.344994,0.0,0.779929,-0.283719,-0.127861,0.582478


In [51]:
scaled_data.cov()

Unnamed: 0,numberLinesOfCodeBef,numberCommentsBef,numberBlankLinesBef,totalLinesBef,numParametersBef,numStatementsBef,halsteadLengthBef,halsteadVocabularyBef,halsteadVolumeBef,halsteadDifficultyBef,halsteadEffortBef,halsteadLevelBef,halsteadTimeBef,halsteadBugsDeliveredBef,halsteadMaintainabilityBef,cyclomaticComplexityBef,cognitiveComplexityBef,lackOfCohesionInMethodBef
numberLinesOfCodeBef,1.000042,0.65353,0.705351,0.990549,0.109991,0.967108,0.815876,0.898723,0.829638,0.693731,0.714521,0.006454,0.714521,0.0,-0.786879,0.882481,0.679321,0.088524
numberCommentsBef,0.65353,1.000042,0.620489,0.733246,0.094741,0.634936,0.542341,0.568725,0.547823,0.445984,0.426044,-0.003983,0.426044,0.0,-0.55373,0.627433,0.39604,0.096831
numberBlankLinesBef,0.705351,0.620489,1.000042,0.775053,0.041247,0.694164,0.629408,0.653089,0.626495,0.525578,0.450076,0.012479,0.450076,0.0,-0.645658,0.566044,0.392342,0.066479
totalLinesBef,0.990549,0.733246,0.775053,1.000042,0.107485,0.959654,0.815393,0.890196,0.827165,0.690431,0.696738,0.006332,0.696738,0.0,-0.794457,0.873635,0.657276,0.092749
numParametersBef,0.109991,0.094741,0.041247,0.107485,1.000042,0.114711,0.161571,0.114549,0.158327,0.089893,0.097314,-0.010975,0.097314,0.0,-0.139711,0.186868,0.069146,0.073566
numStatementsBef,0.967108,0.634936,0.694164,0.959654,0.114711,1.000042,0.81696,0.909956,0.829568,0.69439,0.708066,0.013857,0.708066,0.0,-0.78928,0.905565,0.708627,0.079651
halsteadLengthBef,0.815876,0.542341,0.629408,0.815393,0.161571,0.81696,1.000042,0.830237,0.996629,0.855779,0.740259,-0.050304,0.740259,0.0,-0.861594,0.691123,0.407869,0.10673
halsteadVocabularyBef,0.898723,0.568725,0.653089,0.890196,0.114549,0.909956,0.830237,1.000042,0.842625,0.729922,0.741789,-0.014375,0.741789,0.0,-0.747051,0.778781,0.620011,0.064655
halsteadVolumeBef,0.829638,0.547823,0.626495,0.827165,0.158327,0.829568,0.996629,0.842625,1.000042,0.853208,0.776986,-0.044112,0.776986,0.0,-0.832094,0.707648,0.438992,0.103455
halsteadDifficultyBef,0.693731,0.445984,0.525578,0.690431,0.089893,0.69439,0.855779,0.729922,0.853208,1.000042,0.812399,-0.174598,0.812399,0.0,-0.740635,0.565223,0.332073,0.079078


In [52]:
unstack_matrix = scaled_data.corr().unstack().sort_values(ascending=False)

# Drop Repetitive Values

cols = scaled_data.columns
for i in range(0, scaled_data.shape[1]):
    for j in range(0, i+1):
        drop = (cols[i], cols[j])
        if(drop in unstack_matrix.keys()):
            unstack_matrix.drop(drop, inplace=True)

print(unstack_matrix[unstack_matrix > 0.9])
print(unstack_matrix[unstack_matrix < -0.9])

halsteadEffortBef     halsteadTimeBef            1.000000
halsteadLengthBef     halsteadVolumeBef          0.996587
numberLinesOfCodeBef  totalLinesBef              0.990508
                      numStatementsBef           0.967068
totalLinesBef         numStatementsBef           0.959614
numStatementsBef      halsteadVocabularyBef      0.909918
                      cyclomaticComplexityBef    0.905527
dtype: float64
Series([], dtype: float64)


In [53]:
X = scaled_data.values
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)
y_train = [1 for i in range(len(X_train))]

one_class_svm = OneClassSVM(kernel='rbf', nu=0.1, gamma='auto')
one_class_svm.fit(X_train, y_train)

y = [1 for i in range(len(X))]
scores = cross_val_score(one_class_svm, X, y, cv=5, scoring=f2_score)

print("F2 Scores:", scores)
print("Mean F2 Score:", scores.mean())

NameError: name 'f2_score' is not defined

In [None]:
data = scaled_data.head(10000).copy()

In [None]:
def grid_search(X_train):
    model_params = {
        'OneClassSVM': {
            'model': OneClassSVM(),
            'params': {
                'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
                'nu': [0.1, 0.2, 0.3],
                'gamma': ['scale', 'auto']
            }
        },
        'IsolationForest': {
            'model': IsolationForest(),
            'params': {
                'contamination': [0.1, 0.2],
                'n_estimators': [50, 100, 200, 300, 400, 500],
                'max_samples': [100, 200, 300, 400, 500],
                'max_features': [1, 2, 3, 4, 5]
            }
        },
        'EllipticEnvelope': {
            'model': EllipticEnvelope(),
            'params': {
                'contamination': [0.01, 0.02, 0.03, 0.04, 0.05]
            }
        }
    }
     
    """
    # Testing Parameters
    model_params = {
        'OneClassSVM': {
            'model': OneClassSVM(),
            'params': {
                'kernel': ['rbf', 'linear'],
                'nu': [0.1, 0.2],
                'gamma': ['scale', 'auto']
            }
        },
        'IsolationForest': {
            'model': IsolationForest(),
            'params': {
                'contamination': [0.1, 0.2],
                'n_estimators': [50, 100],
                'max_samples': [100, 200],
                'max_features': [1, 2]
            }
        },
        'EllipticEnvelope': {
            'model': EllipticEnvelope(),
            'params': {
                'contamination': [0.01, 0.02]
            }
        }
    }
    """
    
    kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=0)
    y_train = [1 for i in range(len(X_train))] # 1 for inliers
    
    scores = []
    
    f2_score = make_scorer(fbeta_score, beta=2, pos_label=1)
    
    for model_name, mp in model_params.items():
        grid_search = GridSearchCV(mp['model'],
                                    param_grid=mp['params'],
                                    return_train_score=False,
                                    cv=kf,
                                    n_jobs=-1,
                                    verbose=True,
                                     scoring=f2_score)
        grid_search.fit(X_train, y_train)
        scores.append({
            'model': model_name,
            'best_score': grid_search.best_score_,
            'best_params': grid_search.best_params_
        })
    
    df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
    
    return df

In [54]:
from time import localtime, strftime
strftime("%H:%M:%S", localtime())

'12:17:03'

In [55]:
temp_data = data
X = temp_data.values
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

grid_search_values = grid_search(X_train)

Fitting 25 folds for each of 24 candidates, totalling 600 fits
Fitting 25 folds for each of 300 candidates, totalling 7500 fits
Fitting 25 folds for each of 5 candidates, totalling 125 fits




In [56]:
from time import localtime, strftime
strftime("%H:%M:%S", localtime())

'13:34:13'

In [39]:
df = grid_search_values

In [40]:
df.head()

Unnamed: 0,model,best_score,best_params
0,OneClassSVM,0.918192,"{'gamma': 'scale', 'kernel': 'sigmoid', 'nu': ..."
1,IsolationForest,0.918893,"{'contamination': 0.1, 'max_features': 4, 'max..."
2,EllipticEnvelope,0.991579,{'contamination': 0.01}


In [41]:
one_class_svm = OneClassSVM(kernel=df.iloc[0]['best_params']['kernel'], nu=df.iloc[0]['best_params']['nu'], gamma=df.iloc[0]['best_params']['gamma'])
isolation_forest = IsolationForest(contamination=df.iloc[1]['best_params']['contamination'], n_estimators=df.iloc[1]['best_params']['n_estimators'], max_samples=df.iloc[1]['best_params']['max_samples'], max_features=df.iloc[1]['best_params']['max_features'])
elliptic_envelope = EllipticEnvelope(contamination=df.iloc[2]['best_params']['contamination'])

one_class_svm.fit(X_train)
isolation_forest.fit(X_train)
elliptic_envelope.fit(X_train)

dump(one_class_svm, 'models/one_class_svm.joblib')
dump(isolation_forest, 'models/isolation_forest.joblib')
dump(elliptic_envelope, 'models/elliptic_envelope.joblib')



['models/elliptic_envelope.joblib']