In [5]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import make_scorer, fbeta_score
from joblib import dump, load

In [6]:
#CHANGE THIS
DATA_PATH = "C:\\Users\\dluis\\.gradle\\caches\\modules-2\files-2.1\com.jetbrains.intellij.idea\ideaIC\2021.1.1\e051d885e757b286781f50305504d7b8db3e1dba\ideaIC-2021.1.1\bin\tmp\metrics.db""

In [7]:
def load_data(data_path):
    conn = sqlite3.connect(data_path)
    cursor = conn.cursor()

    cursor.execute("SELECT author, numberLinesOfCodeBef, numberCommentsBef, " +
                   "numberBlankLinesBef, totalLinesBef, numParametersBef, " +
                   "numStatementsBef, halsteadLengthBef, halsteadVocabularyBef, " +
                   "halsteadVolumeBef, halsteadDifficultyBef, halsteadEffortBef, " +
                   "halsteadLevelBef, halsteadTimeBef, halsteadBugsDeliveredBef, " +
                   "halsteadMaintainabilityBef, cyclomaticComplexityBef, " +
                   "cognitiveComplexityBef, lackOfCohesionInMethodBef " +
                   "FROM metrics")
    rows = cursor.fetchall()
    conn.close()

    df = pd.DataFrame(rows, columns=[
        'author', 'numberLinesOfCodeBef', 'numberCommentsBef',
        'numberBlankLinesBef', 'totalLinesBef', 'numParametersBef',
        'numStatementsBef', 'halsteadLengthBef', 'halsteadVocabularyBef',
        'halsteadVolumeBef', 'halsteadDifficultyBef', 'halsteadEffortBef',
        'halsteadLevelBef', 'halsteadTimeBef', 'halsteadBugsDeliveredBef',
        'halsteadMaintainabilityBef', 'cyclomaticComplexityBef',
        'cognitiveComplexityBef', 'lackOfCohesionInMethodBef'
    ])
    
    return df

In [8]:
data = load_data(DATA_PATH)
data.head()

Unnamed: 0,author,numberLinesOfCodeBef,numberCommentsBef,numberBlankLinesBef,totalLinesBef,numParametersBef,numStatementsBef,halsteadLengthBef,halsteadVocabularyBef,halsteadVolumeBef,halsteadDifficultyBef,halsteadEffortBef,halsteadLevelBef,halsteadTimeBef,halsteadBugsDeliveredBef,halsteadMaintainabilityBef,cyclomaticComplexityBef,cognitiveComplexityBef,lackOfCohesionInMethodBef
0,up201906807@fe.up.pt,33,1,0,34,1,29,58.0,226.0,339.762898,11.217949,3811.442763,0.089143,211.74682,0.000333,46.999817,16,156,1.0
1,up201906807@fe.up.pt,33,1,0,34,1,29,58.0,226.0,339.762898,11.217949,3811.442763,0.089143,211.74682,0.000333,46.999817,16,156,1.0
2,up201906807@fe.up.pt,33,1,0,34,1,29,58.0,226.0,339.762898,11.217949,3811.442763,0.089143,211.74682,0.000333,46.999817,16,156,1.0
3,up201906807@fe.up.pt,33,1,0,34,1,29,58.0,226.0,339.762898,11.217949,3811.442763,0.089143,211.74682,0.000333,46.999817,16,156,1.0
4,andrejesusferflores@gmail.com,22,0,1,23,2,18,46.0,151.0,254.08385,11.134021,2828.974825,0.089815,157.165268,0.000333,53.338747,4,4,1.0


In [9]:
def grid_search(X_train):
    
    #TODO: change to the actual parameters
    """ Actual Parameters to be used in the models
    model_params = {
        'OneClassSVM': {
            'model': OneClassSVM(),
            'params': {
                'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
                'nu': [0.1, 0.2, 0.3, 0.4, 0.5],
                'gamma': ['scale', 'auto']
            }
        },
        'IsolationForest': {
            'model': IsolationForest(),
            'params': {
                'contamination': [0.1, 0.2],
                'n_estimators': [50, 100, 200, 300, 400, 500],
                'max_samples': [100, 200, 300, 400, 500],
                'max_features': [1, 2, 3, 4, 5]
            }
        },
        'EllipticEnvelope': {
            'model': EllipticEnvelope(),
            'params': {
                'contamination': [0.01, 0.02, 0.03, 0.04, 0.05]
            }
        }
    } """
 
    # Testing Parameters
    model_params = {
        'OneClassSVM': {
            'model': OneClassSVM(),
            'params': {
                'kernel': ['rbf', 'linear'],
                'nu': [0.1, 0.2],
                'gamma': ['scale', 'auto']
            }
        },
        'IsolationForest': {
            'model': IsolationForest(),
            'params': {
                'contamination': [0.1, 0.2],
                'n_estimators': [50, 100],
                'max_samples': [100, 200],
                'max_features': [1, 2]
            }
        },
        'EllipticEnvelope': {
            'model': EllipticEnvelope(),
            'params': {
                'contamination': [0.01, 0.02]
            }
        }
    }
    
    kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=0)
    y_train = [1 for i in range(len(X_train))] # 1 for inliers
    
    scores = []
    
    f2_score = make_scorer(fbeta_score, beta=2, pos_label=1)
    
    for model_name, mp in model_params.items():
        grid_search = GridSearchCV(mp['model'],
                                    param_grid=mp['params'],
                                    return_train_score=False,
                                    cv=kf,
                                    n_jobs=-1,
                                    verbose=True,
                                     scoring=f2_score)
        grid_search.fit(X_train, y_train)
        scores.append({
            'model': model_name,
            'best_score': grid_search.best_score_,
            'best_params': grid_search.best_params_
        })
    
    df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
    
    return df

In [12]:
temp_data = data.drop(columns='author')
X = temp_data.values
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

grid_search_values = grid_search(X_train)

Fitting 25 folds for each of 8 candidates, totalling 200 fits
Fitting 25 folds for each of 16 candidates, totalling 400 fits


  warn(


Fitting 25 folds for each of 2 candidates, totalling 50 fits




In [13]:
X_train

array([[3.30000000e+01, 1.00000000e+00, 0.00000000e+00, 3.40000000e+01,
        1.00000000e+00, 2.90000000e+01, 5.80000000e+01, 2.26000000e+02,
        3.39762898e+02, 1.12179487e+01, 3.81144276e+03, 8.91428571e-02,
        2.11746820e+02, 3.33333333e-04, 4.69998174e+01, 1.60000000e+01,
        1.56000000e+02, 1.00000000e+00],
       [1.10000000e+01, 0.00000000e+00, 2.00000000e+00, 1.30000000e+01,
        1.00000000e+00, 7.00000000e+00, 3.90000000e+01, 9.10000000e+01,
        2.06130687e+02, 5.91044776e+00, 1.21832465e+03, 1.69191919e-01,
        6.76847030e+01, 3.33333333e-04, 6.08104343e+01, 2.00000000e+00,
        1.00000000e+00, 0.00000000e+00],
       [3.30000000e+01, 1.00000000e+00, 0.00000000e+00, 3.40000000e+01,
        1.00000000e+00, 2.90000000e+01, 5.80000000e+01, 2.26000000e+02,
        3.39762898e+02, 1.12179487e+01, 3.81144276e+03, 8.91428571e-02,
        2.11746820e+02, 3.33333333e-04, 4.69998174e+01, 1.60000000e+01,
        1.56000000e+02, 1.00000000e+00],
       [3.300

In [14]:
df = grid_search_values
 
one_class_svm = OneClassSVM(kernel=df.iloc[0]['best_params']['kernel'], nu=df.iloc[0]['best_params']['nu'], gamma=df.iloc[0]['best_params']['gamma'])
isolation_forest = IsolationForest(contamination=df.iloc[1]['best_params']['contamination'], n_estimators=df.iloc[1]['best_params']['n_estimators'], max_samples=df.iloc[1]['best_params']['max_samples'], max_features=df.iloc[1]['best_params']['max_features'])
elliptic_envelope = EllipticEnvelope(contamination=df.iloc[2]['best_params']['contamination'])

one_class_svm.fit(X_train)
isolation_forest.fit(X_train)
elliptic_envelope.fit(X_train)

dump(one_class_svm, 'models/one_class_svm.joblib')
dump(isolation_forest, 'models/isolation_forest.joblib')
dump(elliptic_envelope, 'models/elliptic_envelope.joblib')

  warn(


['models/elliptic_envelope.joblib']

In [15]:
#Untested sample weights - will give double weight to data with a specific author (may need to adjust)
def get_sample_weights(X_train, author):
    sample_weights = np.where(X_train['author_name'] == author, 2, 1)

In [16]:
temp = OneClassSVM(kernel=df.iloc[0]['best_params']['kernel'], nu=df.iloc[0]['best_params']['nu'], gamma=df.iloc[0]['best_params']['gamma'])

sample_weights = get_sample_weights(X_train, "author")

temp.fit(X_train, sample_weight=sample_weights)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices