In [1]:
import os

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import make_scorer, fbeta_score
from joblib import dump, load

In [6]:
# Location of the extracted-metrics CSV read by load_data().
# Every backslash is escaped: the original literal contained a bare "\C",
# which is an invalid escape sequence (it only worked because Python keeps
# unknown escapes verbatim, and newer versions emit a warning for it).
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running the notebook elsewhere.
DATA_PATH = "C:\\Users\\dluis\\Documents\\Docs\\Universidade\\M 2 ano\\Thesis\\DISS\\Classification Model\\extracted_metrics.csv"

In [7]:
def load_data(path):
    """Read the CSV file at ``path`` and return its contents as a DataFrame.

    Parameters
    ----------
    path : str
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Every column of the file, unmodified.
    """
    # Column filtering is currently disabled. The earlier idea was to keep only
    # the needed features by dropping the 1st column (id) and columns 19-36
    # (the "after changes" metrics):
    #   df.drop(df.columns[[0, 19, 20, ..., 36]], axis=1, inplace=True)
    return pd.read_csv(path)

In [8]:
# Load the metrics CSV into the notebook-wide `data` frame and preview its
# first rows as the cell output.
data = load_data(DATA_PATH)
data.head()

Unnamed: 0,numberLinesOfCodeBef,numberCommentsBef,numberBlankLinesBef,totalLinesBef,numParametersBef,numStatementsBef,halsteadLengthBef,halsteadVocabularyBef,halsteadVolumeBef,halsteadDifficultyBef,halsteadEffortBef,halsteadLevelBef,halsteadTimeBef,halsteadBugsDeliveredBef,halsteadMaintainabilityBef,cyclomaticComplexityBef,cognitiveComplexityBef,lackOfCohesionInMethodBef
0,47,0,0,47,1,42,59.0,363.0,347.07594,13.392857,4648.338481,0.074667,258.241027,0.000333,43.315779,18,193,1.0
1,47,0,0,47,1,42,59.0,363.0,347.07594,13.392857,4648.338481,0.074667,258.241027,0.000333,43.315779,18,193,1.0
2,47,0,0,47,1,42,59.0,363.0,347.07594,13.392857,4648.338481,0.074667,258.241027,0.000333,43.315779,18,193,1.0
3,47,0,0,47,1,42,59.0,363.0,347.07594,13.392857,4648.338481,0.074667,258.241027,0.000333,43.315779,18,193,1.0
4,47,0,0,47,1,42,59.0,363.0,347.07594,13.392857,4648.338481,0.074667,258.241027,0.000333,43.315779,18,193,1.0


In [7]:
def _full_model_params(random_state=0):
    """Return the full search space intended for the real experiments (slow)."""
    return {
        'OneClassSVM': {
            'model': OneClassSVM(),
            'params': {
                'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
                'nu': [0.1, 0.2, 0.3, 0.4, 0.5],
                'gamma': ['scale', 'auto']
            }
        },
        'IsolationForest': {
            'model': IsolationForest(random_state=random_state),
            'params': {
                'contamination': [0.1, 0.2],
                'n_estimators': [50, 100, 200, 300, 400, 500],
                'max_samples': [100, 200, 300, 400, 500],
                'max_features': [1, 2, 3, 4, 5]
            }
        },
        'EllipticEnvelope': {
            'model': EllipticEnvelope(random_state=random_state),
            'params': {
                'contamination': [0.01, 0.02, 0.03, 0.04, 0.05]
            }
        }
    }


def _testing_model_params(random_state=0):
    """Return the reduced search space used for quick test runs."""
    return {
        'OneClassSVM': {
            'model': OneClassSVM(),
            'params': {
                'kernel': ['rbf', 'linear'],
                'nu': [0.1, 0.2],
                'gamma': ['scale', 'auto']
            }
        },
        'IsolationForest': {
            'model': IsolationForest(random_state=random_state),
            'params': {
                'contamination': [0.1, 0.2],
                'n_estimators': [50, 100],
                'max_samples': [100, 200],
                'max_features': [1, 2]
            }
        },
        'EllipticEnvelope': {
            'model': EllipticEnvelope(random_state=random_state),
            'params': {
                'contamination': [0.01, 0.02]
            }
        }
    }


def grid_search(X_train, model_params=None):
    """Grid-search the hyper-parameters of several one-class models.

    Every training sample is labelled as an inlier (1) and each candidate is
    scored with the F2 measure under a repeated 5-fold cross-validation
    (5 splits x 5 repeats, seeded with 0).

    NOTE(review): because every true label is 1, a false positive is
    impossible, so the F2 score only reflects how many training samples a
    candidate accepts as inliers. It therefore tends to favour the smallest
    ``nu`` / ``contamination`` - confirm this is the intended criterion.

    Parameters
    ----------
    X_train : array-like
        Training samples; all of them are treated as inliers.
    model_params : dict, optional
        Mapping ``name -> {'model': estimator, 'params': param_grid}``.
        Defaults to the reduced testing grid (``_testing_model_params()``).
        Pass ``_full_model_params()`` for the complete search space.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``model``, ``best_score`` and
        ``best_params``, in the iteration order of ``model_params``.
    """
    if model_params is None:
        # TODO: change to the actual parameters (_full_model_params()) for the final run
        model_params = _testing_model_params()

    # With a single class the stratification has nothing to balance; the
    # splitter is kept so the folds stay the same as before.
    kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=0)
    y_train = np.ones(len(X_train), dtype=int)  # 1 for inliers

    f2_score = make_scorer(fbeta_score, beta=2, pos_label=1)

    scores = []
    for model_name, mp in model_params.items():
        # Named `search` so the local does not shadow this function's own name.
        search = GridSearchCV(mp['model'],
                              param_grid=mp['params'],
                              return_train_score=False,
                              cv=kf,
                              n_jobs=-1,
                              verbose=True,
                              scoring=f2_score)
        search.fit(X_train, y_train)
        scores.append({
            'model': model_name,
            'best_score': search.best_score_,
            'best_params': search.best_params_
        })

    return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])

In [8]:
# Every column of the loaded frame is used as a feature.
X = data.to_numpy()

# Hold out 20% of the rows; the split is seeded so it can be repeated.
X_train, X_test = train_test_split(X, random_state=42, test_size=0.2)

# Hyper-parameter search on the training split only.
grid_search_values = grid_search(X_train)

Fitting 25 folds for each of 8 candidates, totalling 200 fits
Fitting 25 folds for each of 16 candidates, totalling 400 fits


  warn(


Fitting 25 folds for each of 2 candidates, totalling 50 fits




In [9]:
# Refit each model on the whole training split with the hyper-parameters the
# grid search selected, then persist the fitted estimators under models/.
df = grid_search_values

# Look the winning parameters up by model name rather than by row position,
# so the cell does not depend on the order of the rows in the results table.
best_params_by_model = dict(zip(df['model'], df['best_params']))

# Unpacking the dict passes exactly the parameters that were searched.
# The stochastic estimators are seeded so the saved models can be reproduced.
one_class_svm = OneClassSVM(**best_params_by_model['OneClassSVM'])
isolation_forest = IsolationForest(random_state=0, **best_params_by_model['IsolationForest'])
elliptic_envelope = EllipticEnvelope(random_state=0, **best_params_by_model['EllipticEnvelope'])

one_class_svm.fit(X_train)
isolation_forest.fit(X_train)
elliptic_envelope.fit(X_train)

# dump() does not create missing directories, so make sure the target exists.
os.makedirs('models', exist_ok=True)

dump(one_class_svm, 'models/one_class_svm.joblib')
dump(isolation_forest, 'models/isolation_forest.joblib')
dump(elliptic_envelope, 'models/elliptic_envelope.joblib')

  warn(


['models/elliptic_envelope.joblib']