# Parametrizing

In [1]:
# Default run parameters: paths to the dataset / model JSON configs and a
# switch for the optional profiling report.
# NOTE(review): the next cell reassigns all three names, so these values only
# take effect when that cell is absent — confirm which set is intended.
DATASET_CONFIG_PATH = 'config/dataset/kc1_001.json'
MODEL_CONFIG_PATH = 'config/model/knndd_001.json'
PROFILE_REPORTING = False

In [2]:
# Parameters
# These assignments overwrite the values of the same three names set in the
# previous cell (presumably injected by a parametrized-run tool — TODO confirm).
DATASET_CONFIG_PATH = "config/dataset/000118.json"
MODEL_CONFIG_PATH = "config/model/000004.json"
PROFILE_REPORTING = False


In [3]:
import json


# Load the dataset and model configurations.
# JSON is decoded as UTF-8 explicitly so the result does not depend on the
# platform's locale encoding.
with open(DATASET_CONFIG_PATH, encoding='utf-8') as f:
    dataset_config = json.load(f)

with open(MODEL_CONFIG_PATH, encoding='utf-8') as f:
    model_config = json.load(f)

# Profiling

In [4]:
import pandas as pd
from pandas_profiling import ProfileReport

In [5]:
# Read the raw dataset from the CSV path named in the dataset config.
df = pd.read_csv(dataset_config['dataset_path'])

In [6]:
# Show column names, non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 498 entries, 0 to 497
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 498 non-null    int64  
 1   loc                498 non-null    float64
 2   v(g)               498 non-null    float64
 3   ev(g)              498 non-null    float64
 4   iv(g)              498 non-null    float64
 5   n                  498 non-null    float64
 6   v                  498 non-null    float64
 7   l                  498 non-null    float64
 8   d                  498 non-null    float64
 9   i                  498 non-null    float64
 10  e                  498 non-null    float64
 11  b                  498 non-null    float64
 12  t                  498 non-null    float64
 13  lOCode             498 non-null    int64  
 14  lOComment          498 non-null    int64  
 15  lOBlank            498 non-null    int64  
 16  locCodeAndComment  498 non

In [7]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,id,loc,v(g),ev(g),iv(g),n,v,l,d,i,...,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
0,1,1.1,1.4,1.4,1.4,1.3,1.3,1.3,1.3,1.3,...,2,2,2,2,1.2,1.2,1.2,1.2,1.4,False
1,2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1,1,1,1,1.0,1.0,1.0,1.0,1.0,True
2,3,24.0,5.0,1.0,3.0,63.0,309.13,0.11,9.5,32.54,...,1,0,6,0,15.0,15.0,44.0,19.0,9.0,False
3,4,20.0,4.0,4.0,2.0,47.0,215.49,0.06,16.0,13.47,...,0,0,3,0,16.0,8.0,31.0,16.0,7.0,False
4,5,24.0,6.0,6.0,2.0,72.0,346.13,0.06,17.33,19.97,...,0,0,3,0,16.0,12.0,46.0,26.0,11.0,False


In [8]:
# Remove the 'id' column from the raw frame in place.
df.drop(columns=['id'], inplace=True)

In [9]:
# Optional profiling report, rendered as widgets only when the flag is set.
if PROFILE_REPORTING:
    profile = ProfileReport(df)
    profile.to_widgets()

# Preprocess

In [10]:
# Work on a copy so the preprocessing below leaves `df` untouched.
working_df = df.copy()

In [11]:
# Recode the boolean 'defects' label as integers: False -> 0, True -> 1.
defect_codes = {False: 0, True: 1}
working_df['defects'] = working_df['defects'].replace(defect_codes)

In [12]:
# Drop, one at a time and in place, every column listed under
# 'delete_columns' in the dataset config.
columns_to_delete = dataset_config['delete_columns']
for col in columns_to_delete:
    working_df.drop(columns=[col], inplace=True)

In [13]:
# Remove fully duplicated rows (the original index labels are kept).
working_df = working_df.drop_duplicates()

# Split

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Partition the rows by label: 0 -> inlier, 1 -> outlier.
is_inlier = working_df['defects'] == 0
is_outlier = working_df['defects'] == 1
inlier = working_df[is_inlier]
outlier = working_df[is_outlier]

In [16]:
# Copy each partition, then pop the 'defects' label column out of the copy so
# X_* holds features only and y_* holds the labels.
X_inlier, X_outlier = inlier.copy(), outlier.copy()
y_inlier, y_outlier = X_inlier.pop('defects'), X_outlier.pop('defects')

In [17]:
# Split the inliers into train / test parts using the fraction and seed
# given in the dataset config.
test_size, random_state = dataset_config['test_size'], dataset_config['random_state']

inlier_split = train_test_split(X_inlier, y_inlier, test_size=test_size, random_state=random_state)
X_inlier_train, X_inlier_test, y_inlier_train, y_inlier_test = inlier_split

In [18]:
# Split the outliers. A positive 'test_outlier_size' sends that share of the
# outliers to the test set and the rest to training; otherwise every outlier
# goes to the test set and the training part is empty.
test_outlier_size = dataset_config['test_outlier_size']
if test_outlier_size > 0.0:
    X_outlier_train, X_outlier_test, y_outlier_train, y_outlier_test = train_test_split(
        X_outlier,
        y_outlier,
        test_size=test_outlier_size,
        random_state=random_state
    )
else:
    # Empty slices keep the column dtypes (and the label name) of the source
    # data, so concatenating them with the inlier split cannot change the
    # resulting dtypes. A bare `pd.DataFrame(columns=...)` would have
    # object-dtype columns instead.
    X_outlier_train, y_outlier_train = X_outlier.iloc[:0].copy(), y_outlier.iloc[:0].copy()
    X_outlier_test, y_outlier_test = X_outlier.copy(), y_outlier.copy()

In [19]:
# Stack the inlier and outlier parts (inliers first) into the final train and
# test sets, renumbering the index from 0.
X_train = pd.concat([X_inlier_train, X_outlier_train], ignore_index=True)
y_train = pd.concat([y_inlier_train, y_outlier_train], ignore_index=True)
X_test = pd.concat([X_inlier_test, X_outlier_test], ignore_index=True)
y_test = pd.concat([y_inlier_test, y_outlier_test], ignore_index=True)

# Model

In [20]:
from math import sqrt


class KNNDataDescription:
    """k-nearest-neighbour data description for one-class classification.

    A query point is scored by the ratio between (a) its distance to its
    k-th nearest training point and (b) the distance from that training point
    to its own (k+1)-th nearest training point. The extra step skips the
    training point itself, which is at distance 0. Scores below
    ``outlier_threshold`` are labelled ``1``; all other scores are
    labelled ``-1``.

    Parameters
    ----------
    k : int
        Rank of the neighbour used for both distances.
    outlier_threshold : float
        Upper bound (exclusive) on the score of a point labelled ``1``.
    """

    # Added to the denominator of the outlier score so that a zero
    # neighbour-to-neighbour distance cannot cause a division by zero.
    EPSILON = 0.000001

    def __init__(self, k=5, outlier_threshold=1.0):
        self.k = k
        self.outlier_threshold = outlier_threshold
        self.X = []

    @staticmethod
    def _as_rows(data):
        """Return ``data`` as a list of per-row feature lists.

        Objects exposing ``to_numpy`` (e.g. a pandas DataFrame) are converted
        in one call; any other iterable of rows is copied row by row.
        """
        if hasattr(data, 'to_numpy'):
            return data.to_numpy().tolist()
        return [list(row) for row in data]

    def fit(self, X):
        """Store the training rows and return ``self`` so calls can be chained."""
        self.X = self._as_rows(X)
        return self

    def predict(self, entries):
        """Return a list with one label (``1`` or ``-1``) per row of ``entries``."""
        return [self.predict_entry(row) for row in self._as_rows(entries)]

    def predict_entry(self, entry):
        """Label a single feature vector: ``1`` if its score is below the threshold, else ``-1``.

        Raises
        ------
        ValueError
            If ``k < 1`` or the model holds no training rows.
        """
        kth_neighbor, kth_distance = self.get_kth_neighbor(entry, self.k)
        # k + 1 because `kth_neighbor` is itself a training row and is
        # therefore its own nearest neighbour at distance 0.
        kth_kth_neighbor, kth_kth_distance = self.get_kth_neighbor(kth_neighbor, self.k + 1)

        outlier_score = kth_distance / (kth_kth_distance + self.EPSILON)

        return 1 if outlier_score < self.outlier_threshold else -1

    def euclidean_distance(self, features1, features2):
        """Euclidean distance over the paired components of two vectors.

        Components are paired with ``zip``, so any surplus components of the
        longer vector are ignored.
        """
        distance = 0.0
        for feat1, feat2 in zip(features1, features2):
            distance += (feat1 - feat2)**2

        return sqrt(distance)

    def get_neighbors(self, entry, k):
        """Return up to ``k`` ``(training_row, distance)`` tuples, nearest first.

        Fewer than ``k`` tuples are returned when the training set is smaller
        than ``k``. Ties keep the order of the training rows.
        """
        distance_to_data_points = [
            (data_point, self.euclidean_distance(entry, data_point))
            for data_point in self.X
        ]
        sorted_by_distance = sorted(distance_to_data_points, key=lambda tup: tup[1])
        return sorted_by_distance[:k]

    def get_kth_neighbor(self, entry, k):
        """Return the ``(training_row, distance)`` tuple of the ``k``-th nearest row.

        When fewer than ``k`` training rows exist, the farthest available row
        is returned.

        Raises
        ------
        ValueError
            If ``k < 1`` or there are no training rows (model not fitted).
        """
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k}")
        neighbors = self.get_neighbors(entry, k)
        if not neighbors:
            raise ValueError("no training data available; call fit() with at least one row first")
        return neighbors[-1]


In [21]:
from sklearn.svm import OneClassSVM


# Registry mapping the 'model_type' string of a model config to the class
# that implements it.
models = dict(
    oneclasssvm=OneClassSVM,
    knndatadescription=KNNDataDescription,
)

In [22]:
# Look up the configured model class and instantiate it with the configured
# hyperparameters passed as keyword arguments.
model_type = model_config['model_type']
model_cls = models[model_type]
hyperparameters = model_config['hyperparameters']
model = model_cls(**hyperparameters)

# Fit

In [23]:
# Fit on the training features only; no labels are passed to the model.
model.fit(X_train)

OneClassSVM()

# Predict

In [24]:
# Predict one label per row of the test features.
predictions = model.predict(X_test)

# Evaluate

In [25]:
from sklearn.metrics import f1_score

In [26]:
# Recode the test labels to the model's output convention: 0 -> 1, 1 -> -1.
# The labels are rebuilt from the untouched per-class test labels instead of
# being remapped from `y_test` itself, so re-running this cell gives the same
# result. Remapping an already remapped `y_test` would turn every 1 into -1.
y_test = pd.concat([y_inlier_test, y_outlier_test], ignore_index=True).replace([0, 1], [1, -1])

In [27]:
def evaluate(truths, predictions, pos_label, neg_label):
    """Compute confusion-matrix counts plus precision, sensitivity and F1.

    Parameters
    ----------
    truths, predictions : iterable
        Ground-truth and predicted labels, compared pairwise with ``zip``.
        Pairs in which either value is neither ``pos_label`` nor
        ``neg_label`` are not counted.
    pos_label, neg_label :
        Label values treated as the positive and the negative class.

    Returns
    -------
    dict
        Keys ``true_pos``, ``false_pos``, ``false_neg``, ``true_neg``,
        ``precision``, ``sensitivity`` and ``f1``. A ratio whose denominator
        is zero is reported as ``0.0`` instead of raising ZeroDivisionError.
    """
    TP = 0
    FP = 0
    TN = 0
    FN = 0

    for truth, pred in zip(truths, predictions):
        if truth == pred == pos_label:
            TP += 1
        elif pred == pos_label and truth == neg_label:
            FP += 1
        elif truth == pred == neg_label:
            TN += 1
        elif pred == neg_label and truth == pos_label:
            FN += 1

    predicted_pos = TP + FP
    actual_pos = TP + FN

    # Guard every division: a run with no predicted or no actual positives
    # is a valid outcome and must still produce a report.
    precision = TP / predicted_pos if predicted_pos else 0.0
    sensitivity = TP / actual_pos if actual_pos else 0.0

    precision_plus_sensitivity = precision + sensitivity
    if precision_plus_sensitivity:
        f1 = (2 * (precision * sensitivity)) / precision_plus_sensitivity
    else:
        f1 = 0.0

    return {
        'true_pos': TP,
        'false_pos': FP,
        'false_neg': FN,
        'true_neg': TN,
        'precision': precision,
        'sensitivity': sensitivity,
        'f1': f1
    }

# Score the predictions with -1 as the positive class and 1 as the negative class.
score = evaluate(y_test, predictions, pos_label=-1, neg_label=1)

# Report

In [28]:
import csv


# Build a run identifier from the config file names: the last path component
# of each path, cut at its first '.'.
dataset_config_name = DATASET_CONFIG_PATH.split('/')[-1].split('.')[0]
model_config_name = MODEL_CONFIG_PATH.split('/')[-1].split('.')[0]
configuration = f"dataset_{dataset_config_name}-model_{model_config_name}"

# Append one row (configuration id followed by the scores) to the results file.
# newline='' lets the csv writer control line endings itself; without it,
# platforms that translate '\n' on write end up with an extra '\r' per row.
with open('results.csv', 'a', newline='') as f:
    report = dict({'configuration': configuration}, **score)
    writer = csv.DictWriter(f, report.keys())
    writer.writerow(report)