<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [20]</a>'.</span>

# Parametrizing

In [1]:
MODEL_CONFIG_PATH = 'config/model/knndd_001.json'
DATASET_CONFIG_PATH = 'config/dataset/kc1_001.json'
PROFILE_REPORTING = False

In [2]:
# Parameters
MODEL_CONFIG_PATH = "config/model/knndd_001.json"
DATASET_CONFIG_PATH = "config/dataset/kc1_001.json"
PROFILE_REPORTING = False


In [3]:
import json


with open(DATASET_CONFIG_PATH) as f:
    dataset_config = json.load(f)

with open(MODEL_CONFIG_PATH) as f:
    model_config = json.load(f)

# Profiling

In [4]:
import pandas as pd
from pandas_profiling import ProfileReport

In [5]:
df = pd.read_csv(dataset_config['dataset_path'])

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2109 entries, 0 to 2108
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 2109 non-null   int64  
 1   loc                2109 non-null   float64
 2   v(g)               2109 non-null   float64
 3   ev(g)              2109 non-null   float64
 4   iv(g)              2109 non-null   float64
 5   n                  2109 non-null   float64
 6   v                  2109 non-null   float64
 7   l                  2109 non-null   float64
 8   d                  2109 non-null   float64
 9   i                  2109 non-null   float64
 10  e                  2109 non-null   float64
 11  b                  2109 non-null   float64
 12  t                  2109 non-null   float64
 13  lOCode             2109 non-null   int64  
 14  lOComment          2109 non-null   int64  
 15  lOBlank            2109 non-null   int64  
 16  locCodeAndComment  2109 

In [7]:
df.head()

Unnamed: 0,id,loc,v(g),ev(g),iv(g),n,v,l,d,i,...,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
0,1,1.1,1.4,1.4,1.4,1.3,1.3,1.3,1.3,1.3,...,2,2,2,2,1.2,1.2,1.2,1.2,1.4,False
1,2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1,1,1,1,1.0,1.0,1.0,1.0,1.0,True
2,3,83.0,11.0,1.0,11.0,171.0,927.89,0.04,23.04,40.27,...,65,10,6,0,18.0,25.0,107.0,64.0,21.0,True
3,4,46.0,8.0,6.0,8.0,141.0,769.78,0.07,14.86,51.81,...,37,2,5,0,16.0,28.0,89.0,52.0,15.0,True
4,5,25.0,3.0,1.0,3.0,58.0,254.75,0.11,9.35,27.25,...,21,0,2,0,11.0,10.0,41.0,17.0,5.0,True


In [8]:
if PROFILE_REPORTING:
    ProfileReport(df).to_widgets()

# Preprocess

In [9]:
working_df = df.copy()

In [10]:
working_df['defects'] = working_df['defects'].replace([False, True], [0, 1])

In [11]:
for col in dataset_config['delete_columns']:
    del working_df[col]

# Split

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
inlier, outlier = working_df[working_df['defects'] == 0], working_df[working_df['defects'] == 1]

In [14]:
X_inlier = inlier.copy()
y_inlier = X_inlier.pop('defects')
X_outlier = outlier.copy()
y_outlier = X_outlier.pop('defects')

In [15]:
test_size = dataset_config['test_size']
X_inlier_train, X_inlier_test, y_inlier_train, y_inlier_test = train_test_split(X_inlier, y_inlier, test_size=test_size, random_state=1)
X_train, y_train = X_inlier_train, y_inlier_train
X_test, y_test = pd.concat([X_inlier_test, X_outlier]), pd.concat([y_inlier_test, y_outlier])

# Model

In [16]:
from math import sqrt


class KNNDataDescription:
    def __init__(self, k=5, outlier_threshold=1.0):
        self.k = k
        self.X = []
    
    def fit(self, X):
        self.X = X
    
    def predict(self, entry):
        kth_neighbor, kth_distance = self.get_kth_neighbor(entry, self.k)
        kth_kth_neighbor, kth_kth_distance = self.get_kth_neighbor(kth_neighbor[0], self.k + 1)
        outlier_score = kth_distance / kth_kth_distance
        
        return 1 if outlier_score < outlier_threshold else -1
        
    def euclidean_distance(self, features1, features2):
        distance = 0.0
        for feat1, feat2 in zip(features1, features2):
            try:
                distance += (feat1 - feat2)**2
            except:
                print(feat1, feat2)
        return sqrt(distance)

    def get_neighbors(self, entry, k):
        distance_to_data_points = [
            (data_point, self.euclidean_distance(entry, data_point))
            for data_point in self.X
        ]
        sorted_by_distance = sorted(distance_to_data_points, key=lambda tup: tup[1])
        neighbors = distances[:k]
        return neighbors

    def get_kth_neighbor(self, entry, k):
        neighbors = self.get_neighbors(entry, self.k)
        return neighbors[-1]


In [17]:
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM


models = {
    'isolationforest': IsolationForest,
    'oneclasssvm': OneClassSVM,
    'knndatadescription': KNNDataDescription
}

In [18]:
model_cls = models[model_config['model_type']]
model = model_cls(**model_config['hyperparameters'])

# Fit

In [19]:
model.fit(X_train)

# Predict

<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [20]:
predictions = model.predict(X_test)

id i
v(g) d
id v
v(g) (
ev(g) g
iv(g) )
id e
v(g) v
ev(g) (
iv(g) g
n )
id i
v(g) v
ev(g) (
iv(g) g
n )
id n
id v
id l
id d
id i
id e
id b
id t
id l
v(g) O
ev(g) C
iv(g) o
n d
v e
id l
v(g) O
ev(g) C
iv(g) o
n m
v m
l e
d n
i t
id l
v(g) O
ev(g) B
iv(g) l
n a
v n
l k
id u
v(g) n
ev(g) i
iv(g) q
n _
v O
l p
id u
v(g) n
ev(g) i
iv(g) q
n _
v O
l p
d n
i d
id t
v(g) o
ev(g) t
iv(g) a
n l
v _
l O
d p
id t
v(g) o
ev(g) t
iv(g) a
n l
v _
l O
d p
i n
e d
id b
v(g) r
ev(g) a
iv(g) n
n c
v h
l C
d o
i u
e n
b t


NameError: name 'distances' is not defined

# Evaluate

In [None]:
from sklearn.metrics import f1_score

In [None]:
y_test = y_test.replace([0, 1], [1, -1])

In [None]:
score = f1_score(y_test, predictions, pos_label=-1)
print('F1 Score: %.3f' % score)