# Zadatak 5: Implementacija klasifikacije

In [None]:
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

from zadatak5_knn import KNN

### Učitavanje podataka

In [None]:
# Load the scraped vehicle listings (semicolon-separated); column names are supplied explicitly.
df = pd.read_csv(
    './db/scrapy_vozila_20220602.csv',
    sep=';',
    names=[
        'url', 'naslov', 'cena', 'stanje', 'marka', 'model', 'godiste', 'kilometraza',
        'karoserija', 'gorivo', 'kubikaza', 'snaga', 'menjac', 'vrata', 'boja', 'lokacija_prodavca',
    ],
)

df

In [None]:
# Show the dtype of every column of the freshly loaded frame.
df.dtypes

### Uklanjanje nenumeričkih vrednosti cena i snage

In [None]:
# Price: convert to numbers (non-numeric entries become NaN), drop those rows, and
# store the column as int64.
# The converted column is assigned as a whole instead of being written back through
# `df.loc[:, 'cena'] = ...` on a filtered frame: that form sets values in place on
# recent pandas (the column can stay `object`) and raises SettingWithCopyWarning.
df = df.assign(cena=pd.to_numeric(df['cena'], errors='coerce')).dropna(subset=['cena'])
df = df.astype({'cena': 'int64'})


# Power - only the figure after the slash (horsepower) is kept
def sredi_snagu(snaga: object) -> "str | None":
    """Extract the run of digits that follows the first '/' followed by digits.

    Args:
        snaga: Raw cell value of the power column. Anything that is not a string
            (for example NaN for a missing cell) is treated as "no value".

    Returns:
        The matched digits as a string, or None when the input is not a string
        or contains no '/<digits>' part.
    """
    # re.search raises TypeError on non-string input, so filter those out up front.
    if not isinstance(snaga, str):
        return None

    # Raw string: '\d' inside a normal string literal is an invalid escape sequence.
    match = re.search(r'(?<=/)\d+', snaga)
    return match.group(0) if match is not None else None

# Replace the raw power text with the figure extracted by sredi_snagu, drop rows where
# nothing numeric could be extracted, and store the column as int64.
# Whole-column assignment is used instead of `df.loc[:, 'snaga'] = ...` on a filtered
# frame, which can leave the column as `object` and raises SettingWithCopyWarning.
df = df.assign(snaga=pd.to_numeric(df['snaga'].apply(sredi_snagu), errors='coerce')).dropna(subset=['snaga'])
df = df.astype({'snaga': 'int64'})

In [None]:
# Re-check the column dtypes after the price and power clean-up.
df.dtypes

In [None]:
# (rows, columns) remaining after the rows with unusable price/power were dropped.
df.shape

### Dodeljivanje kategorije cenovnog opsega vozilima

In [None]:
# Price-range edges. With right=False each bin is [lower, upper), so the last bin is
# open-ended; np.inf is the same float value as math.inf, either works here.
bins = [0, 2000, 5000, 10000, 15000, 20000, 25000, 30000, math.inf]
labels = [
    '<=1999',
    '2000-4999',
    '5000-9999',
    '10000-14999',
    '15000-19999',
    '20000-24999',
    '25000-29999',
    '>=30000',
]
df['cenovni_opseg'] = pd.cut(
    df['cena'],
    bins=bins,
    labels=labels,
    right=False,
)

df

### Odabir relevantnih podataka i pretvaranje kategoričkih vrednosti u numeričke

In [None]:
df_for_training = df.copy()

# Columns that must not be model inputs. 'cena' and the 'cenovni_opseg' derived from it
# are excluded because the price range is the prediction target: keeping its one-hot
# columns among the features would hand the label to the classifier (target leakage).
features_to_exclude = [
    'cena', 'cenovni_opseg', 'url', 'naslov', 'model', 'karoserija',
    'gorivo', 'menjac', 'vrata', 'boja', 'lokacija_prodavca',
]
df_for_training = df_for_training.drop(columns=features_to_exclude)

# One-hot encode the remaining categorical inputs.
categorical_features = ['stanje', 'marka']
df_for_training = pd.get_dummies(df_for_training, columns=categorical_features)

df_for_training

### Skaliranje numeričkih podataka

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric columns and write the scaled values back column by column.
numeric_features = ['godiste', 'kilometraza', 'kubikaza', 'snaga']
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df_for_training[numeric_features])
for column_position, column_name in enumerate(numeric_features):
    df_for_training[column_name] = scaled_features[:, column_position]

df_for_training

## KNN - priprema

### Podela podataka

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    df_for_training,
    df['cenovni_opseg'],
    test_size=0.30,
    random_state=183,
)

### Računanje K (standardan način - neparan broj najbliži korenu broja podataka)

In [None]:
# Starting K, derived from the number of training rows by the project's KNN.compute_k
# helper (presumably the odd integer nearest sqrt(n); its implementation is not
# visible here, so confirm against zadatak5_knn).
K = KNN.compute_k(xtrain.shape[0])

K

### Funkcija za treniranje i primenu KNN

In [None]:
def train_and_predict(knn_cls, k: int, metric: str, xtrain, xtest, ytrain, ytest, output: bool = True):
    """Fit a KNN classifier on the training data and predict the test set.

    Args:
        knn_cls: Classifier class; called as ``knn_cls(n_neighbors=k, metric=metric)``
            and expected to provide ``fit`` and ``predict``.
        k: Number of neighbours.
        metric: Name of the distance metric passed to the classifier.
        xtrain, ytrain: Training features and labels.
        xtest, ytest: Test features and the true test labels.
        output: When true, print the parameters, the confusion matrix and the
            classification report for the predictions.

    Returns:
        Whatever ``predict`` returned for ``xtest``.
    """
    classifier = knn_cls(n_neighbors=k, metric=metric)
    classifier.fit(xtrain, ytrain)
    predictions = classifier.predict(xtest)

    # Quiet mode: skip the report entirely.
    if not output:
        return predictions

    print(f"k = {k}, metric = {metric}")
    print(confusion_matrix(ytest, predictions))
    print(classification_report(ytest, predictions))
    return predictions

### Funkcija za traženje optimalne vrednosti K

In [None]:
def find_optimal_k(KNN_cls, K_min: int, K_max: int, metric: str, xtrain, xtest, ytrain, ytest) -> int:
    """Evaluate every K in ``range(K_min, K_max)`` and return the one with the lowest test error.

    For each candidate K a classifier is trained and applied through
    ``train_and_predict`` (with printing disabled), and the error rate is the share
    of test predictions that differ from ``ytest``. The error rates are also drawn
    on a matplotlib figure.

    Args:
        KNN_cls: Classifier class forwarded to ``train_and_predict``.
        K_min: First K to try (inclusive).
        K_max: End of the range (exclusive, as in ``range``).
        metric: Distance metric name forwarded to the classifier.
        xtrain, ytrain: Training features and labels.
        xtest, ytest: Test features and true labels.

    Returns:
        The K with the minimal error rate; when several K tie, the largest of them.

    Raises:
        ValueError: If the range ``[K_min, K_max)`` is empty.
    """
    k_values = range(K_min, K_max)
    if not k_values:
        raise ValueError(f"empty K range: K_min={K_min} must be smaller than K_max={K_max}")

    # `pred != ytest` is an elementwise boolean comparison; its mean (True counts as 1,
    # False as 0) is the fraction of misclassified test rows.
    error_rate = [
        np.mean(train_and_predict(KNN_cls, k, metric, xtrain, xtest, ytrain, ytest, output=False) != ytest)
        for k in k_values
    ]

    # Draw plot
    plt.figure(figsize=(20, 10))
    plt.plot(k_values, error_rate, color='blue', marker='o', markerfacecolor='red', markersize=6)
    plt.title('Error rate vs. K value')
    plt.xlabel('K')
    plt.ylabel('Error rate')

    # Map the best position back to its actual K value. Returning the list position
    # (+1) is only correct when K_min == 1; pairing with k_values works for any K_min.
    lowest_error = min(error_rate)
    return max(k for k, error in zip(k_values, error_rate) if error == lowest_error)

## KNN - korišćenjem bibliotečke implementacije

In [None]:
# Baseline run: scikit-learn's KNeighborsClassifier with the initial K and Euclidean distance.
train_and_predict(
    KNeighborsClassifier,
    K,
    'euclidean',
    xtrain,
    xtest,
    ytrain,
    ytest,
    output=True,
)

### Odabir i primena optimalne vrednosti K (može da potraje)

In [None]:
# Search K over range(1, K + 10) with the library classifier, then rerun with the best K found.
K_min = 1
K_max = K + 10
K_optimal = find_optimal_k(
    KNeighborsClassifier, K_min, K_max, 'euclidean',
    xtrain, xtest, ytrain, ytest,
)
train_and_predict(
    KNeighborsClassifier, K_optimal, 'euclidean',
    xtrain, xtest, ytrain, ytest,
    output=True,
)

## KNN - korišćenjem sopstvene implementacije

In [None]:
# Same baseline run, this time with the project's own KNN implementation.
train_and_predict(
    KNN,
    K,
    'euclidean',
    xtrain,
    xtest,
    ytrain,
    ytest,
    output=True,
)

In [None]:
# Search K over range(1, K + 10) with the project's own KNN, then rerun with the best K found.
K_min = 1
K_max = K + 10
K_optimal = find_optimal_k(
    KNN, K_min, K_max, 'euclidean',
    xtrain, xtest, ytrain, ytest,
)
train_and_predict(
    KNN, K_optimal, 'euclidean',
    xtrain, xtest, ytrain, ytest,
    output=True,
)