# Plan
✅ 1. Skini podatke \
✅ 2. Izdvoji train i test\
✅ 3. Analiza train skupa\
✅ 4. Normalizacija/standardizacija\
✅ 5. Vizuelizacija\
✅ 6. Dodavanje suma

------
✅ 1. Rez kao matrica + granice odlucivanja \
✅ 2. Baseline performanse - par modela\
✅ 3. Regularizacija modela + sve iz prvog koraka\
✅ 4. Dim reduction + 1. korak\
✅ 5. Otklanjanje outlier + 1. korak\
✅ 6. Redukcija suma atributa + 1. korak\
✅ 7. Otklanjanje suma labela + 1. korak\
8. OvO vs OvA\
9. Modeli otporni na sum\
10. Autoencoder za denoising?

- Ubaci da iscrta sve heatmape sa odgov. naslovom - da bi uporedio base, +reg, +dimred...

# Import

In [1]:
%pip install kagglehub openpyxl imbalanced-learn seaborn torch

You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


In [None]:
import kagglehub as kg
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import PowerTransformer, MinMaxScaler

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from helper.plot import plot_attr_label, plot_attributes
from helper.model import MLP
from helper.transform import transform_pca, remove_outliers_zscore, remove_outliers_db, remove_outliers_isf, bin_attributes_mean, bin_attributes_median, regression_reduce_noise
from helper.train import run_models

# Load data

In [None]:
# Fetch the Dry Bean dataset through kagglehub; `path` holds whatever dataset_download returns.
path = kg.dataset_download("muratkokludataset/dry-bean-dataset")
# Workbook location relative to `path`; the leading slash is needed because it is later joined with `+`.
file_name = '/Dry_Bean_Dataset/Dry_Bean_Dataset.xlsx'
print("Downloaded at: ", path)

In [None]:
# Load the downloaded Excel workbook into a DataFrame.
data = pd.read_excel(path+file_name)

In [None]:
# Display the raw DataFrame as loaded from the workbook.
data

In [None]:
def get_onehot(class_count, class_ind):
    """Build a one-hot vector for a single class.

    Args:
        class_count: Total number of classes, i.e. the length of the vector.
        class_ind: Position that receives the value 1; every other entry is 0.

    Returns:
        A 1-D float numpy array of length ``class_count``.
    """
    one_hot = np.zeros(class_count)
    one_hot[class_ind] = 1
    return one_hot

# Distinct class names as returned by np.unique; a name's position is its integer label.
unique_vals = np.unique(data['Class'])
# class name -> integer label
class_map_label = {name: idx for idx, name in enumerate(unique_vals)}
# class name -> one-hot vector with the 1 at the same integer label
class_map_oh = {
    name: get_onehot(len(unique_vals), idx)
    for name, idx in class_map_label.items()
}

In [None]:
# Attach both encodings of the target, then discard the raw string column.
data['Class_OneHot'] = data['Class'].apply(lambda name: class_map_oh[name])
data['Class_Label'] = data['Class'].apply(lambda name: class_map_label[name])
data.drop(columns=['Class'], inplace=True)

In [None]:
# Display the DataFrame after the Class column was replaced by Class_OneHot / Class_Label.
data

# Train/Test split

Odmah izdvajamo train i test set u razmeri 80:20

In [None]:
# Fraction of the rows assigned to the training set.
train_perc = 0.8
# The remaining fraction is held out as the test set.
test_perc = 1 - train_perc

In [None]:
# One stratified shuffle split, stratified on the integer class label.
split = StratifiedShuffleSplit(n_splits=1, test_size=test_perc, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair of positional indices.
train_idx, test_idx = next(split.split(data, data['Class_Label']))
train_set = data.iloc[train_idx]
test_set = data.iloc[test_idx]

# Report rows x columns for both parts.
print('Train size: ', train_set.shape[0], 'x', train_set.shape[1])
print('Test size: ', test_set.shape[0], 'x', test_set.shape[1])

# EDA

In [None]:
# Display the training partition.
train_set

In [None]:
# Summary statistics of the training partition.
train_set.describe()

Atributi 
- Area
- Perimeter
- Major Axis Length
- Minor Axis Length
- AspectRation
- ConvexArea
- EquivDiameter 

imaju dosta velike vrednosti, dok su ostali atributi u opsegu $[0, 1]$

In [None]:
# Plot the attributes of the raw training set (project helper from helper.plot).
plot_attributes(train_set)

Sa grafikona vidimo da vrednosti atributa ne prate normalnu raspodelu.\
Atributi kao ShapeFactor4 i Solidity imaju velike repove. Zato ćemo tokom preprocesiranja normalizovati raspodele.\
Pored toga skup podataka nije balansiran:
- Klasa 1 je slabo zastupljena sa manje od 500 instanci, klasa 0 ima oko 1000 dok klasa 3 dominira sa oko 2500 instanci
- Zato ćemo izvršiti under i oversampling na oko 1-1.5 hiljade instanci

In [None]:
# Plot attributes against the class label for the raw training set (project helper from helper.plot).
plot_attr_label(train_set)

Po grafikonu iznad vidimo da se klasa 1 karakteriše visokim vrednostima Major i Minor axis length, EquivDiameter, ConvexArea, Area i Perimeter. Pored toga ima male vrednosti ShapeFactor1.

<div style="background-color:#1212AA; height:auto; border-radius:10px; padding:16px; width:600px; color:white">
<h3>Zaključci</h3>
<ul>
<li>Podaci su nebalansirani, potrebno je under i over sample-ovati na oko 1-1.5 hiljade instanci</li>
<li>Atribute je potrebno normalizovati, a neke i skalirati kao što su Area</li>
<li>Klasa 1 je nedovoljno zastupljena ali lako prepoznatljiva po atributima koji imaju visoke vrednosti kao što je Area</li>
</ul>
</div>


# Normalizacija i Standardizacija

In [None]:
# Separate the feature matrix from the two target encodings.
target_cols = ['Class_OneHot', 'Class_Label']
X = train_set.drop(columns=target_cols)
y_oh, y_lab = train_set['Class_OneHot'], train_set['Class_Label']

In [None]:
# Yeo-Johnson power transform, fitted on the training features and applied to them in one step.
pt = PowerTransformer(method='yeo-johnson')
X_pt = pt.fit_transform(X)

In [None]:
# Rescale the power-transformed features with MinMaxScaler (default settings).
scaler = MinMaxScaler()
train_set_scaled = scaler.fit_transform(X_pt)
# Wrap the scaled array back into a DataFrame. X's index is reused so that the rows
# stay aligned with y_lab / y_oh, which still carry train_set's (shuffled) index;
# without it the frame would get a fresh 0..n-1 index and index-based alignment
# with the labels would pair up the wrong rows.
X_pt = pd.DataFrame(train_set_scaled, columns=X.columns, index=X.index)

In [None]:
# Plot the attributes again, now after the power transform and min-max scaling.
plot_attributes(X_pt)

Podaci sada prate raspodele dosta bliže normalnoj raspodeli

In [None]:
# X_pt no longer contains the label columns (they were dropped when X was built), so the labels are passed separately.
plot_attr_label(X_pt, labels=y_lab)

# Under/OverSampling

In [None]:
target_samples = 1250 # Pick a number in the range


def _oversample_targets(labels):
    """Map every class with fewer than `target_samples` rows to `target_samples`."""
    return {cls: target_samples for cls, count in Counter(labels).items() if count < target_samples}


def _undersample_targets(labels):
    """Map every class with more than `target_samples` rows to `target_samples`."""
    return {cls: target_samples for cls, count in Counter(labels).items() if count > target_samples}


# Step 1: SMOTE raises the under-represented classes to target_samples rows.
smote = SMOTE(sampling_strategy=_oversample_targets, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_pt, y_lab)

# Step 2: random undersampling cuts the over-represented classes to target_samples rows.
under = RandomUnderSampler(sampling_strategy=_undersample_targets, random_state=42)
X_resampled, y_resampled = under.fit_resample(X_resampled, y_resampled)

# Collect the balanced features and their labels in a single DataFrame.
df_balanced = pd.DataFrame(X_resampled, columns=X.columns)
df_balanced['target'] = y_resampled

In [None]:
# The number of classes is the same for every row, so compute it once here
# rather than re-running np.unique over the whole column inside the per-row lambda.
n_classes = len(np.unique(df_balanced['target']))
df_balanced['target_oh'] = df_balanced['target'].apply(lambda label: get_onehot(n_classes, label))

In [None]:
# Display the resampled DataFrame including the target and target_oh columns.
df_balanced

Sada imamo tačno 1250 instanci u svakoj klasi

<div style="background-color:#12AA12; height:auto; border-radius:10px; padding:16px; width:600px; color:white">
<h3>Zaključci</h3>
<ul>
<li>Podaci su sada izbalansirani - svaka klasa ima 1250 instanci</li>
<li>Atributi su normalizovani i skalirani na 0-1</li>
</ul>
</div>

# Baseline performanse

In [None]:
# Integer targets and feature matrix shared by the experiments below.
y = df_balanced['target']
X_t = df_balanced.drop(columns=['target', 'target_oh'])

# Result accumulators that are handed to every run_models call.
all_res, all_res_change = [], []

In [None]:
# Display the feature matrix used for the experiments.
X_t

In [None]:
# Number of input features = number of columns in the feature matrix.
feature_num = X_t.shape[1]

# Every entry is an (estimator, flag) pair; only the MLP carries True.
# NOTE(review): the flag presumably marks models that need a different training path in run_models - confirm in helper.train.
models = [
    (RandomForestClassifier(n_jobs=4), False),
    (SVC(), False),
    (GradientBoostingClassifier(), False),
    (AdaBoostClassifier(), False),
    (KNeighborsClassifier(), False),
    (MLP(input_size=feature_num, hidden_size=5, output_size=len(np.unique(y))), True),
]
# Keep the individual names available as well.
rfc, svc, gbc, abc, knc, mlp = models

# Noise levels; not passed to run_models in the calls visible in this notebook section.
noise_schedule = [0, 0.1, 0.2, 0.3, 0.4, 0.5]

In [None]:
# Run every baseline model on the balanced data without any extra transform.
# NOTE(review): unlike the regularization cell, the returned results are not appended to
# all_res / all_res_change here - confirm whether run_models records them itself.
new_res, new_res_change = run_models(X_t, y, models, all_res, all_res_change, transforms=[], title="Baseline")

<div style="background-color:#1212AA; height:auto; border-radius:10px; padding:16px; width:1000px; color:white">
<h3>Zaključci</h3>
<ul>
<li>Kod svih modela se može primetiti da imaju poteškoća kod predviđanja instanci koje se nalaze u gustim oblastima gde ima dosta mešanja instanci različitih klasa.</li>
<li>Potencijalno je poboljšanje sa regularizacijom modela</li>
<li>Pored toga postoje instance koje su dosta odvojene od ostatka instanci, što bi moglo da reši uklanjanje outlier-a</li>
<li>Na heatmap-i vidimo preciznost modela kao i promene u preciznosti usled povećanja šuma</li>
<li>Svi modeli imaju dosta dobre performanse (oko 93% preciznosti) kada nema šuma, dok povećanjem šuma se preciznost smanjuje</li>
<li>RandomForest i GradientBoosting klasifikatori su za sada najotporniji na šum labela (pad oko 15-20% kada pola skupa se zameni nasumičnim vrednostima), dok su se perceptron i KNeighbors klasifikator najgore pokazali</li>
<li>Svi modeli imaju velik pad u preciznosti kada se pojavi šum medju labelama, što se vidi i na dijagramima sa granicama odlučivanja</li>
<li>AdaBoost model postiže čak bolje performanse kada se unese malo šuma. Šum ovde služi kao način regularizacije modela.</li>
</ul>
</div>

# Regularizacija modela

In [None]:
# Number of input features = number of columns in the feature matrix.
feature_num = X_t.shape[1]

# Same model line-up as before, with more constrained hyper-parameters
# (shallow trees / log2 features, linear SVC, more neighbours, L2 penalty on the MLP).
# AdaBoost keeps its default settings.
models = [
    (RandomForestClassifier(n_jobs=4, max_depth=4, max_features='log2'), False),
    (SVC(C=10, kernel='linear'), False),
    (GradientBoostingClassifier(max_features='log2'), False),
    (AdaBoostClassifier(), False),
    (KNeighborsClassifier(n_neighbors=15), False),
    (MLP(input_size=feature_num, hidden_size=5, output_size=len(np.unique(y)), l2_reg=0.005), True),
]
# Keep the individual names available as well.
rfc, svc, gbc, abc, knc, mlp = models

# Noise levels; not passed to run_models in the calls visible in this notebook section.
noise_schedule = [0, 0.1, 0.2, 0.3, 0.4, 0.5]

In [None]:
# Run the regularized models, then store both result sets under a common label.
new_res, new_res_change = run_models(X_t, y, models, all_res, all_res_change, transforms=[], title="With regularization")
all_res.append(("After regularization", new_res))
all_res_change.append(("After regularization", new_res_change))

<div style="background-color:#1212AA; height:auto; border-radius:10px; padding:16px; width:1000px; color:white">
<h3>Zaključci</h3>
<ul>
<li>Regularizacijom modela se u ovom slučaju ne postižu bolji rezultati, čak se često dobijaju i gori</li>
<li>U nekim slučajevima je regularizacija poboljšala performanse, ali poboljšanja nisu značajna</li>
</ul>
</div>

# Redukcija dimenzionalnosti

Na dalje će testiranje biti vršeno na sledeći način
- U skup podataka se doda određena količina šuma
- Nad trening podacima se primeni neka od tehnika - PCA, uklanjanje outliera, redukcija šuma ili se nad labelama primeni otklanjanje šuma labela
- Nad tako dobijenim podacima se vrši trening i procena performansi

In [None]:
# MLP input width for the PCA experiment.
# NOTE(review): 5 presumably equals the number of components kept by transform_pca - confirm in helper.transform.
feature_num = 5

# Default-configured models; every entry is an (estimator, flag) pair and only the MLP carries True.
models = [
    (RandomForestClassifier(n_jobs=4), False),
    (SVC(), False),
    (GradientBoostingClassifier(), False),
    (AdaBoostClassifier(), False),
    (KNeighborsClassifier(), False),
    (MLP(input_size=feature_num, hidden_size=5, output_size=len(np.unique(y))), True),
]
# Keep the individual names available as well.
rfc, svc, gbc, abc, knc, mlp = models

# Noise levels; not passed to run_models in the calls visible in this notebook section.
noise_schedule = [0, 0.1, 0.2, 0.3, 0.4, 0.5]

In [None]:
# Run the models with transform_pca as the only transform.
# NOTE(review): the results are not appended to all_res / all_res_change here - confirm whether that is intended.
new_res, new_res_change = run_models(X_t, y, models, all_res, all_res_change, transforms=[transform_pca], title="With dimensionality reduction")

<div style="background-color:#1212AA; height:auto; border-radius:10px; padding:16px; width:1000px; color:white">
<h3>Zaključci</h3>
<ul>
<li>Regularizacijom modela se u ovom slučaju ne postižu bolji rezultati, čak se često dobijaju i gori</li>
<li>U nekim slučajevima je regularizacija poboljšala performanse, ali poboljšanja nisu značajna</li>
</ul>
</div>

# Izbacivanje outliera

## ZScore

In [None]:
# MLP input width. Derived from the feature matrix instead of the hard-coded 16,
# matching how the baseline and regularization cells compute it.
# NOTE(review): assumes the outlier / noise transforms used with these models leave the column count unchanged - confirm in helper.transform.
feature_num = X_t.shape[1]
# Default-configured models; every entry is an (estimator, flag) pair and only the MLP carries True.
rfc = (RandomForestClassifier(n_jobs=4), False)
svc = (SVC(), False)
gbc = (GradientBoostingClassifier(), False)
abc = (AdaBoostClassifier(), False)
knc = (KNeighborsClassifier(), False)
mlp = (MLP(input_size=feature_num, hidden_size=5, output_size=len(np.unique(y))), True)

# This list is reused by all following run_models calls in this section.
models = [rfc, svc, gbc, abc, knc, mlp]
# Noise levels; not passed to run_models in the calls visible in this notebook section.
noise_schedule = [0, 0.1, 0.2, 0.3, 0.4, 0.5]

In [None]:
# Run the models with remove_outliers_zscore as the only transform.
new_res, new_res_change = run_models(X_t, y, models, all_res, all_res_change, transforms=[remove_outliers_zscore], title="With zscore outlier removal")

## IsolationForest

In [None]:
# Run the same model instances with remove_outliers_isf as the only transform.
# NOTE(review): `models` is not rebuilt between runs - confirm that run_models refits or clones the estimators.
new_res, new_res_change = run_models(X_t, y, models, all_res, all_res_change, transforms=[remove_outliers_isf], title="With isolation forest outlier removal")

## DBSCAN

In [None]:
# Run the models with remove_outliers_db as the only transform; the return value is not stored here.
run_models(X_t, y, models, all_res, all_res_change, transforms=[remove_outliers_db], title="With db outlier removal")

# Redukcija šuma atributa

In [None]:
# Run the models with bin_attributes_mean as the only transform.
# The title names the binning variant so this run can be told apart from the median-binning run.
run_models(X_t, y, models, all_res, all_res_change, transforms=[bin_attributes_mean], title="attribute binning (mean), 10 bins")

In [None]:
# Run the models with bin_attributes_median as the only transform.
# The title names the binning variant so this run can be told apart from the mean-binning run.
run_models(X_t, y, models, all_res, all_res_change, transforms=[bin_attributes_median], title="attribute binning (median), 10 bins")

In [None]:
# Run the models with regression_reduce_noise as the only transform; the return value is not stored here.
run_models(X_t, y, models, all_res, all_res_change, transforms=[regression_reduce_noise], title="reduce attribute noise with regression")

# Otklanjanje suma labela