# Algorithms

In [1]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

### Ucitavanje pretprocesiranih podataka

In [2]:
def dataset_partitioning(X_balanced, y_balanced):
    X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.111, random_state=42, stratify=y_balanced)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42, stratify=y_train)

    print('Broj ciljnih vrednosti \'closed\' u trening skupu:', np.sum(y_train==0))
    print('Broj ciljnih vrednosti \'closed\' u test skupu:', np.sum(y_test==0))
    print('Broj ciljnih vrednosti \'closed\' u validacionom skupu:', np.sum(y_val==0))
    return X_train, y_train, X_test, y_test, X_val, y_val

In [3]:
X_oversampling = pd.read_csv('oversampling_x.csv')
y_oversampling = pd.read_csv('oversampling_y.csv')
y_oversampling = y_oversampling.values.reshape(-1)

In [4]:
X_train_o, y_train_o, X_test_o, y_test_o, X_val_o, y_val_o = dataset_partitioning(X_oversampling, y_oversampling)

Broj ciljnih vrednosti 'closed' u trening skupu: 18591
Broj ciljnih vrednosti 'closed' u test skupu: 2580
Broj ciljnih vrednosti 'closed' u validacionom skupu: 2066


In [5]:
X_undersampling = pd.read_csv('undersampling_x.csv')
y_undersampling = pd.read_csv('undersampling_y.csv')
y_oversampling = y_oversampling.values.reshape(-1)

Unnamed: 0,status
0,0
1,0
2,0
3,0
4,0
...,...
1103,1
1104,1
1105,1
1106,1


In [8]:
X_train_u, y_train_u, X_test_u, y_test_u, X_val_u, y_val_u = dataset_partitioning(X_undersampling, y_undersampling)

Broj ciljnih vrednosti 'closed' u trening skupu: status    443
dtype: int64
Broj ciljnih vrednosti 'closed' u test skupu: status    62
dtype: int64
Broj ciljnih vrednosti 'closed' u validacionom skupu: status    49
dtype: int64


## Precision, recall i F1 mera: za manjinsku klasu i ceo dataset

In [9]:
def get_precision_recall_f1score_for_minority_class(y_test, y_test_pred):
    # Izvlacenje **manjiske klase**
    y_test1 = np.array(y_test)
    y_test_pred1 = np.array(y_test_pred)

    # Izdvajanje indeksa gde se nalaze vrednosti 0 u y_test
    idx = np.where(y_test1 == 0)[0]

    # Izdvajanje samo vrednosti 0 iz y_test i y_test_pred pomocu indeksa
    y_test1 = y_test1[idx]
    y_test_pred1 = y_test_pred1[idx]

    precision = precision_score(y_test1, y_test_pred1, pos_label=0)
    recall = recall_score(y_test1, y_test_pred1, pos_label=0)
    f1 = f1_score(y_test1, y_test_pred1, pos_label=0)

    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1-score: ", f1)

In [10]:
def get_precision_recall_f1score(y_test, y_test_pred):
    precision = precision_score(y_test, y_test_pred)
    recall = recall_score(y_test, y_test_pred)
    f1 = f1_score(y_test, y_test_pred)

    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1-score: ", f1)

## Random Forest 

In [11]:
# Kreiraj instancu Random Forest klasifikatora sa 10 stabala
def random_forest(X_train, y_train, X_test, y_test, n_estimator): 
    rfc = RandomForestClassifier(n_estimators=n_estimator)

    # Obuči model na X_train i y_train podacima
    rfc.fit(X_train, y_train)

    # Predvidi klase za X_train i X_test podatke
    y_train_pred = rfc.predict(X_train)
    y_test_pred = rfc.predict(X_test)

    # Izračunaj tačnost modela
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    print("Train Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
#     print(y_test_pred)
    return y_test_pred

In [12]:
# Random Forest
y_rnd_forest_u_pred = random_forest(X_train_u, y_train_u, X_test_u, y_test_u, 10)
print("\n*** Precision recall i f1 mera nad celim skupom ***")
get_precision_recall_f1score(y_test_u, y_rnd_forest_u_pred)
print("\n*** Precision recall i f1 mera nad manjinskoj klasi ***")
get_precision_recall_f1score_for_minority_class(y_test_u, y_rnd_forest_u_pred)

Train Accuracy: 0.9571106094808126
Test Accuracy: 0.7723577235772358

*** Precision recall i f1 mera nad celim skupom ***
Precision:  0.7894736842105263
Recall:  0.7377049180327869
F1-score:  0.7627118644067797

*** Precision recall i f1 mera nad manjinskoj klasi ***
Precision:  1.0
Recall:  0.8064516129032258
F1-score:  0.8928571428571428


  rfc.fit(X_train, y_train)


## Linearna regresija (TODO: nije na spisku - obrisati?)

In [13]:
def linear_regression(X_train, y_train, X_test, y_test):
    reg = LinearRegression()

    # Obuči model na trening podacima
    reg.fit(X_train, y_train)

    # Predvidi vrednosti za test podatke
    y_pred = reg.predict(X_test)

    # Izračunaj srednju kvadratnu grešku
    mse = mean_squared_error(y_test, y_pred)

    # Prikaz rezultata
#     print("Koeficijenti regresije:", reg.coef_)
#     print("Intercept:", reg.intercept_)
    print("Linear regression accuracy:", mse)

In [14]:
linear_regression(X_train_u, y_train_u, X_test_u, y_test_u)

Linear regression accuracy: 0.17895286652193892


## Logisticka regresija

In [15]:
def logistic_regression(X_train, y_train, X_test, y_test):
    # kreiranje objekta modela logističke regresije
    lr_model = LogisticRegression()

    # učenje modela na trening skupu podataka
    lr_model.fit(X_train, y_train)

    # predviđanje izlaza za test skup podataka
    y_pred = lr_model.predict(X_test)

    # izračunavanje tačnosti modela
    accuracy = lr_model.score(X_test, y_test)

    # ispisivanje tačnosti
    print("Logistic regression accuracy:", accuracy)

In [16]:
logistic_regression(X_train_u, y_train_u, X_test_u, y_test_u)

Logistic regression accuracy: 0.36585365853658536


  y = column_or_1d(y, warn=True)


## K-Nearest Neighbour

In [17]:
def knn(X_train, y_train, X_test, y_test, n_neighbors=5):
    # Inicijalizacija klasifikatora k-najbližih suseda
    knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    
    # Treniranje klasifikatora k-najbližih suseda na trening skupu podataka
    knn_model.fit(X_train, y_train)
    
    # Predviđanje ciljnih vrednosti na test skupu podataka
    y_pred = knn_model.predict(X_test)
    
    # Izračunavanje tačnosti predikcije
    accuracy = accuracy_score(y_test, y_pred)
    print("KNN accuracy: ", accuracy)

In [18]:
knn(X_train_u, y_train_u, X_test_u, y_test_u, 5)
knn(X_train_u, y_train_u, X_test_u, y_test_u, 3) 

  return self._fit(X, y)


KNN accuracy:  0.6747967479674797
KNN accuracy:  0.6747967479674797


  return self._fit(X, y)


## Decision tree

In [19]:
def decision_tree(X_train, y_train, X_test, y_test, max_depth=None, min_samples_split=2):
    # Inicijalizacija klasifikatora odlučivanja
    dtc = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split)
    
    # Treniranje klasifikatora odlučivanja na trening skupu podataka
    dtc.fit(X_train, y_train)
    
    # Predviđanje ciljnih vrednosti na test skupu podataka
    y_pred = dtc.predict(X_test)
    
    # Izračunavanje tačnosti predikcije
    accuracy = accuracy_score(y_test, y_pred)
    print("Decision tree accuracy: ", accuracy)

In [20]:
decision_tree(X_train_u, y_train_u, X_test_u, y_test_u, 1, 5)

Decision tree accuracy:  0.7317073170731707


## Bagging 

In [21]:
def bagging(X_train, y_train, X_test, y_test):
    # kreiranje objekta baznog modela - odlučivanje drveta (Decision Tree)
    base_model = DecisionTreeClassifier()

    # kreiranje objekta modela Bagging
    bagging_model = BaggingClassifier(estimator=base_model, n_estimators=10, random_state=42)

    # treniranje modela
    bagging_model.fit(X_train, y_train)

    # evaluacija modela
    accuracy = bagging_model.score(X_test, y_test)
    print("Bagging accuracy:", accuracy)

In [22]:
bagging(X_train_u, y_train_u, X_test_u, y_test_u)

Bagging accuracy: 0.7967479674796748


  y = column_or_1d(y, warn=True)


## XGBoost

In [23]:
def xgboost(X_train, y_train, X_test, y_test):
    # kreiranje objekta modela XGBoost
    xgb_model = xgb.XGBClassifier()

    # treniranje modela
    xgb_model.fit(X_train, y_train)

    # evaluacija modela
    accuracy = xgb_model.score(X_test, y_test)
    print("XGBoost accuracy:", accuracy)
    return xgb_model

In [24]:
def optimize_hyperparameters(model, X_train, y_train, X_test, y_test):
    # definisanje mreže hiperparametara koje želimo da isprobamo
    param_grid = {'max_depth': [3, 4, 5],
                  'learning_rate': [0.1, 0.01, 0.001],
                  'n_estimators': [50, 100, 200]}
#                   'subsample': [0.6, 0.8, 1.0],
#                   'colsample_bytree': [0.6, 0.8, 1.0]}

    # kreiranje objekta GridSearchCV
    grid_search = GridSearchCV(xgb_model, param_grid=param_grid, cv=5)

    # treniranje modela sa GridSearchCV objektom
    grid_search.fit(X_train, y_train)

    # evaluacija modela
    accuracy = grid_search.score(X_test, y_test)
    print('Optimizing hyperparameters:', accuracy)

In [25]:
xgb_model = xgboost(X_train_u, y_train_u, X_test_u, y_test_u)

XGBoost accuracy: 0.8048780487804879


In [26]:
# optimize_hyperparameters(xgb_model, X_train_u, y_train_u, X_test_u, y_test_u)