# Algorithms

In [15]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

### Ucitavanje pretprocesiranih podataka

In [2]:
def dataset_partitioning(X_balanced, y_balanced):
    X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.111, random_state=42, stratify=y_balanced)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42, stratify=y_train)

    print('Broj ciljnih vrednosti \'closed\' u trening skupu:', np.sum(y_train==0))
    print('Broj ciljnih vrednosti \'closed\' u test skupu:', np.sum(y_test==0))
    print('Broj ciljnih vrednosti \'closed\' u validacionom skupu:', np.sum(y_val==0))
    return X_train, y_train, X_test, y_test, X_val, y_val

In [3]:
X_o = pd.read_csv('undersampling_x.csv')
y_o = pd.read_csv('undersampling_y.csv')
y_o = y_o.values.reshape(-1)

In [4]:
X_undersampling = pd.read_csv('oversampling_x.csv')
y_undersampling = pd.read_csv('oversampling_y.csv')
y_undersampling = y_undersampling.values.reshape(-1)

In [5]:
X_train_u, y_train_u, X_test_u, y_test_u, X_val_u, y_val_u = dataset_partitioning(X_undersampling, y_undersampling)

Broj ciljnih vrednosti 'closed' u trening skupu: 18591
Broj ciljnih vrednosti 'closed' u test skupu: 2580
Broj ciljnih vrednosti 'closed' u validacionom skupu: 2066


## Precision, recall i F1 mera: za manjinsku klasu i ceo dataset

In [6]:
def get_precision_recall_f1score_for_minority_class(y_test, y_test_pred):
    # Izvlacenje **manjiske klase**
    y_test1 = np.array(y_test)
    y_test_pred1 = np.array(y_test_pred)

    # Izdvajanje indeksa gde se nalaze vrednosti 0 u y_test
    idx = np.where(y_test1 == 0)[0]

    # Izdvajanje samo vrednosti 0 iz y_test i y_test_pred pomocu indeksa
    y_test1 = y_test1[idx]
    y_test_pred1 = y_test_pred1[idx]

    precision = precision_score(y_test1, y_test_pred1, pos_label=0)
    recall = recall_score(y_test1, y_test_pred1, pos_label=0)
    f1 = f1_score(y_test1, y_test_pred1, pos_label=0)

    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1-score: ", f1)

In [7]:
def get_precision_recall_f1score(y_test, y_test_pred):
    precision = precision_score(y_test, y_test_pred)
    recall = recall_score(y_test, y_test_pred)
    f1 = f1_score(y_test, y_test_pred)

    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1-score: ", f1)

## Random Forest 

In [8]:
# Kreiraj instancu Random Forest klasifikatora sa 10 stabala
def random_forest(X_train, y_train, X_test, y_test, n_estimator): 
    rfc = RandomForestClassifier(n_estimators=n_estimator)

    # Obuči model na X_train i y_train podacima
    rfc.fit(X_train, y_train)

    # Predvidi klase za X_train i X_test podatke
    y_train_pred = rfc.predict(X_train)
    y_test_pred = rfc.predict(X_test)

    # Izračunaj tačnost modela
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    print("Train Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
#     print(y_test_pred)
    return y_test_pred

In [9]:
# Random Forest
y_rnd_forest_u_pred = random_forest(X_train_u, y_train_u, X_test_u, y_test_u, 10)
print("\n*** Precision recall i f1 mera nad celim skupom ***")
get_precision_recall_f1score(y_test_u, y_rnd_forest_u_pred)
print("\n*** Precision recall i f1 mera nad manjinskoj klasi ***")
get_precision_recall_f1score_for_minority_class(y_test_u, y_rnd_forest_u_pred)

Train Accuracy: 0.9901030040609956
Test Accuracy: 0.9730567939523164

*** Precision recall i f1 mera nad celim skupom ***
Precision:  0.9780564263322884
Recall:  0.9678169833268708
F1-score:  0.9729097641785226

*** Precision recall i f1 mera nad manjinskoj klasi ***
Precision:  1.0
Recall:  0.9782945736434109
F1-score:  0.9890282131661442


## Linearna regresija (TODO: nije na spisku - obrisati?)

In [16]:
def linear_regression(X_train, y_train, X_test, y_test):
    reg = LinearRegression()

    # Obuči model na trening podacima
    reg.fit(X_train, y_train)

    # Predvidi vrednosti za test podatke
    y_pred = reg.predict(X_test)

    # Izračunaj srednju kvadratnu grešku
    mse = mean_squared_error(y_test, y_pred)

    # Prikaz rezultata
    print("Koeficijenti regresije:", reg.coef_)
    print("Intercept:", reg.intercept_)
    print("Srednja kvadratna greška:", mse)
#     print(y_pred)
    return y_pred

In [17]:
y_lr_u_pred = linear_regression(X_train_u, y_train_u, X_test_u, y_test_u)

Koeficijenti regresije: [ 5.57847225e-05 -5.75094778e-04 -1.14142926e-02  1.51955368e-10
  1.09839786e-02  9.58689327e-04  2.76285817e-04  1.37444186e-01
 -2.09388420e-02  7.97522016e-01  8.98107375e-01  9.28167449e-01
  8.89023166e-01  8.55291772e-01  8.89863837e-01  9.45852852e-01
  7.37548364e-01  9.20344150e-01  8.42737356e-01  8.97422131e-01
  8.36630111e-01  5.60453005e-01  9.13908026e-01  7.82166176e-01
  9.43752346e-01  8.88106819e-01  9.18545949e-01  9.35484245e-01
  9.64161018e-01  9.01999871e-01  4.11058092e-01  7.32599695e-01
  8.89011471e-01  9.90475392e-01  8.94440590e-01  9.15507149e-01
  9.16571482e-01  8.77965088e-01  9.13678586e-01  8.37464576e-01
  8.88215269e-01  9.03335993e-01  8.54167332e-01  9.23073916e-01
  8.81199188e-01  7.82823280e-01  8.50912689e-01  8.60738419e-01
  4.91355965e-01  8.21852058e-01  4.41531408e-01 -5.49961648e-02
  5.49961647e-02]
Intercept: 0.02213355561358127
Srednja kvadratna greška: 0.0727173774451587


## Logisticka regresija

In [11]:
def logistic_regression(X, y, learning_rate, num_iterations):
    # Inicijaliziraj težine i pristranosti na nulu
    weights = np.zeros(X.shape[1])
    bias = 0

    # broj uzoraka
    m = X.shape[0]

    # Prolazak kroz broj iteracija
    for i in range(num_iterations):
        # Izračunaj linearnu funkciju
        linear_function = np.dot(X, weights) + bias

        # Izračunaj sigmoidnu funkciju
        y_predicted = 1 / (1 + np.exp(-linear_function))

        # Izračunaj gradijent
        dw = (1 / m) * np.dot(X.T, (y_predicted - y))
        db = (1 / m) * np.sum(y_predicted - y)

        # Ažuriraj težine i pristranost
        weights = weights - learning_rate * dw
        bias = bias - learning_rate * db

    # Vrati težine i pristranost
    parameters = {"weights": weights, "bias": bias}
    return parameters

## K-Nearest Neighbour

In [12]:
def knn(X_train, y_train, X_test, y_test, n_neighbors=5):
    # Inicijalizacija klasifikatora k-najbližih suseda
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    
    # Treniranje klasifikatora k-najbližih suseda na trening skupu podataka
    knn.fit(X_train, y_train)
    
    # Predviđanje ciljnih vrednosti na test skupu podataka
    y_pred = knn.predict(X_test)
    
    # Izračunavanje tačnosti predikcije
    accuracy = accuracy_score(y_test, y_pred)
     print("KNN accuracy: ", accuracy)
    

Preciznost KNN klasifikatora je: 1.0


In [None]:
knn(X_train_u, y_train_u, X_test_u, y_test_u)

## Decision tree

In [30]:
def decision_tree(X_train, y_train, X_test, y_test, max_depth=None, min_samples_split=2):
    # Inicijalizacija klasifikatora odlučivanja
    dtc = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split)
    
    # Treniranje klasifikatora odlučivanja na trening skupu podataka
    dtc.fit(X_train, y_train)
    
    # Predviđanje ciljnih vrednosti na test skupu podataka
    y_pred = dtc.predict(X_test)
    
    # Izračunavanje tačnosti predikcije
    accuracy = accuracy_score(y_test, y_pred)
    print("Decision tree accuracy: ", accuracy)

In [34]:
decision_tree(X_train_u, y_train_u, X_test_u, y_test_u, 1, 5)

Decision tree accuracy:  0.723589842992828
