# Gaussian Naive Bayes - Cyanophyceae removed

## Without preprocessing

In [None]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

In [None]:
# Load the unprocessed train / test / validation splits. The files carry no
# header row, so pandas assigns integer column labels.
train_csv = pd.read_csv('data/no_processing/train_100.csv', header=None)
print("read train")
test_csv = pd.read_csv('data/no_processing/test_100.csv', header=None)
print("read test")
validation_csv = pd.read_csv('data/no_processing/val_100.csv', header=None)
print("read val")

# The last column is used as the label; every other column is a feature.
X_train, Y_train = train_csv.iloc[:, :-1], train_csv.iloc[:, -1]
X_test, Y_test = test_csv.iloc[:, :-1], test_csv.iloc[:, -1]

In [None]:
# Number of distinct label values in the training split (before any classes
# are filtered out).
Y_train.nunique()

In [None]:
from sklearn.preprocessing import StandardScaler

# Split the validation frame the same way as train/test: last column is the
# label, the remaining columns are features.
X_val = validation_csv.iloc[:, :-1]
Y_val = validation_csv.iloc[:, -1]

# Class labels excluded from every split in this experiment.
y_to_remove = [18, 19, 1, 37, 47, 10, 35, 9, 31, 2]

# Keep only the rows whose label is NOT in y_to_remove.
train_mask = np.isin(Y_train, y_to_remove, invert=True)
X_train = X_train[train_mask]
Y_train = Y_train[train_mask]

test_mask = np.isin(Y_test, y_to_remove, invert=True)
X_test = X_test[test_mask]
Y_test = Y_test[test_mask]

val_mask = np.isin(Y_val, y_to_remove, invert=True)
X_val = X_val[val_mask]
Y_val = Y_val[val_mask]

# StandardScaler returns a new array rather than modifying its input, so the
# result has to be assigned back; previously it was discarded and no scaling
# took place. The scaler is fitted on the training split only and the same
# statistics are applied to test/validation, so held-out data never
# influences the transform.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV
from sklearn.preprocessing import PowerTransformer

# 3 stratified folds, a single repeat, seeded so the splits are reproducible.
cv_method = RepeatedStratifiedKFold(n_splits=3,
                                    n_repeats=1,
                                    random_state=999)

# 30 log-spaced candidates for var_smoothing, from 1 down to 1e-9.
params_NB = {'var_smoothing': np.logspace(0, -9, num=30)}
model = GaussianNB()

# Exhaustive search over var_smoothing, scored with micro-averaged F1.
gs_NB = GridSearchCV(estimator=model,
                     param_grid=params_NB,
                     cv=cv_method,
                     verbose=1,
                     scoring='f1_micro')

# NOTE: a PowerTransformer used to be fitted on X_test here, but its output
# was never used anywhere, so the computation has been removed.
gs_NB.fit(X_train, Y_train)

In [None]:
# Hyper-parameter setting selected by the search fitted in the previous cell.
gs_NB.best_params_

In [None]:
# Cross-validated score of the selected setting, in the metric given to the
# search via `scoring`.
gs_NB.best_score_

In [None]:
# gs_NB was already fitted in the search cell above; fitting it again here
# would repeat the entire grid search for the same (deterministic) result,
# so the fitted object is reused directly.
Y_pred = gs_NB.predict(X_test)
print(classification_report(Y_test, Y_pred, zero_division=0))

### Ensembling - Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier

base_model = GaussianNB()
ensemble_model = BaggingClassifier(estimator=base_model, n_estimators=10)

# Calling fit() a second time discards the first fit, so fitting on the
# training split and then on the validation split left a model trained on
# validation data only. Train once on both splits stacked together instead.
# np.vstack / np.concatenate accept DataFrames/Series as well as arrays.
X_fit = np.vstack([X_train, X_val])
Y_fit = np.concatenate([Y_train, Y_val])
ensemble_model.fit(X_fit, Y_fit)
Y_bag_pred = ensemble_model.predict(X_test)

print(classification_report(Y_test, Y_bag_pred, zero_division=0))

### Ensembling - Boosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier

base_model = GaussianNB()
ensemble_model = AdaBoostClassifier(estimator=base_model, n_estimators=10)

# Calling fit() a second time discards the first fit, so fitting on the
# training split and then on the validation split left a model trained on
# validation data only. Train once on both splits stacked together instead.
# np.vstack / np.concatenate accept DataFrames/Series as well as arrays.
X_fit = np.vstack([X_train, X_val])
Y_fit = np.concatenate([Y_train, Y_val])
ensemble_model.fit(X_fit, Y_fit)
Y_boost_pred = ensemble_model.predict(X_test)

# Report the boosting predictions (the bagging predictions were printed here
# before by mistake).
print(classification_report(Y_test, Y_boost_pred, zero_division=0))

## With PCA

In [None]:
# Load the PCA-reduced train / test / validation splits. The files carry no
# header row, so pandas assigns integer column labels.
pca_train_csv = pd.read_csv('data/pca/train_pca_100.csv', header=None)
print("read pca_train")
pca_test_csv = pd.read_csv('data/pca/test_pca_100.csv', header=None)
print("read pca_test")
pca_validation_csv = pd.read_csv('data/pca/val_pca_100.csv', header=None)
print("read pca_val")

# The last column is used as the label; every other column is a feature.
X_pca_train, Y_pca_train = pca_train_csv.iloc[:, :-1], pca_train_csv.iloc[:, -1]
X_pca_test, Y_pca_test = pca_test_csv.iloc[:, :-1], pca_test_csv.iloc[:, -1]

In [None]:
# Number of distinct label values in the PCA training split (before any
# classes are filtered out).
Y_pca_train.nunique()

In [None]:
from sklearn.preprocessing import StandardScaler

# Split the validation frame into features / label and leave out its final
# row. Slicing positionally avoids calling drop(inplace=True) on objects
# derived from another frame.
# NOTE(review): the reason the last validation row is discarded is not
# visible in this notebook - confirm against the data preparation step.
X_pca_val = pca_validation_csv.iloc[:-1, :-1]
Y_pca_val = pca_validation_csv.iloc[:-1, -1]
print(X_pca_val)

# Class labels excluded from every split in this experiment.
y_to_remove = [18, 19, 1, 37, 47, 10, 35, 9, 31, 2]

# Keep only the rows whose label is NOT in y_to_remove.
train_mask = np.isin(Y_pca_train, y_to_remove, invert=True)
X_pca_train = X_pca_train[train_mask]
Y_pca_train = Y_pca_train[train_mask]

test_mask = np.isin(Y_pca_test, y_to_remove, invert=True)
X_pca_test = X_pca_test[test_mask]
Y_pca_test = Y_pca_test[test_mask]

val_mask = np.isin(Y_pca_val, y_to_remove, invert=True)
X_pca_val = X_pca_val[val_mask]
Y_pca_val = Y_pca_val[val_mask]

# StandardScaler returns a new array rather than modifying its input, so the
# result has to be assigned back; previously it was discarded and no scaling
# took place. The scaler is fitted on the training split only and the same
# statistics are applied to test/validation, so held-out data never
# influences the transform.
scaler = StandardScaler()
X_pca_train = scaler.fit_transform(X_pca_train)
X_pca_test = scaler.transform(X_pca_test)
X_pca_val = scaler.transform(X_pca_val)

In [None]:
import time

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV
from sklearn.preprocessing import PowerTransformer

st = time.time()

# 4 stratified folds repeated 3 times (12 fits per candidate), seeded so the
# splits are reproducible.
cv_method = RepeatedStratifiedKFold(n_splits=4,
                                    n_repeats=3,
                                    random_state=999)

# 5000 log-spaced candidates for var_smoothing, from 1 down to 1e-9; the
# randomized search only samples a subset of them.
params_NB = {'var_smoothing': np.logspace(0, -9, num=5000)}
pca_model = GaussianNB()

# random_state makes the sampled candidates (and therefore best_params_)
# reproducible between runs, matching the seeded CV splitter above.
gs_NB = RandomizedSearchCV(estimator=pca_model,
                           param_distributions=params_NB,
                           cv=cv_method,
                           verbose=1,
                           scoring='accuracy',
                           random_state=999)

gs_NB.fit(X_pca_train, Y_pca_train)
end = time.time()
elapsed = end - st
print("elapsed time", elapsed)

In [None]:
# Hyper-parameter setting selected by the randomized search fitted in the
# previous cell.
gs_NB.best_params_

In [None]:
# Cross-validated score of the selected setting, in the metric given to the
# search via `scoring`.
gs_NB.best_score_

In [None]:
# Predict labels for the PCA test split with the search object fitted in the
# search cell above; it is deliberately not re-fitted here.
Y_pca_pred = gs_NB.predict(X_pca_test)

In [None]:
# Per-class precision / recall / F1 on the PCA test split; zero_division=0
# reports 0 instead of warning for classes that have no predicted samples.
print(classification_report(Y_pca_test, Y_pca_pred, zero_division=0))

In [None]:
# Persist the fitted search object (not just the best estimator) to disk.
# `load` is imported alongside `dump` but is not used in this cell.
from joblib import dump, load
dump(gs_NB, 'gaussian_nb_tuned.joblib')

### Ensembling - Bagging


In [None]:
from sklearn.ensemble import BaggingClassifier
import warnings
import time
# Silences every warning for the rest of the session.
warnings.filterwarnings('ignore')
st = time.time()

# Bag copies of the tuned GaussianNB found by the search. Passing the whole
# search object as the base estimator would re-run a randomized
# hyper-parameter search inside each of the 50 bags.
ensemble_model = BaggingClassifier(estimator=gs_NB.best_estimator_, n_estimators=50)

# Calling fit() a second time discards the first fit, so fitting on the
# training split and then on the validation split left a model trained on
# validation data only. Train once on both splits stacked together instead.
# np.vstack / np.concatenate accept DataFrames/Series as well as arrays.
X_pca_fit = np.vstack([X_pca_train, X_pca_val])
Y_pca_fit = np.concatenate([Y_pca_train, Y_pca_val])
ensemble_model.fit(X_pca_fit, Y_pca_fit)
Y_bag_pred = ensemble_model.predict(X_pca_test)
end = time.time()

print(classification_report(Y_pca_test, Y_bag_pred, zero_division=0))
print("time: ", end - st)

### Ensembling - Boosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier
import time
st = time.time()

# AdaBoost needs a base estimator whose fit() accepts sample_weight. The
# search object's fit() does not expose that parameter, so boost the tuned
# GaussianNB it selected rather than the search object itself.
ensemble_model = AdaBoostClassifier(estimator=gs_NB.best_estimator_, n_estimators=50)

# Calling fit() a second time discards the first fit, so fitting on the
# training split and then on the validation split left a model trained on
# validation data only. Train once on both splits stacked together instead.
# np.vstack / np.concatenate accept DataFrames/Series as well as arrays.
X_pca_fit = np.vstack([X_pca_train, X_pca_val])
Y_pca_fit = np.concatenate([Y_pca_train, Y_pca_val])
ensemble_model.fit(X_pca_fit, Y_pca_fit)
# Stored under its own name so the bagging predictions are not overwritten.
Y_boost_pred = ensemble_model.predict(X_pca_test)
end = time.time()

print(classification_report(Y_pca_test, Y_boost_pred, zero_division=0))
print("time: ", end - st)