# Bernoulli Naive Bayes - removed cyanophycae

## Without preprocessing

In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report

In [1]:
train_csv = pd.read_csv('data/no_processing/train_100.csv', header=None)
print("read train")
test_csv = pd.read_csv('data/no_processing/test_100.csv', header=None)
print("read test")
validation_csv = pd.read_csv('data/no_processing/val_100.csv', header=None)
print("read val")
Y_train = train_csv.iloc[:, -1]
X_train = train_csv.iloc[:,:-1]
X_test = test_csv.iloc[:,:-1]
Y_test = test_csv.iloc[:, -1]

read train
read test
read val


In [2]:
Y_train.nunique()

50

In [3]:
X_val = validation_csv.iloc[:, :-1]
Y_val = validation_csv.iloc[:, -1]

In [3]:
y_to_remove = [18, 19, 1, 37, 47, 10, 35, 9, 31, 2]

train_mask = np.isin(Y_train, y_to_remove, invert=True)
X_train = X_train[train_mask]
Y_train = Y_train[train_mask]

test_mask = np.isin(Y_test, y_to_remove, invert=True)
X_test = X_test[test_mask]
Y_test = Y_test[test_mask]

val_mask = np.isin(Y_val, y_to_remove, invert=True)
X_val = X_val[val_mask]
Y_val = Y_val[val_mask]

In [4]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit_transform(X_train)
scaler.fit_transform(X_test)
scaler.fit_transform(X_val)

array([[-0.16512934, -0.16518021, -0.16526912, ..., -0.16076251,
        -0.16076376, -0.16076075],
       [ 0.53273529,  0.53267264,  0.53258152, ...,  0.53747973,
         0.53748393,  0.53747385],
       [ 0.18380298,  0.18374621,  0.1836562 , ...,  0.18835861,
         0.18836008,  0.18835655],
       ...,
       [-0.51406165, -0.51410663, -0.51419443, ..., -0.50988363,
        -0.50988761, -0.50987805],
       [ 0.18380298,  0.18374621,  0.1836562 , ...,  0.18835861,
         0.18836008,  0.18835655],
       [-1.21192628, -1.21195948, -1.21204507, ..., -1.20812587,
        -1.2081353 , -1.20811266]])

In [5]:
model = BernoulliNB()
model.fit(X_train, Y_train)
model.fit(X_val, Y_val)
Y_pred = model.predict(X_test)

In [6]:
from sklearn.metrics import f1_score, accuracy_score

print(f1_score(Y_test, Y_pred, average='micro'))
print(classification_report(Y_test, Y_pred, zero_division=0))

0.5434826677478208
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         4
         3.0       0.00      0.00      0.00        20
         4.0       0.16      0.28      0.20        72
         5.0       0.00      0.00      0.00         8
         6.0       0.56      0.40      0.47       208
         7.0       1.00      0.18      0.31        33
         8.0       0.00      0.00      0.00        15
        11.0       0.67      0.05      0.10        37
        12.0       0.20      0.01      0.02       108
        13.0       0.62      0.84      0.71      1025
        14.0       0.00      0.00      0.00        16
        15.0       0.00      0.00      0.00        31
        16.0       0.49      0.48      0.48       216
        17.0       1.00      0.06      0.11        34
        20.0       0.00      0.00      0.00        16
        21.0       0.36      0.50      0.41       338
        22.0       0.00      0.00      0.00         4
        

### Ensembling - Bagging

In [7]:
from sklearn.ensemble import BaggingClassifier

base_model = BernoulliNB()
ensemble_model = BaggingClassifier(estimator=base_model, n_estimators=10)

ensemble_model.fit(X_train, Y_train)
ensemble_model.fit(X_val, Y_val)
Y_bag_pred = ensemble_model.predict(X_test)


In [8]:
print(classification_report(Y_test, Y_bag_pred, zero_division=0))


              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         4
         3.0       0.00      0.00      0.00        20
         4.0       0.17      0.22      0.19        72
         5.0       0.00      0.00      0.00         8
         6.0       0.52      0.45      0.48       208
         7.0       0.00      0.00      0.00        33
         8.0       0.00      0.00      0.00        15
        11.0       0.00      0.00      0.00        37
        12.0       0.00      0.00      0.00       108
        13.0       0.61      0.85      0.71      1025
        14.0       0.00      0.00      0.00        16
        15.0       0.00      0.00      0.00        31
        16.0       0.48      0.49      0.49       216
        17.0       1.00      0.03      0.06        34
        20.0       0.00      0.00      0.00        16
        21.0       0.36      0.48      0.41       338
        22.0       0.00      0.00      0.00         4
        23.0       0.00    

### Ensembling - Boosting

In [9]:
from sklearn.ensemble import AdaBoostClassifier

base_model = BernoulliNB()
ensemble_model = AdaBoostClassifier(estimator=base_model, n_estimators=10)

ensemble_model.fit(X_train, Y_train)
ensemble_model.fit(X_val, Y_val)
Y_bag_pred = ensemble_model.predict(X_test)

print(classification_report(Y_test, Y_bag_pred, zero_division=0))

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         4
         3.0       0.00      0.00      0.00        20
         4.0       0.15      0.44      0.22        72
         5.0       0.00      0.00      0.00         8
         6.0       0.00      0.00      0.00       208
         7.0       0.00      0.00      0.00        33
         8.0       0.00      0.00      0.00        15
        11.0       0.00      0.00      0.00        37
        12.0       0.02      0.84      0.05       108
        13.0       0.00      0.00      0.00      1025
        14.0       0.00      0.00      0.00        16
        15.0       0.00      0.00      0.00        31
        16.0       0.00      0.00      0.00       216
        17.0       0.00      0.00      0.00        34
        20.0       0.00      0.00      0.00        16
        21.0       0.00      0.00      0.00       338
        22.0       0.00      0.00      0.00         4
        23.0       0.00    

## With PCA

In [10]:
pca_train_csv = pd.read_csv('data/pca/train_pca_100.csv', header=None)
print("read pca_train")
pca_test_csv = pd.read_csv('data/pca/test_pca_100.csv', header=None)
print("read pca_test")
pca_validation_csv = pd.read_csv('data/pca/val_pca_100.csv', header=None)
print("read pca_val")
Y_pca_train = pca_train_csv.iloc[:, -1]
X_pca_train = pca_train_csv.iloc[:,:-1]
X_pca_test = pca_test_csv.iloc[:,:-1]
Y_pca_test = pca_test_csv.iloc[:, -1]

read pca_train
read pca_test
read pca_val


In [11]:
Y_pca_train.nunique()

50

In [12]:
X_pca_val = pca_validation_csv.iloc[:, :-1]
Y_pca_val = pca_validation_csv.iloc[:, -1]
X_pca_val.drop(X_pca_val.tail(1).index,inplace=True)
Y_pca_val.drop(Y_pca_val.tail(1).index, inplace=True)
print(X_pca_val)

             0          1          2          3         4          5   \
0    -16.466336  -2.714494  11.992666 -10.834296 -0.900435  -3.540316   
1     -8.783142   3.957514  -2.074658   9.007975  0.816249   1.089003   
2    -17.093755  33.573846   5.749220  -3.429278 -0.537504  -3.497083   
3    -10.569433  -9.104736   2.482312  10.366223  0.910529  -3.078894   
4      2.345496 -11.760298 -10.589292  -4.828380  1.360099  15.326894   
...         ...        ...        ...        ...       ...        ...   
3246 -11.496391  -8.861822   5.634696 -14.977110  0.699933   7.998366   
3247  -2.400615   5.305121  -8.608640  10.670448  1.238588   0.343435   
3248  -2.273494   6.826449  -3.050299   9.093809  1.448333  -7.470119   
3249   4.975186  39.419058 -17.034515  -4.816109  3.506012   7.746236   
3250 -14.590118  40.628793  -3.396723   9.476272  0.752511   2.059976   

            6         7          8          9   ...        90        91  \
0     0.895354  3.264754   1.058304   1.483603  

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_pca_val.drop(X_pca_val.tail(1).index,inplace=True)


In [13]:
y_to_remove = [18, 19, 1, 37, 47, 10, 35, 9, 31, 2]

train_mask = np.isin(Y_pca_train, y_to_remove, invert=True)
X_pca_train = X_pca_train[train_mask]
Y_pca_train = Y_pca_train[train_mask]

test_mask = np.isin(Y_pca_test, y_to_remove, invert=True)
X_pca_test = X_pca_test[test_mask]
Y_pca_test = Y_pca_test[test_mask]

val_mask = np.isin(Y_pca_val, y_to_remove, invert=True)
X_pca_val = X_pca_val[val_mask]
Y_pca_val = Y_pca_val[val_mask]


In [14]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaler.fit_transform(X_pca_train)
scaler.fit_transform(X_pca_test)
scaler.fit_transform(X_pca_val)

array([[-0.33819592,  0.1312604 , -0.05546641, ..., -0.30820458,
         0.80524538,  0.42536806],
       [-0.43783136, -0.53153036,  0.30623968, ..., -0.29034844,
        -0.16813204, -0.13781354],
       [ 0.36535118, -0.1758031 , -0.58086436, ..., -0.20793324,
         1.29997204, -0.43141635],
       ...,
       [ 0.01780757,  0.19963922, -0.5740963 , ...,  0.09412514,
         0.1849537 , -0.98100148],
       [ 0.02489807,  0.27683284, -0.13290716, ...,  0.15680207,
         0.69311946,  0.57715795],
       [-0.66209643,  1.99199532, -0.16040426, ..., -0.47081438,
        -0.35236594,  0.14080632]])

In [15]:
pca_model = BernoulliNB()
pca_model.fit(X_pca_train, Y_pca_train)
pca_model.fit(X_pca_val, Y_pca_val)
Y_pca_pred = pca_model.predict(X_pca_test)

In [16]:
from sklearn.metrics import f1_score

print(f1_score(Y_pca_test, Y_pca_pred, average='micro'))

print(classification_report(Y_pca_test, Y_pca_pred, zero_division=0))

0.47536995742955607
              precision    recall  f1-score   support

         0.0       0.18      0.50      0.27         4
         3.0       0.48      0.65      0.55        20
         4.0       0.17      0.28      0.21        72
         5.0       0.00      0.00      0.00         8
         6.0       0.40      0.47      0.43       208
         7.0       0.53      0.30      0.38        33
         8.0       0.00      0.00      0.00        15
        11.0       0.14      0.08      0.10        37
        12.0       0.13      0.17      0.14       108
        13.0       0.78      0.45      0.57      1025
        14.0       0.01      0.06      0.02        16
        15.0       0.20      0.42      0.27        31
        16.0       0.56      0.57      0.57       216
        17.0       0.22      0.26      0.24        34
        20.0       0.00      0.00      0.00        16
        21.0       0.46      0.55      0.50       338
        22.0       0.00      0.00      0.00         4
       

### Ensembling - Bagging

In [17]:
from sklearn.ensemble import BaggingClassifier

base_model = BernoulliNB()
ensemble_model = BaggingClassifier(estimator=base_model, n_estimators=10)

ensemble_model.fit(X_pca_train, Y_pca_train)
ensemble_model.fit(X_pca_val, Y_pca_val)
Y_bag_pred = ensemble_model.predict(X_pca_test)

print(classification_report(Y_pca_test, Y_bag_pred, zero_division=0))

              precision    recall  f1-score   support

         0.0       0.22      0.50      0.31         4
         3.0       0.52      0.60      0.56        20
         4.0       0.19      0.22      0.20        72
         5.0       0.00      0.00      0.00         8
         6.0       0.41      0.55      0.47       208
         7.0       0.75      0.27      0.40        33
         8.0       0.00      0.00      0.00        15
        11.0       0.18      0.08      0.11        37
        12.0       0.13      0.15      0.14       108
        13.0       0.76      0.48      0.59      1025
        14.0       0.02      0.06      0.03        16
        15.0       0.28      0.35      0.31        31
        16.0       0.53      0.57      0.55       216
        17.0       0.32      0.24      0.27        34
        20.0       0.00      0.00      0.00        16
        21.0       0.45      0.57      0.50       338
        22.0       0.00      0.00      0.00         4
        23.0       0.00    

### Ensembling - Boosting


In [18]:
from sklearn.ensemble import AdaBoostClassifier

base_model = BernoulliNB()
ensemble_model = AdaBoostClassifier(estimator=base_model, n_estimators=60)

ensemble_model.fit(X_pca_train, Y_pca_train)
ensemble_model.fit(X_pca_val, Y_pca_val)
Y_bag_pred = ensemble_model.predict(X_pca_test)

print(classification_report(Y_pca_test, Y_bag_pred, zero_division=0))

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         4
         3.0       0.00      0.00      0.00        20
         4.0       0.23      0.07      0.11        72
         5.0       0.00      0.00      0.00         8
         6.0       0.30      0.60      0.40       208
         7.0       1.00      0.06      0.11        33
         8.0       0.00      0.00      0.00        15
        11.0       0.00      0.00      0.00        37
        12.0       0.17      0.01      0.02       108
        13.0       0.61      0.88      0.72      1025
        14.0       0.00      0.00      0.00        16
        15.0       0.00      0.00      0.00        31
        16.0       0.42      0.40      0.41       216
        17.0       0.40      0.06      0.10        34
        20.0       0.00      0.00      0.00        16
        21.0       0.50      0.34      0.40       338
        22.0       0.00      0.00      0.00         4
        23.0       0.00    