In [112]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [113]:
train = pd.read_csv('cleaned_dataset/train.csv')
test = pd.read_csv('cleaned_dataset/test.csv')

In [114]:
train['species_PIPIENS/RESTUANS']

0        1
1        1
2        1
3        1
4        1
        ..
10501    1
10502    1
10503    1
10504    1
10505    1
Name: species_PIPIENS/RESTUANS, Length: 10506, dtype: int64

In [115]:
X = train.drop(['wnvpresent', 'nummosquitos', 'species_PIPIENS/RESTUANS'], axis=1)
y = train['species_PIPIENS/RESTUANS']

In [116]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy=0.5)

In [117]:
# Create an instance of StandardScaler
scaler = MinMaxScaler(feature_range=(0, 1))

# Transform the data using the scaler
scaled_X = scaler.fit_transform(X)
species = test['species']
test.drop('species', axis=1, inplace =True)
test = test.reindex(columns=X.columns)
scaled_test = scaler.transform(test)

In [118]:
X_resampled, y_resampled = smote.fit_resample(scaled_X, y)

In [119]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.40)

In [120]:
# Define the models
model_rf = RandomForestClassifier()
model_dt = DecisionTreeClassifier()
model_knn = KNeighborsClassifier(n_neighbors=15)
model_gb = GradientBoostingClassifier()
model_ab = AdaBoostClassifier()

In [121]:
# List of models
models = [model_rf, model_dt, model_knn, model_gb, model_ab]


# Loop over the models
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'Model: {model.__class__.__name__}, Train score is {model.score(X_train, y_train)}, Test score is {model.score(X_test, y_test)}')
    cm = confusion_matrix(y_test, y_pred)
    cm_df = pd.DataFrame(cm, columns = ['Pred 0', 'Pred 1'], index = ['Actual 0','Actual 1'])
    display(cm_df)
    print(classification_report(y_test, y_pred))
    print()

Model: RandomForestClassifier, Train score is 0.978955402900447, Test score is 0.9551921504497138


Unnamed: 0,Pred 0,Pred 1
Actual 0,1866,161
Actual 1,113,3975


              precision    recall  f1-score   support

           0       0.94      0.92      0.93      2027
           1       0.96      0.97      0.97      4088

    accuracy                           0.96      6115
   macro avg       0.95      0.95      0.95      6115
weighted avg       0.96      0.96      0.96      6115


Model: DecisionTreeClassifier, Train score is 0.978955402900447, Test score is 0.9386753883892068


Unnamed: 0,Pred 0,Pred 1
Actual 0,1824,203
Actual 1,172,3916


              precision    recall  f1-score   support

           0       0.91      0.90      0.91      2027
           1       0.95      0.96      0.95      4088

    accuracy                           0.94      6115
   macro avg       0.93      0.93      0.93      6115
weighted avg       0.94      0.94      0.94      6115


Model: KNeighborsClassifier, Train score is 0.8332788136517283, Test score is 0.8135731807031888


Unnamed: 0,Pred 0,Pred 1
Actual 0,1866,161
Actual 1,979,3109


              precision    recall  f1-score   support

           0       0.66      0.92      0.77      2027
           1       0.95      0.76      0.85      4088

    accuracy                           0.81      6115
   macro avg       0.80      0.84      0.81      6115
weighted avg       0.85      0.81      0.82      6115


Model: GradientBoostingClassifier, Train score is 0.9224730127576055, Test score is 0.9156173344235486


Unnamed: 0,Pred 0,Pred 1
Actual 0,1609,418
Actual 1,98,3990


              precision    recall  f1-score   support

           0       0.94      0.79      0.86      2027
           1       0.91      0.98      0.94      4088

    accuracy                           0.92      6115
   macro avg       0.92      0.88      0.90      6115
weighted avg       0.92      0.92      0.91      6115


Model: AdaBoostClassifier, Train score is 0.8649002289826627, Test score is 0.8593622240392478


Unnamed: 0,Pred 0,Pred 1
Actual 0,1444,583
Actual 1,277,3811


              precision    recall  f1-score   support

           0       0.84      0.71      0.77      2027
           1       0.87      0.93      0.90      4088

    accuracy                           0.86      6115
   macro avg       0.85      0.82      0.83      6115
weighted avg       0.86      0.86      0.86      6115




In [122]:
import pickle

with open('../Streamlit/speciespredict.pkl', 'wb') as file:
    pickle.dump(model_rf, file)