# Modelling

In [48]:
import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Perceptron

from keras.models import Sequential
from keras.layers import *

from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline

In [None]:
# Load the feature table (X) and the target table (y).
# NOTE(review): both paths are empty strings, so these reads cannot succeed as
# written -- fill in the real CSV locations before running this cell.
# NOTE(review): y is read as a DataFrame; presumably it holds a single target
# column -- confirm against the file that is actually used.
X = pd.read_csv('')
y = pd.read_csv('')

In [58]:
# Hold out 33% of the rows for evaluation. Stratifying on y keeps the class
# balance comparable between the two partitions, and the fixed seed makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=42,
)

In [59]:
# --- Preprocessing steps -----------------------------------------------------
# Despite its name, `standardize` is a min-max scaler: it rescales each numeric
# feature into the [0, 1] range rather than z-scoring it.
standardize = MinMaxScaler(feature_range=(0, 1))
# handle_unknown='ignore' lets the encoder cope with categories at predict time
# that were not present when it was fitted.
ohe = OneHotEncoder(handle_unknown='ignore')

# --- Resampler and classifier ------------------------------------------------
# Both are seeded so that repeated runs give the same result.
smtom = SMOTETomek(random_state=42)
perceptron = Perceptron(random_state=42)

In [61]:
# Route columns to their preprocessing step: three columns are min-max scaled,
# 'Species' is one-hot encoded.
# NOTE(review): no `remainder` is given, so any other column in X is presumably
# dropped by ColumnTransformer's default -- confirm that this is intended.
col_transform = ColumnTransformer(
    transformers=[
        ('ss', standardize, ['Trap', 'AddressAccuracy', 'Satellite']),
        ('oe', ohe, ['Species']),
    ]
)

In [65]:
# End-to-end model: column preprocessing -> SMOTETomek resampling -> Perceptron.
# `Pipeline` here is the imblearn one (see imports), which is what allows a
# resampling step to sit between the transformer and the classifier.
perceptron_pipe = Pipeline(
    steps=[
        ('transform', col_transform),
        ('sample', smtom),
        ('clf', perceptron),
    ]
)

In [67]:
# 5-fold cross-validated recall on the training partition; the cell displays
# the mean across folds.
recall_scores = cross_val_score(
    perceptron_pipe,
    X_train,
    y_train,
    scoring='recall',
    cv=5,
)
recall_scores.mean()

0.5172475338868782

In [68]:
# Fit the whole pipeline (preprocessing, resampling, classifier) on the
# training partition.
perceptron_pipe.fit(X_train, y_train)

In [69]:
# Score the fitted pipeline on the same rows it was trained on, for comparison
# with the held-out score in the next cell.
perceptron_pipe.score(X_train, y_train)

0.3379741440545532

In [70]:
# Score the fitted pipeline on the held-out partition. The value displayed
# agrees with the accuracy line of the classification report further down.
perceptron_pipe.score(X_test, y_test)

0.34064032304586095

In [71]:
# Predict on the held-out partition and print per-class precision / recall / F1.
# In the recorded output the positive class (1) reaches high recall but very
# low precision, i.e. the model flags most rows as positive.
preds = perceptron_pipe.predict(X_test)
print(
    classification_report(y_true=y_test, y_pred=preds)
)

              precision    recall  f1-score   support

           0       0.98      0.31      0.47      3285
           1       0.07      0.88      0.12       182

    accuracy                           0.34      3467
   macro avg       0.52      0.60      0.30      3467
weighted avg       0.93      0.34      0.45      3467



In [72]:
# Load the unlabeled test set used to build the submission file.
# Path is relative to the notebook's working directory.
df_test = pd.read_csv('../assets/test.csv')

In [75]:
# Predict labels for every row of the submission test set.
# NOTE(review): the pipeline selects 'Trap', 'AddressAccuracy', 'Satellite' and
# 'Species' by name, so df_test presumably already carries those columns in the
# same (preprocessed) form as X -- confirm.
results = perceptron_pipe.predict(df_test)

In [76]:
def create_predictions(predictions, filename, ids=None, output_dir=".."):
    """Write a two-column ``Id,WnvPresent`` submission CSV.

    Ids and predictions are paired by position. The previous implementation
    joined them on their index with an inner merge, which silently dropped
    rows whenever the two inputs had different lengths (or misaligned them if
    ``predictions`` carried its own non-default index).

    Parameters
    ----------
    predictions : array-like
        One predicted label per row, in the same order as the ids. May be 1-D
        or a 2-D structure with exactly one column.
    filename : str
        Base name of the output file, without the ``.csv`` extension.
    ids : array-like, optional
        Row identifiers to write in the ``Id`` column. When omitted, the
        ``Id`` column of the notebook-level ``df_test`` is used, as before.
    output_dir : str, optional
        Directory the file is written to. Defaults to ``".."``, the location
        the function has always written to.

    Raises
    ------
    ValueError
        If ``predictions`` does not have exactly one column, or if the number
        of predictions differs from the number of ids.
    """
    # Normalise predictions to a single positional column.
    pred_frame = pd.DataFrame(predictions).reset_index(drop=True)
    if pred_frame.shape[1] != 1:
        raise ValueError(
            f"predictions must have exactly one column, got {pred_frame.shape[1]}"
        )

    # Fall back to the notebook's global test frame only when no ids are given.
    id_source = df_test["Id"] if ids is None else ids
    id_column = pd.Series(id_source).reset_index(drop=True)

    # Fail loudly instead of writing a truncated submission.
    if len(id_column) != len(pred_frame):
        raise ValueError(
            f"got {len(pred_frame)} predictions for {len(id_column)} ids"
        )

    results_df = pd.DataFrame(
        {"Id": id_column, "WnvPresent": pred_frame.iloc[:, 0]}
    )
    results_df.to_csv(f"{output_dir}/{filename}.csv", index=False)

In [77]:
# Write the submission for the Perceptron pipeline to ../predictions1.csv.
create_predictions(results, 'predictions1')