In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, 
    recall_score,
    precision_score,
    roc_auc_score,
    make_scorer, 
    roc_curve, 
    precision_recall_curve, 
    auc
)

# Show every column when a wide DataFrame is displayed (None removes the limit).
# Use the canonical option key "display.max_columns": a dotted spelling such as
# "display.max.columns" only resolves because pandas matches option names as
# regular expressions, where "." happens to match "_".
pd.set_option("display.max_columns", None)
# Use the 'fivethirtyeight' style sheet for the matplotlib figures drawn in this notebook.
plt.style.use('fivethirtyeight')

In [23]:
# Load the cleaned training and weather tables.
# Both paths are relative, so they resolve against the notebook's working directory.
train_cleaned = pd.read_csv('../Assets/train_cleaned.csv')
weather_cleaned = pd.read_csv('../Assets/weather_cleaned.csv')

In [24]:
# Dimensions of the weather table as loaded (before the stations are averaged).
weather_cleaned.shape

(2944, 15)

In [25]:
# Dimensions of the training table as loaded; the row count is the baseline
# to compare against after the weather merge.
train_cleaned.shape

(9693, 166)

In [26]:
# Collapse the station 1 / station 2 readings into one row per Date by taking
# the mean of every measurement. The Station identifier is removed up front
# because its average carries no meaning. Date becomes the index of the result.
weather_cleaned = (
    weather_cleaned
    .drop(columns='Station')
    .groupby('Date')
    .mean()
)

In [27]:
# Turn the Date index back into an ordinary column and discard the
# engineered Year / Month / Day columns.
weather_cleaned = (
    weather_cleaned
    .reset_index()
    .drop(columns=['Year', 'Month', 'Day'])
)
weather_cleaned.head()

Unnamed: 0,Date,Tmax,Tmin,Tavg,DewPoint,WetBulb,PrecipTotal,relative_humidity,Trange,Week,precip_7D_avg
0,2007-05-01,83.5,51.0,67.25,51.0,56.5,0.0,56.0,32.5,18.0,0.0
1,2007-05-02,59.5,42.5,51.0,42.0,47.0,0.0,53.5,17.0,18.0,0.0
2,2007-05-03,66.5,47.0,56.75,40.0,49.0,0.0,34.0,19.5,18.0,0.0
3,2007-05-04,72.0,50.0,61.0,41.5,50.0,0.0,34.5,22.0,18.0,0.0
4,2007-05-05,66.0,53.5,59.75,38.5,49.5,0.0,24.5,12.5,18.0,0.0


In [28]:
# Dimensions of the weather table after averaging the stations and dropping
# the calendar columns.
weather_cleaned.shape

(1472, 11)

In [29]:
# Attach the daily weather readings to every training row.
# - how='left' keeps every training row, even if no weather row matches its date.
# - validate='many_to_one' makes pandas raise a MergeError if a Date appears
#   more than once in the weather table, instead of silently duplicating
#   training rows.
train_merge = pd.merge(
    train_cleaned,
    weather_cleaned,
    how='left',
    left_on='date',
    right_on='Date',
    validate='many_to_one',
)
# The row count should equal train_cleaned's row count.
train_merge.shape

(9693, 177)

In [30]:
# The two date columns were only needed as join keys; remove both before modelling.
train_merge = train_merge.drop(columns=['date', 'Date'])
train_merge.head()

Unnamed: 0,latitude,longitude,nummosquitos,wnvpresent,month,year,day_of_week,day,species_CULEX ERRATICUS,species_CULEX PIPIENS,species_CULEX PIPIENS/RESTUANS,species_CULEX RESTUANS,species_CULEX SALINARIUS,species_CULEX TARSALIS,species_CULEX TERRITANS,species_UNSPECIFIED CULEX,trap_T001,trap_T002,trap_T002A,trap_T002B,trap_T003,trap_T004,trap_T005,trap_T006,trap_T007,trap_T008,trap_T009,trap_T011,trap_T012,trap_T013,trap_T014,trap_T015,trap_T016,trap_T017,trap_T018,trap_T019,trap_T025,trap_T027,trap_T028,trap_T030,trap_T031,trap_T033,trap_T034,trap_T035,trap_T036,trap_T037,trap_T039,trap_T040,trap_T043,trap_T044,trap_T045,trap_T046,trap_T047,trap_T048,trap_T049,trap_T050,trap_T051,trap_T054,trap_T054C,trap_T060,trap_T061,trap_T062,trap_T063,trap_T065,trap_T065A,trap_T066,trap_T067,trap_T069,trap_T070,trap_T071,trap_T072,trap_T073,trap_T074,trap_T075,trap_T076,trap_T077,trap_T078,trap_T079,trap_T080,trap_T081,trap_T082,trap_T083,trap_T084,trap_T085,trap_T086,trap_T088,trap_T089,trap_T090,trap_T090A,trap_T090B,trap_T090C,trap_T091,trap_T092,trap_T094,trap_T094B,trap_T095,trap_T096,trap_T097,trap_T099,trap_T100,trap_T102,trap_T103,trap_T107,trap_T114,trap_T115,trap_T128,trap_T128A,trap_T129,trap_T135,trap_T138,trap_T141,trap_T142,trap_T143,trap_T144,trap_T145,trap_T146,trap_T147,trap_T148,trap_T149,trap_T150,trap_T151,trap_T152,trap_T153,trap_T154,trap_T155,trap_T156,trap_T157,trap_T158,trap_T159,trap_T160,trap_T161,trap_T162,trap_T200,trap_T200A,trap_T200B,trap_T206,trap_T209,trap_T212,trap_T215,trap_T218,trap_T218A,trap_T218B,trap_T218C,trap_T219,trap_T220,trap_T221,trap_T222,trap_T223,trap_T224,trap_T225,trap_T226,trap_T227,trap_T228,trap_T229,trap_T230,trap_T231,trap_T232,trap_T233,trap_T234,trap_T235,trap_T236,trap_T237,trap_T238,trap_T900,trap_T903,Tmax,Tmin,Tavg,DewPoint,WetBulb,PrecipTotal,relative_humidity,Trange,Week,precip_7D_avg
0,41.95469,-87.800991,1.0,0.0,5,2007,1,29,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,88.0,62.5,75.25,58.5,65.5,0.0,63.5,25.5,22.0,0.108571
1,41.95469,-87.800991,1.0,0.0,5,2007,1,29,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,88.0,62.5,75.25,58.5,65.5,0.0,63.5,25.5,22.0,0.108571
2,41.994991,-87.769279,1.0,0.0,5,2007,1,29,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,88.0,62.5,75.25,58.5,65.5,0.0,63.5,25.5,22.0,0.108571
3,41.974089,-87.824812,1.0,0.0,5,2007,1,29,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,88.0,62.5,75.25,58.5,65.5,0.0,63.5,25.5,22.0,0.108571
4,41.974089,-87.824812,4.0,0.0,5,2007,1,29,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,88.0,62.5,75.25,58.5,65.5,0.0,63.5,25.5,22.0,0.108571


In [32]:
# Separate the label from the predictors: every column except the target is a feature.
target = 'wnvpresent'
X = train_merge.drop(columns=target)
features = X.columns  # Index of predictor column names
# Cast the label to integer class ids.
y = train_merge[target].astype(int)

In [37]:
# Hold out a test set with scikit-learn's default split size.
# - stratify=y keeps the class proportions of the target the same in both parts.
# - random_state fixes the shuffle so the same rows land in train/test on every
#   run; without it each execution yields a different split and the downstream
#   metrics are not reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [38]:
# Dimensions of the training features.
X_train.shape

(7269, 174)

In [39]:
# Dimensions of the held-out test features; the column count should match X_train.
X_test.shape

(2424, 174)

In [40]:
# Number of training labels; should equal the number of rows in X_train.
y_train.shape

(7269,)

In [41]:
# Number of test labels; should equal the number of rows in X_test.
y_test.shape

(2424,)