In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, 
    recall_score,
    precision_score,
    roc_auc_score,
    make_scorer, 
    roc_curve, 
    precision_recall_curve, 
    auc
)

plt.style.use('fivethirtyeight')

In [2]:
train_cleaned = pd.read_csv('../Assets/train_cleaned.csv')
weather_cleaned = pd.read_csv('../Assets/weather_cleaned.csv')

In [3]:
weather_cleaned.shape

(2944, 15)

In [4]:
train_cleaned.shape

(9693, 166)

In [6]:
# Average the values of station 1 and station 2 
weather_cleaned = weather_cleaned.groupby('Date').mean().drop(columns='Station')

In [7]:
# Drop engineered columns with Year, Month and Day
weather_cleaned.reset_index(inplace=True)
weather_cleaned.drop(columns=['Year','Month','Day'],inplace=True)
weather_cleaned.head()

Unnamed: 0,Date,Tmax,Tmin,Tavg,DewPoint,WetBulb,PrecipTotal,relative_humidity,Trange,Week,precip_7D_avg
0,2007-05-01,83.5,51.0,67.25,51.0,56.5,0.0,56.0,32.5,18.0,0.0
1,2007-05-02,59.5,42.5,51.0,42.0,47.0,0.0,53.5,17.0,18.0,0.0
2,2007-05-03,66.5,47.0,56.75,40.0,49.0,0.0,34.0,19.5,18.0,0.0
3,2007-05-04,72.0,50.0,61.0,41.5,50.0,0.0,34.5,22.0,18.0,0.0
4,2007-05-05,66.0,53.5,59.75,38.5,49.5,0.0,24.5,12.5,18.0,0.0


In [9]:
weather_cleaned.shape

(1472, 11)

In [10]:
train_merge = pd.merge(train_cleaned, weather_cleaned,how='left', left_on='date', right_on='Date')
train_merge.shape

(9693, 177)

In [47]:
train_merge.drop(columns='date',inplace=True)
train_merge.head()

Unnamed: 0,latitude,longitude,nummosquitos,wnvpresent,month,year,day_of_week,day,species_CULEX ERRATICUS,species_CULEX PIPIENS,...,Tmax,Tmin,Tavg,DewPoint,WetBulb,PrecipTotal,relative_humidity,Trange,Week,precip_7D_avg
0,41.95469,-87.800991,1.0,0.0,5,2007,1,29,0,0,...,88.0,62.5,75.25,58.5,65.5,0.0,63.5,25.5,22.0,0.108571
1,41.95469,-87.800991,1.0,0.0,5,2007,1,29,0,0,...,88.0,62.5,75.25,58.5,65.5,0.0,63.5,25.5,22.0,0.108571
2,41.994991,-87.769279,1.0,0.0,5,2007,1,29,0,0,...,88.0,62.5,75.25,58.5,65.5,0.0,63.5,25.5,22.0,0.108571
3,41.974089,-87.824812,1.0,0.0,5,2007,1,29,0,0,...,88.0,62.5,75.25,58.5,65.5,0.0,63.5,25.5,22.0,0.108571
4,41.974089,-87.824812,4.0,0.0,5,2007,1,29,0,0,...,88.0,62.5,75.25,58.5,65.5,0.0,63.5,25.5,22.0,0.108571


In [19]:
target = 'wnvpresent'
features = train_merge.drop(columns=target).columns
X = train_merge[features]
y = train_merge[target]
y = y.astype(int)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y)

In [27]:
X_train.shape

(7269, 176)

In [28]:
X_test.shape

(2424, 176)

In [29]:
y_train.shape

(7269,)

In [30]:
y_test.shape

(2424,)

In [35]:
# Initiating models:
lr = LogisticRegression(solver='liblinear')
rf = RandomForestClassifier()

In [36]:
# Initiating models with pipeline:
lr_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('lr', lr)
])

In [37]:
rf_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('rf', rf)
])

In [41]:
maxfeat = int(np.round(np.sqrt(X_test.shape[1])))
maxdepth = int(np.round(maxfeat / 3))

In [39]:
scorer = roc_auc_score

In [42]:
### Randomized Search
# There are many hyper parameters that 
# could be tuned to increase its perfomance.

# Let's tune parametrs of the classifiers towards higher AUC ROC.

scoring = make_scorer(scorer)

lr_param_distributions = {
    'lr__penalty': ['l1', 'l2'],
    'lr__C': np.logspace(-5, 1, 100),
    'lr__class_weight': [None, 'balanced']
}

rf_param_distributions = {
    'rf__max_depth': list(range(2, maxdepth)),
    'rf__max_features': list(range(1, maxfeat)),
    'rf__min_samples_split': list(range(2, 9)),
    'rf__n_estimators': list(range(200, 501)),
    'rf__class_weight': [None, 'balanced', 'balanced_subsample']
}

In [43]:
# Optimize hyper parameters using randomized search according to one of the scores that is input
lr_rs = RandomizedSearchCV(
    lr_pipe, param_distributions=lr_param_distributions,
    scoring=scoring,
    n_jobs=-1,
    cv=5,
    verbose=1
)


rf_rs = RandomizedSearchCV(
    rf_pipe, param_distributions=rf_param_distributions,
    scoring=scoring,
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [45]:
# Get metrics for 3 different models for one dataset
models = {'Logistic Regression': lr_rs,'Random Forest': rf_rs}
ss = pd.DataFrame()
cols = []
for key, model in models.items():
    cols.append(key)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_score = model.predict_proba(X_test)
    accur = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    aucroc = roc_auc_score(y_test, y_score[:, 1])
    data = {
        'Accuracy': accur,
        'Recall': recall, 
        'Precision': prec,
        'ROC AUC': aucroc,
        'Model': model.best_estimator_ # Included to easily call the model via masking if required
    }
    col = pd.Series(data)
    df = pd.DataFrame(col)
    ss = pd.concat([ss, df], axis=1)
ss.columns = cols

# Print dataframe without model
metrics = [metric for metric in ss.index if metric != 'Model']
ss.loc[metrics, :]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    3.7s finished


ValueError: could not convert string to float: '2007-09-18'