### Header

In [None]:
# import libraries

# maths
import numpy as np
import pandas as pd
#import scipy.stats as stats
#from pandas.api.types import is_numeric_dtype

# visual
#from matplotlib_venn import venn2
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# modelling
from sklearn.cluster import KMeans, DBSCAN
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression,LogisticRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import r2_score,mean_squared_error,confusion_matrix,accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample, shuffle

# nlp
#from sklearn.feature_extraction.text import CountVectorizer,HashingVectorizer,TfidfVectorizer
#from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
#from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
#from sklearn.svm import SVC
#from nltk.stem import PorterStemmer,WordNetLemmatizer
#from nltk.tokenize import sent_tokenize, word_tokenize
#from nltk.corpus import stopwords
#from nltk.sentiment.vader import SentimentIntensityAnalyzer
#import spacy
#from spacy.tokens import Doc

# web
#import requests
#import json

# others
#import os
#import re
#import time
#import datetime as datetime

In [None]:
# file paths

# Data directories, given relative to the notebook's working directory.
input_path = '../data/2_input/'
# clean_path is the folder the cleaned CSV files are read from below.
clean_path = '../data/3_clean/'
output_path = '../data/4_output/'

# Folder for images.
image_path = '../images/'

### Functions

### Import Data

In [None]:
# Load the cleaned train, test and weather tables from the clean-data folder.

df_train, df_test, df_weather = (
    pd.read_csv(clean_path + csv_name)
    for csv_name in ('train_clean.csv', 'test_clean.csv', 'weather_clean.csv')
)
# The spray table is left disabled in this notebook.
#df_spray = pd.read_csv(clean_path + 'spray_clean.csv')

### Preprocessing

In [None]:
# (rows, columns) of the cleaned training table.
df_train.shape

In [None]:
# Preview the first rows of the cleaned training table.
df_train.head()

In [None]:
# (rows, columns) of the cleaned test table.
df_test.shape

In [None]:
# Preview the first rows of the cleaned test table.
df_test.head()

In [None]:
# Print the column index of the train table, then of the test table,
# so the two schemas can be compared before they are combined.
for frame in (df_train, df_test):
    print(frame.columns)

In [None]:
# Stack train and test rows into one frame so both go through the same processing.
# Columns that exist in only one of the two tables are removed first:
#   - 'id' is not in the train dataset
#   - 'nummosquitos' and 'wnvpresent' are not in the test dataset
test = df_test.drop(columns='id')
train = df_train.drop(columns=['nummosquitos', 'wnvpresent'])
combined_train_test = pd.concat([train, test], sort=False)
combined_train_test.shape

In [None]:
# Preview only the first rows; the full shape is shown in the cell above.
combined_train_test.head()

In [None]:
# Keep only the weather rows whose 'station' value is 1.
from_station_1 = df_weather['station'] == 1
only_station_1 = df_weather.loc[from_station_1]

In [None]:
# Attach the station-1 weather readings to every train/test row by calendar date.
# validate='many_to_one' makes the merge raise if the weather table holds more
# than one row for a (year, month, day) key. Duplicated dates would silently
# duplicate trap rows, and the cells below rely on the merged row order and
# count to re-attach the training labels by index.
all_dataset = combined_train_test.merge(
    only_station_1,
    how='left',
    on=['year', 'month', 'day'],
    validate='many_to_one',
)
all_dataset.shape

In [None]:
# Preview only the first rows; the full shape is shown in the cell above.
all_dataset.head()

In [None]:
# One-hot encode the categorical columns (first level of each dropped) and
# swap the encoded columns in for the originals.
categorical_cols = ['species', 'street', 'trap']
dummies = pd.get_dummies(all_dataset[categorical_cols], drop_first=True)
df_get_dum = pd.concat([all_dataset, dummies], axis=1).drop(columns=categorical_cols)
df_get_dum.shape

In [None]:
# Preview only the first rows; the full shape is shown in the cell above.
df_get_dum.head()

In [None]:
# Recover the training rows (odd years) from the processed frame and put the
# two target columns back.
# NOTE(review): labels are matched purely by index value, which assumes the
# training rows sit at the same index positions in df_get_dum as in df_train
# - confirm that the earlier concat/merge preserved row order and count.
is_train_year = df_get_dum['year'] % 2 != 0
train = df_get_dum[is_train_year]
wnv = df_train[['wnvpresent']]
# The join keys are passed as index arrays, so pandas adds a 'key_0' column
# holding the key values to the merged result.
train_with_wnv = train.merge(wnv, left_on=train.index, right_on=wnv.index)
train_with_wnv['nummosquitos'] = df_train['nummosquitos']
train_with_wnv.shape

In [None]:
# Preview only the first rows; the full shape is shown in the cell above.
train_with_wnv.head()

In [None]:
#Splits out test dataset (even years)
# .copy() makes `test` an independent frame: a 'nummosquitos' column is
# assigned to it further down, and writing into a boolean-indexed slice of
# df_get_dum would raise SettingWithCopyWarning.
test = df_get_dum.loc[df_get_dum['year'] % 2 == 0].copy()
test.shape

In [None]:
# Preview only the first rows; the full shape is shown in the cell above.
test.head()

In [None]:
# Separate the training rows by label: wnvpresent == 0 is treated as the
# majority class and wnvpresent == 1 as the minority class.
wnv_label = train_with_wnv['wnvpresent']
majority_class = train_with_wnv.loc[wnv_label == 0]
minority_class = train_with_wnv.loc[wnv_label == 1]

In [None]:
# Oversample the wnvpresent == 1 rows (sampling with replacement, fixed seed)
# until there are as many of them as wnvpresent == 0 rows, then stack both classes.
# NOTE(review): the duplication happens before the later train_test_split, so
# copies of the same positive row can land in both the fit and the evaluation
# split - evaluation scores on this data are likely optimistic.
n_majority = len(majority_class)
minority_upsampled = resample(
    minority_class,
    replace=True,
    n_samples=n_majority,
    random_state=42,
)
train_resampled = pd.concat([minority_upsampled, majority_class])
train_resampled['wnvpresent'].value_counts()

In [None]:
# (rows, columns) of the class-balanced training frame.
train_resampled.shape

In [None]:
# Shuffle the balanced rows (fixed seed) and renumber the index from 0.
df = shuffle(train_resampled, random_state=42).reset_index(drop=True)
df.head()

In [None]:
# Number of columns whose name contains neither 'street' nor 'trap'.
count = sum(
    1
    for col in df.columns
    if 'street' not in col and 'trap' not in col
)
count

In [None]:
# Feature matrix: the first 31 columns of the shuffled training frame.
# NOTE(review): 31 is a positional cut-off - confirm it still matches the
# intended feature set whenever upstream columns change.
# 'key_0' is an artefact of the index-keyed merge that re-attached the labels:
# it only holds row numbers and does not exist in `test`, so leaving it in
# makes the later `test[features]` lookup fail with a KeyError.
# errors='ignore' keeps this cell valid if the column is absent.
X = df.iloc[:, :31].drop(columns=['key_0'], errors='ignore')
X.columns

In [None]:
# (disabled) optional removal of extra columns from X:
#X.drop(columns=['key_0', 'block', 'station'], inplace=True)

# Regression target: the mosquito count. Hold out 33% of the rows for evaluation.
y_mos_count = df['nummosquitos']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_mos_count,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Names of the feature columns, as a plain list.
features = X.columns.tolist()
features

In [None]:
# Candidate regressors for predicting the mosquito count, keyed by the name
# each one gets as a pipeline step.
estimators = {
    'linreg': LinearRegression(),
    'ridge': Ridge(),
    'lasso': Lasso(),
    'en': ElasticNet(),
    # Seeded so the grid-search scores are reproducible between runs.
    'rfr': RandomForestRegressor(random_state=42)
}.items()

# Hyper-parameter grids, one per estimator. Parameter names carry the
# '<step name>__' prefix because each estimator is tuned inside a Pipeline.
params = {
    'linreg': {
        # Single-value grid: nothing is tuned, it only lets linreg share the same loop.
        'linreg__n_jobs': [1]
    },
    'ridge': {
        'ridge__alpha': np.logspace(-2, 3, 200)
    },
    'lasso': {
        'lasso__alpha': np.logspace(-2, 3, 200)
    },
    'en': {
        'en__l1_ratio': np.linspace(0.01, 1.0, 5),
        'en__alpha': np.logspace(-2, 3, 200)
    },
    'rfr': {
        'rfr__n_estimators': [10, 20, 30]
    }
}

In [None]:
# Tune every candidate regressor with a grid search over a scale-then-fit
# pipeline, then score the best configuration on the held-out split.
# The three lists collect one entry per estimator for the summary table.
models = []
parameters = []
test_score = []

for name, estimator in estimators:
    pipe = Pipeline(steps=[('sc', StandardScaler()), (name, estimator)])

    gridsearch = GridSearchCV(
        estimator=pipe,
        param_grid=params[name],
        verbose=1,
        n_jobs=3,
    )
    gridsearch.fit(X_train, y_train)

    # Best pipeline found by the search, evaluated on data it was not fitted on.
    model = gridsearch.best_estimator_
    best_params = gridsearch.best_params_
    score = model.score(X_test, y_test)

    # Report the outcome for this estimator.
    print("Model: ", name)
    print("Best parameters:", best_params)
    print("Best R2 score:", gridsearch.best_score_)
    print("Test R2 score:", score)

    # Keep the results for the summary table.
    models.append(name)
    parameters.append(best_params)
    test_score.append(score)

In [None]:
# Summary table: one row per estimator with its best parameters and test score.
results_summary = {
    'model': models,
    'parameters': parameters,
    'test_score': test_score,
}
pd.DataFrame(results_summary)

In [None]:
# (rows, columns) of the processed test frame.
test.shape

In [None]:
# Build the Kaggle feature matrix here, before inspecting it, so this cell
# also runs on a fresh kernel (it previously read X_kaggle before any cell
# had defined it). Same columns as the training feature matrix.
X_kaggle = test[features]
X_kaggle.shape

In [None]:
# Random Forest Regressor has the highest score in predicting the number of mosquitos.

# Seeded so the fitted forest, and therefore the predicted counts, are
# reproducible between runs.
model = RandomForestRegressor(n_estimators=20, random_state=42).fit(X_train, y_train)

features = list(X.columns)
X_kaggle = test[features]

# assign() returns a new frame instead of writing into `test`, which may be a
# slice of df_get_dum (in-place column assignment on it raises
# SettingWithCopyWarning).
test = test.assign(nummosquitos=model.predict(X_kaggle))
test.head()

In [None]:
# Convert each predicted count to an integer (int() truncates toward zero).
test['nummosquitos'] = test['nummosquitos'].map(int)
test.head()

In [None]:
# Classification target: the wnvpresent label.
y = df['wnvpresent']

In [None]:
# Re-split the same feature matrix against the classification target,
# holding out 33% of the rows. This overwrites the regression split variables.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Candidate classifiers for predicting wnvpresent, keyed by the name each one
# gets as a pipeline step.
estimators = {
    'Lr': LogisticRegression(),
    'Knn': KNeighborsClassifier(n_neighbors=5),
    # Tree-based models are seeded so their scores are reproducible between runs.
    'Dtree': DecisionTreeClassifier(random_state=42),
    'Rf': RandomForestClassifier(random_state=42)
}.items()

In [None]:
# Fit each classifier on scaled features and report how it performs.
for k, v in estimators:
    pipe = Pipeline([
        ('sc', StandardScaler()),
        (k, v)])
    model = pipe.fit(X_train, y_train)
    pred = model.predict(X_test)
    # ROC AUC needs a continuous score: use the predicted probability of the
    # positive class (second column) rather than the hard 0/1 predictions,
    # which reduce the ROC curve to a single threshold.
    proba = model.predict_proba(X_test)[:, 1]
    # Training accuracy is labelled as such and shown next to the held-out
    # accuracy, so over-fitting is visible instead of being read as "the score".
    print('{} train accuracy: {:.4f} | test accuracy: {:.4f} | test AUC/ROC: {:.4f}'.format(
        k,
        model.score(X_train, y_train),
        accuracy_score(y_test, pred),
        roc_auc_score(y_test, proba),
    ))