### Header

In [1]:
# import libraries

# maths
import numpy as np
import pandas as pd
import scipy.stats as stats
from pandas.api.types import is_numeric_dtype

# visual
#from matplotlib_venn import venn2
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# modelling
from sklearn import linear_model
from sklearn.linear_model import Ridge,LinearRegression,LogisticRegression
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.metrics import r2_score,mean_squared_error,confusion_matrix,accuracy_score, roc_auc_score, roc_curve
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.utils import resample, shuffle
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.cluster import KMeans, DBSCAN

# nlp
from sklearn.feature_extraction.text import CountVectorizer,HashingVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.svm import SVC
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import spacy
from spacy.tokens import Doc

# web
import requests
import json

# others
import os
import re
import time
import datetime as datetime

In [2]:
# file paths

input_path = '../data/2_input/'
clean_path = '../data/3_clean/'
output_path = '../data/4_output/'

image_path = '../images/'

### Functions

### Import Data

In [3]:
# import clean data

df_train = pd.read_csv(clean_path + 'train_clean.csv')
df_test = pd.read_csv(clean_path + 'test_clean.csv')
df_weather = pd.read_csv(clean_path + 'weather_clean.csv')
#df_spray = pd.read_csv(clean_path + 'spray_clean.csv')

### Preprocessing

In [4]:
df_train.shape

(8475, 11)

In [5]:
df_test.shape

(116293, 10)

In [6]:
print(df_train.columns)
print(df_test.columns)

Index(['species', 'block', 'street', 'trap', 'nummosquitos', 'wnvpresent',
       'year', 'month', 'day', 'latitude', 'longitude'],
      dtype='object')
Index(['id', 'species', 'block', 'street', 'trap', 'year', 'month', 'day',
       'latitude', 'longitude'],
      dtype='object')


In [7]:
#Combines test and train datasets for processing
test = df_test.drop('id', axis=1) #id not in train dataset
train = df_train.drop(['nummosquitos', 'wnvpresent'], axis=1) #nummosquitos and wnvpresent not in test dataset
combined_train_test = pd.concat([test,train])
combined_train_test.shape

(124768, 9)

In [8]:
#Using weather data only from Station 1
only_station_1 = df_weather[df_weather['station'] == 1] 

In [9]:
# code from jngyun
# cheekeet: don't know put where

#train = df_get_dum[df_get_dum['year']%2!=0]
#test = df_get_dum.loc[df_get_dum['year']%2==0]

In [10]:
#Combines weather data with train and test dataset
all_dataset = combined_train_test.merge(only_station_1, how='left', on=['year','month','day'])
all_dataset.shape

KeyError: 'year'

In [None]:
#Get dummies for categorical data
df_get_dum = pd.concat([all_dataset, pd.get_dummies(all_dataset[['species', 'street', 'trap']],drop_first=True)], axis=1)
df_get_dum.drop(['species', 'street', 'trap'], inplace =True, axis=1)
df_get_dum.shape

In [None]:
#Splits out train dataset and re-attach wnvpresent column
train = df_get_dum.iloc[:8475]
wnv = pd.DataFrame(df_train['wnvpresent'])
train_with_wnv = train.merge(wnv, left_on=train.index, right_on=wnv.index) 
train_with_wnv.shape

In [None]:
#Splits out test dataset
test = df_get_dum.loc[8475:]
test.shape

In [None]:
#Splits train dataset into majority and minority classes based on wnvpresent column 1, 0
majority_class = train_with_wnv[train_with_wnv['wnvpresent']==0]
minority_class = train_with_wnv[train_with_wnv['wnvpresent']==1]

In [None]:
#Resamples minority class i.e. wnvpresent = 1, with duplicates to increase representation
minority_upsampled = resample( minority_class, replace=True, n_samples=majority_class.shape[0], random_state=42)
train_resampled = pd.concat([minority_upsampled,majority_class])
train_resampled.wnvpresent.value_counts()

In [None]:
train_resampled.shape

In [None]:
#Shuffles dataset
df = shuffle(train_resampled, random_state=42)
df.reset_index(drop=True, inplace=True)
df.head()

In [None]:
X = df.drop(['key_0','wnvpresent'], axis=1)

In [None]:
y= df.wnvpresent

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
estimators = {
    'Lr': LogisticRegression(),
    'Knn': KNeighborsClassifier(n_neighbors=5),
    'Dtree': DecisionTreeClassifier(),
    'Rf': RandomForestClassifier()
}.items()

In [None]:
for k,v in estimators:
    pipe = Pipeline([
        ('sc', StandardScaler()),
        (k,v)])
    model = pipe.fit(X_train,y_train)
    pred = model.predict(X_test)
    print('{} score: {} AUC/ROC: {}'.format(k, model.score(X_train,y_train), roc_auc_score(y_test,pred)))