### Header

In [1]:
# import libraries

# maths
import numpy as np
import pandas as pd
import scipy.stats as stats
from pandas.api.types import is_numeric_dtype

# visual
#from matplotlib_venn import venn2
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# modelling
from sklearn import linear_model
from sklearn.linear_model import Ridge,LinearRegression,LogisticRegression
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.metrics import r2_score,mean_squared_error,confusion_matrix,accuracy_score
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.utils import resample

# nlp
from sklearn.feature_extraction.text import CountVectorizer,HashingVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import spacy
from spacy.tokens import Doc

# web
import requests
import json

# others
import os
import re
import time
import datetime as datetime

In [2]:
# Relative directory locations used throughout the notebook.
# Data folders are grouped together; figures live in their own folder.

input_path, clean_path, output_path = (
    '../data/2_input/',
    '../data/3_clean/',
    '../data/4_output/',
)
image_path = '../images/'

### Functions

### Import Data

In [3]:
# Read the train, test and weather tables from the clean-data directory
# (`clean_path`). Each file is parsed with pandas' default CSV settings.

df_train, df_test, df_weather = (
    pd.read_csv(clean_path + file_name)
    for file_name in ('train_clean.csv', 'test_clean.csv', 'weather_clean.csv')
)
# The spray table is currently not loaded; its read is left disabled below.
#df_spray = pd.read_csv(clean_path + 'spray_clean.csv')

In [4]:
# Split the training rows by the `wnvpresent` label:
# label 0 rows go to `majority_class`, label 1 rows to `minority_class`.
majority_class = df_train.loc[df_train['wnvpresent'].eq(0)]
minority_class = df_train.loc[df_train['wnvpresent'].eq(1)]

### Preprocessing

In [7]:
# Sanity check: (rows, columns) of the training frame read from 'train_clean.csv'.
df_train.shape

(10506, 11)

In [14]:
# Sanity check: (rows, columns) of the test frame read from 'test_clean.csv'.
df_test.shape

(116293, 10)

In [15]:
# Reduce both frames to the columns they have in common, then stack them:
#   - the test frame loses its 'id' column,
#   - the training frame loses 'nummosquitos' and the 'wnvpresent' label.
# Test rows are placed first, training rows second. The concat keeps each
# frame's original index labels (no re-indexing is requested).
test = df_test.drop(columns='id')
train = df_train.drop(columns=['nummosquitos', 'wnvpresent'])
combined_train_test = pd.concat([test, train])
combined_train_test.shape

(126799, 9)

In [20]:
# Attach the weather columns to every trap record, matching on calendar date
# (year, month, day). A left join keeps every row of `combined_train_test`
# even when no weather row shares its date.
# NOTE(review): the tail recorded under this cell ends at index 253597, i.e.
# 253598 rows, twice the 126799 rows of `combined_train_test`; adjacent rows
# repeat the same trap record once with station 1 and once with station 2.
# Confirm that one row per weather station is intended; if not, reduce
# `df_weather` to one row per date before merging.
all_dataset = pd.merge(
    combined_train_test,
    df_weather,
    how='left',
    on=['year', 'month', 'day'],
)
all_dataset.tail()

Unnamed: 0,species,block,street,trap,year,month,day,latitude,longitude,station,...,codesum,depth,water1,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed
253593,CULEX PIPIENS/RESTUANS,17,N ASHLAND AVE,T232,2013,9,26,41.912563,-87.668055,2,...,,,,,0.0,29.4,30.04,4.1,9,4.6
253594,CULEX PIPIENS/RESTUANS,71,N HARLEM AVE,T233,2013,9,26,42.009876,-87.807277,1,...,BR,0.0,,0.0,0.0,29.34,30.04,3.8,8,4.2
253595,CULEX PIPIENS/RESTUANS,71,N HARLEM AVE,T233,2013,9,26,42.009876,-87.807277,2,...,,,,,0.0,29.4,30.04,4.1,9,4.6
253596,CULEX PIPIENS/RESTUANS,42,W 65TH ST,T235,2013,9,26,41.776428,-87.627096,1,...,BR,0.0,,0.0,0.0,29.34,30.04,3.8,8,4.2
253597,CULEX PIPIENS/RESTUANS,42,W 65TH ST,T235,2013,9,26,41.776428,-87.627096,2,...,,,,,0.0,29.4,30.04,4.1,9,4.6


In [5]:
# Balance the two classes: draw minority rows with replacement until there are
# as many of them as majority rows (fixed seed so the draw is repeatable),
# then stack the upsampled minority rows on top of the majority rows.
minority_upsampled = resample(
    minority_class,
    replace=True,
    n_samples=len(majority_class),
    random_state=42,
)
train_resampled = pd.concat([minority_upsampled, majority_class])
# Show the per-class row counts of the balanced frame.
train_resampled['wnvpresent'].value_counts()

1    9955
0    9955
Name: wnvpresent, dtype: int64

In [6]:
# Preview the first rows of the balanced frame. The upsampled minority rows
# were concatenated first, so they appear at the top.
train_resampled.head()

Unnamed: 0,species,block,street,trap,nummosquitos,wnvpresent,year,month,day,latitude,longitude
2181,CULEX PIPIENS,38,E 115TH ST,T215,30,1,2007,8,17,41.686398,-87.531635
9852,CULEX PIPIENS,82,S KOSTNER AVE,T225,21,1,2013,8,22,41.743402,-87.731435
7198,CULEX RESTUANS,10,W OHARE,T903,7,1,2011,8,5,41.957799,-87.930995
2192,CULEX PIPIENS,40,E 130TH ST,T221,50,1,2007,8,17,41.659112,-87.538693
1900,CULEX PIPIENS,46,N MILWAUKEE AVE,T003,49,1,2007,8,15,41.964242,-87.757639
