### Header

In [1]:
# import libraries

# maths
import numpy as np
import pandas as pd
import scipy.stats as stats
from pandas.api.types import is_numeric_dtype

# visual
#from matplotlib_venn import venn2
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# modelling
from sklearn import linear_model
from sklearn.linear_model import Ridge,LinearRegression,LogisticRegression
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.metrics import r2_score,mean_squared_error,confusion_matrix,accuracy_score
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.utils import resample

# nlp
from sklearn.feature_extraction.text import CountVectorizer,HashingVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import spacy
from spacy.tokens import Doc

# web
import requests
import json

# others
import os
import re
import time
import datetime as datetime

In [2]:
# file paths

input_path = '../data/2_input/'
clean_path = '../data/3_clean/'
output_path = '../data/4_output/'

image_path = '../images/'

### Functions

### Import Data

In [3]:
# import clean data

df_train = pd.read_csv(clean_path + 'train_clean.csv')
df_test = pd.read_csv(clean_path + 'test_clean.csv')
df_weather = pd.read_csv(clean_path + 'weather_clean.csv')
#df_spray = pd.read_csv(clean_path + 'spray_clean.csv')

### Preprocessing

In [5]:
df_train.shape

(8475, 11)

In [6]:
df_test.shape

(116293, 10)

In [7]:
test = df_test.drop('id', axis=1)
train = df_train.drop(['nummosquitos', 'wnvpresent'], axis=1)
combined_train_test = pd.concat([test,train])
combined_train_test.shape

(124768, 9)

In [8]:
all_dataset = combined_train_test.merge(df_weather, how='left', on=['year','month','day'])
all_dataset.shape

(249536, 26)

In [13]:
df = all_dataset[all_dataset['station'] == 1]
df.shape

(124768, 26)

In [22]:
df_get_dum = pd.concat([df, pd.get_dummies(df[['species', 'street', 'trap']],drop_first=True)], axis=1)
df_get_dum.drop(['species', 'street', 'trap'], inplace =True, axis=1)
df_get_dum

Unnamed: 0,block,year,month,day,latitude,longitude,station,tmax,tmin,tavg,...,trap_T231,trap_T232,trap_T233,trap_T234,trap_T235,trap_T236,trap_T237,trap_T238,trap_T900,trap_T903
0,41,2008,6,11,41.954690,-87.800991,1,86,61,74,...,0,0,0,0,0,0,0,0,0,0
2,41,2008,6,11,41.954690,-87.800991,1,86,61,74,...,0,0,0,0,0,0,0,0,0,0
4,41,2008,6,11,41.954690,-87.800991,1,86,61,74,...,0,0,0,0,0,0,0,0,0,0
6,41,2008,6,11,41.954690,-87.800991,1,86,61,74,...,0,0,0,0,0,0,0,0,0,0
8,41,2008,6,11,41.954690,-87.800991,1,86,61,74,...,0,0,0,0,0,0,0,0,0,0
10,41,2008,6,11,41.954690,-87.800991,1,86,61,74,...,0,0,0,0,0,0,0,0,0,0
12,41,2008,6,11,41.954690,-87.800991,1,86,61,74,...,0,0,0,0,0,0,0,0,0,0
14,41,2008,6,11,41.954690,-87.800991,1,86,61,74,...,0,0,0,0,0,0,0,0,0,0
16,62,2008,6,11,41.994991,-87.769279,1,86,61,74,...,0,0,0,0,0,0,0,0,0,0
18,62,2008,6,11,41.994991,-87.769279,1,86,61,74,...,0,0,0,0,0,0,0,0,0,0


In [46]:
train = df_get_dum.loc[:8475]
wnv = pd.DataFrame(df_train['wnvpresent'])
df1 = train.merge(wnv, left_on=train.index, right_on=wnv.index)    
#df1[df1['wnvpresent']==1]


Unnamed: 0,key_0,block,year,month,day,latitude,longitude,station,tmax,tmin,...,trap_T232,trap_T233,trap_T234,trap_T235,trap_T236,trap_T237,trap_T238,trap_T900,trap_T903,wnvpresent
263,526,43,2008,6,11,41.878114,-87.629798,1,86,61,...,0,0,0,0,0,0,0,0,0,1
282,564,15,2008,6,11,41.994469,-87.667736,1,86,61,...,0,0,0,0,0,0,0,0,0,1
344,688,45,2008,6,11,41.963976,-87.691810,1,86,61,...,0,0,0,0,0,0,0,0,0,1
382,764,70,2008,6,11,42.008314,-87.777921,1,86,61,...,0,0,0,0,0,0,0,0,0,1
387,774,61,2008,6,11,41.991429,-87.747113,1,86,61,...,0,0,0,0,0,0,0,0,0,1
405,810,68,2008,6,11,41.921177,-87.795180,1,86,61,...,0,0,0,0,0,0,0,0,0,1
436,872,25,2008,6,11,41.846283,-87.656913,1,86,61,...,0,0,0,0,0,0,0,0,0,1
445,890,10,2008,6,11,41.721474,-87.648064,1,86,61,...,0,0,0,0,0,0,0,0,0,1
457,914,96,2008,6,11,41.719059,-87.675088,1,86,61,...,0,0,0,0,0,0,0,0,0,1
465,930,18,2008,6,11,41.966987,-87.674677,1,86,61,...,0,0,0,0,0,0,0,0,0,1


In [25]:
test = df.loc[8475:]

In [47]:
majority_class = df1[df1['wnvpresent']==0]
minority_class = df1[df1['wnvpresent']==1]

In [49]:
minority_upsampled = resample( minority_class, replace=True, n_samples=majority_class.shape[0], random_state=42)
train_resampled = pd.concat([minority_upsampled,majority_class])
train_resampled.wnvpresent.value_counts()

1    4028
0    4028
Name: wnvpresent, dtype: int64

In [11]:
#train_resampled.head()