### Header

In [1]:
# import libraries

# maths
import numpy as np
import pandas as pd
import scipy.stats as stats
from pandas.api.types import is_numeric_dtype

# visual
#from matplotlib_venn import venn2
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# modelling
from sklearn import linear_model
from sklearn.linear_model import Ridge,LinearRegression,LogisticRegression
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.metrics import r2_score,mean_squared_error,confusion_matrix,accuracy_score
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.utils import resample

# nlp
from sklearn.feature_extraction.text import CountVectorizer,HashingVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import spacy
from spacy.tokens import Doc

# web
import requests
import json

# others
import os
import re
import time
import datetime as datetime

In [2]:
# file paths

input_path = '../data/2_input/'
clean_path = '../data/3_clean/'
output_path = '../data/4_output/'

image_path = '../images/'

### Functions

### Import Data

In [3]:
# import clean data

df_train = pd.read_csv(clean_path + 'train_clean.csv')
df_test = pd.read_csv(clean_path + 'test_clean.csv')
df_weather = pd.read_csv(clean_path + 'weather_clean.csv')
#df_spray = pd.read_csv(clean_path + 'spray_clean.csv')

### Preprocessing

In [4]:
df_train.shape

(8475, 11)

In [5]:
df_test.shape

(116293, 10)

In [6]:
test = df_test.drop('id', axis=1)
train = df_train.drop(['nummosquitos', 'wnvpresent'], axis=1)
combined_train_test = pd.concat([test,train])
combined_train_test.shape

(124768, 9)

In [7]:
all_dataset = combined_train_test.merge(df_weather, how='left', on=['year','month','day'])
all_dataset.shape

KeyError: 'year'

In [None]:
df = all_dataset[all_dataset['station'] == 1]
df.shape

In [None]:
df_get_dum = pd.concat([df, pd.get_dummies(df[['species', 'street', 'trap']],drop_first=True)], axis=1)
df_get_dum.drop(['species', 'street', 'trap'], inplace =True, axis=1)
df_get_dum

In [None]:
train = df_get_dum.loc[:8475]
wnv = pd.DataFrame(df_train['wnvpresent'])
df1 = train.merge(wnv, left_on=train.index, right_on=wnv.index)    
#df1[df1['wnvpresent']==1]


In [None]:
test = df.loc[8475:]

In [None]:
majority_class = df1[df1['wnvpresent']==0]
minority_class = df1[df1['wnvpresent']==1]

In [None]:
minority_upsampled = resample( minority_class, replace=True, n_samples=majority_class.shape[0], random_state=42)
train_resampled = pd.concat([minority_upsampled,majority_class])
train_resampled.wnvpresent.value_counts()

In [None]:
#train_resampled.head()