In [106]:
import pandas as pd
import numpy as np
import random
from sklearn.utils import resample, shuffle


In [58]:
#Read Data

df_weather = pd.read_csv('../csv_files/weather_clean.csv')
df_spray = pd.read_csv('../csv_files/spray_clean.csv')
df_train = pd.read_csv('../csv_files/train_clean.csv')
df_test = pd.read_csv('../csv_files/test_clean.csv')
df_train_og = pd.read_csv('../csv_files/train-2.csv')


In [10]:
df_weather.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56.0,0,2,448,1849,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,14,51,57.0,0,3,448,1849,0.0,29.18,29.82,2.7,25,9.6
2,1,2007-05-02,59,42,51,-3,42,47.0,14,0,447,1850,0.0,29.38,30.09,13.0,4,13.4
3,2,2007-05-02,60,43,52,-3,42,47.0,13,0,447,1850,0.0,29.44,30.08,13.3,2,13.4
4,1,2007-05-03,66,46,56,2,40,48.0,9,0,446,1851,0.0,29.39,30.12,11.7,7,11.9


In [11]:
df_spray.head()

Unnamed: 0,Date,Latitude,Longitude,daysfrom0
0,2011-08-29,42.391623,-88.089163,0
1,2011-08-29,42.391348,-88.089163,0
2,2011-08-29,42.391022,-88.089157,0
3,2011-08-29,42.390637,-88.089158,0
4,2011-08-29,42.39041,-88.088858,0


In [73]:
df_train.shape

(10506, 11)

In [71]:
df_test.shape

(116293, 11)

## Data Preparation for modelling

### Preprocessing Train/Test DataFrame

In [None]:
#Drop the ID of test as it is not present in train DataFrame and it is not needed. 

df_test.drop('Id', axis=1, inplace=True)

#Drop the WnvPresent in Train DataFrame as it is the target label.

df_train.drop('WnvPresent', axis = 1, inplace=True)

In [18]:
#Combine train and test DataFrame
df_combined = pd.concat([df_train,df_test])

print('Size of train/test dataset: {}'.format(df_combined.shape))

Size of train/test dataset: (126799, 10)


In [19]:
df_combined

Unnamed: 0,Date,Species,Trap,Latitude,Longitude,year,month,day,day_of_year,week
0,2007-05-29,CULEX PIPIENS/RESTUANS,T002,41.954690,-87.800991,2007,5,29,149,22
1,2007-05-29,CULEX RESTUANS,T002,41.954690,-87.800991,2007,5,29,149,22
2,2007-05-29,CULEX RESTUANS,T007,41.994991,-87.769279,2007,5,29,149,22
3,2007-05-29,CULEX PIPIENS/RESTUANS,T015,41.974089,-87.824812,2007,5,29,149,22
4,2007-05-29,CULEX RESTUANS,T015,41.974089,-87.824812,2007,5,29,149,22
...,...,...,...,...,...,...,...,...,...,...
116288,2014-10-02,CULEX SALINARIUS,T054C,41.925652,-87.633590,2014,10,2,275,40
116289,2014-10-02,CULEX TERRITANS,T054C,41.925652,-87.633590,2014,10,2,275,40
116290,2014-10-02,CULEX TARSALIS,T054C,41.925652,-87.633590,2014,10,2,275,40
116291,2014-10-02,UNSPECIFIED CULEX,T054C,41.925652,-87.633590,2014,10,2,275,40


### Preprocessing Weather DataFrame

In [21]:
#Musking Station 2 as most of the data from Station 2 are actually dupilcated from Station 1. 

df_weather = df_weather.loc[df_weather['Station'] == 1].reset_index(drop=True)
df_weather

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56.0,0,2,448,1849,0.00,29.10,29.82,1.7,27,9.2
1,1,2007-05-02,59,42,51,-3,42,47.0,14,0,447,1850,0.00,29.38,30.09,13.0,4,13.4
2,1,2007-05-03,66,46,56,2,40,48.0,9,0,446,1851,0.00,29.39,30.12,11.7,7,11.9
3,1,2007-05-04,66,49,58,4,41,50.0,7,0,444,1852,0.00,29.31,30.05,10.4,8,10.8
4,1,2007-05-05,66,53,60,5,38,49.0,5,0,443,1853,0.00,29.40,30.10,11.7,7,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1467,1,2014-10-27,77,51,64,16,51,58.0,1,0,618,1653,0.00,28.92,29.66,12.0,19,12.9
1468,1,2014-10-28,68,45,57,10,38,47.0,8,0,619,1651,0.00,29.15,29.85,14.8,26,15.6
1469,1,2014-10-29,49,36,43,-4,32,40.0,22,0,620,1650,0.00,29.36,30.06,9.5,29,9.9
1470,1,2014-10-30,51,32,42,-4,34,40.0,23,0,622,1649,0.00,29.34,30.09,5.1,24,5.5


In [24]:
#Converting Date to DateTime.

df_combined['Date'] = pd.to_datetime(df_combined['Date']) 

df_weather['Date'] = pd.to_datetime(df_weather['Date'])

In [42]:
#Combined df_combined & df_weather

all_dataset = df_combined.merge(df_weather, how='left', on=['Date'])

print('Size of train/test dataset with weather data: {}'.format(all_dataset.shape))

Size of train/test dataset with weather data: (126799, 27)


In [43]:
all_dataset.head()

Unnamed: 0,Date,Species,Trap,Latitude,Longitude,year,month,day,day_of_year,week,...,Heat,Cool,Sunrise,Sunset,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,2007-05-29,CULEX PIPIENS/RESTUANS,T002,41.95469,-87.800991,2007,5,29,149,22,...,0,9,421,1917,0.0,29.39,30.11,5.8,18,6.5
1,2007-05-29,CULEX RESTUANS,T002,41.95469,-87.800991,2007,5,29,149,22,...,0,9,421,1917,0.0,29.39,30.11,5.8,18,6.5
2,2007-05-29,CULEX RESTUANS,T007,41.994991,-87.769279,2007,5,29,149,22,...,0,9,421,1917,0.0,29.39,30.11,5.8,18,6.5
3,2007-05-29,CULEX PIPIENS/RESTUANS,T015,41.974089,-87.824812,2007,5,29,149,22,...,0,9,421,1917,0.0,29.39,30.11,5.8,18,6.5
4,2007-05-29,CULEX RESTUANS,T015,41.974089,-87.824812,2007,5,29,149,22,...,0,9,421,1917,0.0,29.39,30.11,5.8,18,6.5


### Create Dummy variable for categorical variables

In [56]:
all_dataset = pd.get_dummies(all_dataset, columns=['Species','Trap'])

In [57]:
print('Size of train/test dataset with weather data(One Hot Encoded): {}'.format(all_dataset.shape))

Size of train/test dataset with weather data(One Hot Encoded): (126799, 182)


### Splitting up train/test set

In [63]:
#Splits out train dataset using year
train = all_dataset[all_dataset['year']%2!=0]
train.reset_index(inplace=True, drop=True)

#Re-attaching original nummosquitos and wnvpresent columns
wnv = pd.Series(df_train['WnvPresent'])
train_with_wnv = pd.concat([train , wnv], axis=1)
train_with_wnv['NumMosquitos'] = df_train_og['NumMosquitos']

print('Size of processed train data: {}'.format(train_with_wnv.shape))

Size of processed train data: (10506, 184)


In [64]:
train_with_wnv.head()

Unnamed: 0,Date,Latitude,Longitude,year,month,day,day_of_year,week,Station,Tmax,...,Trap_T233,Trap_T234,Trap_T235,Trap_T236,Trap_T237,Trap_T238,Trap_T900,Trap_T903,WnvPresent,NumMosquitos
0,2007-05-29,41.95469,-87.800991,2007,5,29,149,22,1,88,...,0,0,0,0,0,0,0,0,0,1
1,2007-05-29,41.95469,-87.800991,2007,5,29,149,22,1,88,...,0,0,0,0,0,0,0,0,0,1
2,2007-05-29,41.994991,-87.769279,2007,5,29,149,22,1,88,...,0,0,0,0,0,0,0,0,0,1
3,2007-05-29,41.974089,-87.824812,2007,5,29,149,22,1,88,...,0,0,0,0,0,0,0,0,0,1
4,2007-05-29,41.974089,-87.824812,2007,5,29,149,22,1,88,...,0,0,0,0,0,0,0,0,0,4


In [65]:
#Splits out test dataset using year
test = all_dataset.loc[all_dataset['year']%2==0]
print('Size of processed test data: {}'.format(test.shape))

Size of processed test data: (116293, 182)


In [74]:
test.head()

Unnamed: 0,Date,Latitude,Longitude,year,month,day,day_of_year,week,Station,Tmax,...,Trap_T231,Trap_T232,Trap_T233,Trap_T234,Trap_T235,Trap_T236,Trap_T237,Trap_T238,Trap_T900,Trap_T903
10506,2008-06-11,41.95469,-87.800991,2008,6,11,163,24,1,86,...,0,0,0,0,0,0,0,0,0,0
10507,2008-06-11,41.95469,-87.800991,2008,6,11,163,24,1,86,...,0,0,0,0,0,0,0,0,0,0
10508,2008-06-11,41.95469,-87.800991,2008,6,11,163,24,1,86,...,0,0,0,0,0,0,0,0,0,0
10509,2008-06-11,41.95469,-87.800991,2008,6,11,163,24,1,86,...,0,0,0,0,0,0,0,0,0,0
10510,2008-06-11,41.95469,-87.800991,2008,6,11,163,24,1,86,...,0,0,0,0,0,0,0,0,0,0


In [111]:
train_with_wnv.to_csv('../csv_files/eda_data.csv')

In [112]:
test.to_csv('../csv_files/model_test_data.csv')

### Compensating for imbalanced class

Out of the 8475 rows in our training dataset, only 457 (~5%) data points represent the virus present class while 8018 represent virus not present.

In [101]:
#Splits data by presence of wnv
majority_class = train_with_wnv[train_with_wnv['WnvPresent']==0]
minority_class = train_with_wnv[train_with_wnv['WnvPresent']==1]

#Resamples minority class with replacement
minority_upsampled = resample(minority_class, 
                              replace=True, 
                            n_samples=majority_class.shape[0], 
                            random_state=42)

#Combine new minority class dataset with original majority class dataset
train_resampled = pd.concat([minority_upsampled,majority_class])

#Checks class representation
train_resampled.WnvPresent.value_counts()

1    9955
0    9955
Name: WnvPresent, dtype: int64

In [107]:
#Shuffles dataset to inject randomness
df = shuffle(train_resampled, random_state=42)
df.reset_index(drop=True, inplace=True)

# Print resampled, reshuffled new dataset
df.head()

Unnamed: 0,Date,Latitude,Longitude,year,month,day,day_of_year,week,Station,Tmax,...,Trap_T233,Trap_T234,Trap_T235,Trap_T236,Trap_T237,Trap_T238,Trap_T900,Trap_T903,WnvPresent,NumMosquitos
0,2007-08-21,41.944869,-87.832763,2007,8,21,233,34,1,87,...,0,0,0,0,0,0,0,0,1,11
1,2011-07-25,42.008314,-87.777921,2011,7,25,206,30,1,88,...,0,0,0,0,0,0,0,0,0,15
2,2009-07-10,41.662014,-87.724608,2009,7,10,191,28,1,82,...,0,0,0,0,0,0,0,0,0,42
3,2013-08-01,41.728495,-87.600963,2013,8,1,213,31,1,81,...,0,0,0,0,0,0,0,0,1,21
4,2013-09-12,41.923738,-87.785288,2013,9,12,255,37,1,82,...,0,0,0,0,0,0,0,0,1,21


In [110]:
df.to_csv('../csv_files/model_train_data.csv')