#  Importing Libraries and Data

In [1]:
import pandas as pd

In [2]:
# Load the weather and mosquito tables, using the first CSV column as the index
weather, mosquito = (
    pd.read_csv(path, index_col=0)
    for path in ('./assets/weather', './assets/mosquito_data')
)

In [3]:
# The date columns are re-parsed to datetime after loading from CSV
for frame, date_col in ((mosquito, 'TEST DATE'), (weather, 'DATE')):
    frame[date_col] = pd.to_datetime(frame[date_col])

# Splitting into Train and Test Data

In [4]:
# Hold-out split for the mosquito data: 2007-2015 is train, 2016-2018 is test.
# Index by test date once, then slice both ranges by year label.
mosquito_by_date = mosquito.set_index('TEST DATE')
mosquito_train = mosquito_by_date.loc['2007':'2015']
mosquito_test = mosquito_by_date.loc['2016':'2018']

In [5]:
# Hold-out split for the weather data. Each range starts one year before the
# matching mosquito range (2006 for train, 2015 for test) so that prior-year
# features can be derived; 2015 therefore appears in both frames.
weather_by_date = weather.set_index('DATE')
# Explicit copies: new columns are assigned to both frames further down, and
# they must be independent of each other and of `weather_by_date`.
weather_train = weather_by_date.loc['2006':'2015'].copy()
weather_test = weather_by_date.loc['2015':'2018'].copy()

# Preparing the Data for Feature Engineering
### Fixing the Pipiens/Restuans Problem
#### Finding the proportions of pipiens/restuans that carry wnv and that are caught

In [6]:
# Distinct species labels present in the training rows
mosquito_train.loc[:, 'SPECIES'].unique()

array(['CULEX PIPIENS/RESTUANS', 'CULEX RESTUANS', 'CULEX PIPIENS',
       'CULEX SALINARIUS', 'CULEX TERRITANS', 'CULEX TARSALIS',
       'UNSPECIFIED CULEX', 'CULEX ERRATICUS'], dtype=object)

In [7]:
# Share of WNV-positive rows (RESULT == 1) between the two unambiguous species,
# computed to help resolve the combined pipiens/restuans label
positive_rows = mosquito_train[mosquito_train['RESULT'] == 1]
restuans_wnv = positive_rows.loc[positive_rows['SPECIES'] == 'CULEX RESTUANS', 'RESULT'].sum()
pipiens_wnv = positive_rows.loc[positive_rows['SPECIES'] == 'CULEX PIPIENS', 'RESULT'].sum()
restuans_wnv_perc = restuans_wnv / (pipiens_wnv + restuans_wnv)
pipiens_wnv_perc = 1 - restuans_wnv_perc
restuans_wnv_perc, pipiens_wnv_perc

(0.39655172413793105, 0.603448275862069)

In [8]:
# Share of caught mosquitoes between the two unambiguous species
is_restuans = mosquito_train['SPECIES'] == 'CULEX RESTUANS'
is_pipiens = mosquito_train['SPECIES'] == 'CULEX PIPIENS'
restuans_mosq = mosquito_train.loc[is_restuans, 'NUMBER OF MOSQUITOES'].sum()
pipiens_mosq = mosquito_train.loc[is_pipiens, 'NUMBER OF MOSQUITOES'].sum()
restuans_mosq_perc = restuans_mosq / (pipiens_mosq + restuans_mosq)
pipiens_mosq_perc = 1 - restuans_mosq_perc
restuans_mosq_perc, pipiens_mosq_perc

(0.513083720412999, 0.486916279587001)

In [9]:
# Creating a column per species holding the number of mosquitoes of that species.
# The SPECIES column is one-hot encoded and each indicator is then scaled by
# the row's catch size.
species_list = mosquito_train['SPECIES'].unique()
mosquito_train = pd.get_dummies(mosquito_train, columns=['SPECIES'])
mosquito_test = pd.get_dummies(mosquito_test, columns=['SPECIES'])
for species in species_list:
    count_col = 'SPECIES_' + species
    mosquito_train[count_col] *= mosquito_train['NUMBER OF MOSQUITOES']
    # A species seen in training may be absent from the hold-out years, in
    # which case the test frame has no such column. Check for that explicitly
    # instead of swallowing every exception.
    if count_col in mosquito_test.columns:
        mosquito_test[count_col] *= mosquito_test['NUMBER OF MOSQUITOES']

In [10]:
# Creating a column with species index
def _add_species_index(frame, species_names):
    """Return `frame` with a flat index and '<species> INDEX' columns added.

    For every species whose 'SPECIES_<species>' count column exists, rows with
    a count of at least 1 get count * RESULT in '<species> INDEX'. Other rows
    are left as NaN. A species with no count column, or no row with a count
    of at least 1, gets no INDEX column.
    """
    frame = frame.reset_index()
    for species in species_names:
        count_col = 'SPECIES_' + species
        if count_col not in frame.columns:
            # Species never observed in this frame; nothing to index.
            continue
        caught = frame[count_col] >= 1
        if caught.any():
            # The count columns come from a one-hot encoding, so at most one
            # species is non-zero per row and each row is written at most once.
            frame.loc[caught, species + ' INDEX'] = (frame[count_col] * frame['RESULT'])[caught]
    return frame


mosquito_train = _add_species_index(mosquito_train, species_list)
mosquito_test = _add_species_index(mosquito_test, species_list)

In [11]:
# Replace every remaining NaN with 0 in both mosquito frames
for frame in (mosquito_train, mosquito_test):
    frame.fillna(0, inplace=True)

In [12]:
# Redistributing the restuans/pipiens mosquitoes into restuans and pipiens
def _split_mixed_pools(frame):
    """Split rows labelled CULEX PIPIENS/RESTUANS between the two species, in place.

    Both the per-species counts and the per-species index columns are split
    using the catch shares `pipiens_mosq_perc` / `restuans_mosq_perc`.
    NOTE(review): `pipiens_wnv_perc` / `restuans_wnv_perc` are computed earlier
    in the notebook but are not used here - confirm the index split is meant
    to use the catch shares.
    """
    mixed = frame['SPECIES_CULEX PIPIENS/RESTUANS'] >= 1
    if not mixed.any():
        return
    # The split produces fractional counts, so make the count columns float
    # explicitly rather than relying on implicit upcasting during assignment.
    for count_col in ('SPECIES_CULEX PIPIENS', 'SPECIES_CULEX RESTUANS'):
        if count_col in frame.columns:
            frame[count_col] = frame[count_col].astype(float)
    caught = frame.loc[mixed, 'NUMBER OF MOSQUITOES']
    result = frame.loc[mixed, 'RESULT']
    frame.loc[mixed, 'SPECIES_CULEX PIPIENS'] = pipiens_mosq_perc * caught
    frame.loc[mixed, 'SPECIES_CULEX RESTUANS'] = restuans_mosq_perc * caught
    frame.loc[mixed, 'CULEX PIPIENS INDEX'] = pipiens_mosq_perc * caught * result
    frame.loc[mixed, 'CULEX RESTUANS INDEX'] = restuans_mosq_perc * caught * result


_split_mixed_pools(mosquito_train)
_split_mixed_pools(mosquito_test)

# Creating Features
### Starting with weather features
#### Creating a degree week

In [13]:
# Degree week: how far the mean TAVG over the trailing 7 rows sits above 71.6 degrees.
# Values at or below the base, and windows that are not yet full, give 0.
# See https://parasitesandvectors.biomedcentral.com/articles/10.1186/1756-3305-3-19 for more info
def _degrees_above_base(weekly_mean):
    if weekly_mean > 71.6:
        return weekly_mean - 71.6
    return 0


for frame in (weather_train, weather_test):
    frame['DW'] = frame['TAVG'].rolling(7).mean().map(_degrees_above_base)

#### Creating a last year precipitation feature

In [14]:
# Calculating the average precipitation of the previous year and using it as a feature.
# The date-indexed weather table is built once instead of once per year.
weather_by_date = weather.set_index('DATE')
for frame, years in ((weather_train, range(2007, 2016)), (weather_test, range(2015, 2019))):
    for year in years:
        # Every row of `year` receives the mean PRCP of the year before it.
        frame.loc[str(year), 'PRCP LAST YEAR'] = weather_by_date.loc[str(year - 1), 'PRCP'].mean()

#### Creating lags for the features

In [27]:
# Creating lags of the degree week: DW_i is DW shifted by i * 7 rows (i = 1..5),
# with the leading gap filled with 0.
# The filled series is assigned back directly; calling fillna(inplace=True) on
# a column selection is chained assignment and does not reliably update the frame.
for frame in (weather_train, weather_test):
    for i in range(1, 6):
        frame['DW_' + str(i)] = frame['DW'].shift(i * 7).fillna(0)

In [28]:
# Creating lags of rolling week precipitation: PRCP_i is the 7-row rolling mean
# of PRCP shifted by i * 7 rows (i = 1..5), with remaining gaps filled with 0.
# The filled series is assigned back directly; calling fillna(inplace=True) on
# a column selection is chained assignment and does not reliably update the frame.
for frame in (weather_train, weather_test):
    for i in range(1, 6):
        lagged_prcp = frame['PRCP'].shift(i * 7).rolling(7, min_periods=1).mean()
        frame['PRCP_' + str(i)] = lagged_prcp.fillna(0)

### Moving on to mosquito features
#### Creating a vector index

In [15]:
# Vector index (VI): row-wise sum of the per-species index columns
# See https://parasitesandvectors.biomedcentral.com/articles/10.1186/1756-3305-3-19 for more info
vector_list = [name + ' INDEX' for name in (
    'CULEX RESTUANS', 'CULEX PIPIENS', 'CULEX SALINARIUS', 'CULEX TERRITANS',
    'CULEX TARSALIS', 'UNSPECIFIED CULEX', 'CULEX ERRATICUS')]
mosquito_train['VI'] = mosquito_train[vector_list].sum(axis=1)
# The test frame is summed over the same columns minus 'UNSPECIFIED CULEX INDEX'
vector_list = [col for col in vector_list if col != 'UNSPECIFIED CULEX INDEX']
mosquito_test['VI'] = mosquito_test[vector_list].sum(axis=1)

#### Cleaning some data that was missed

In [16]:
# Report traps recorded at more than one longitude or, failing that, more than one latitude
checks = (
    ('train long', 'train lat', mosquito_train),
    ('test long', 'test lat', mosquito_test),
)
for long_label, lat_label, frame in checks:
    for trap in frame['TRAP'].unique():
        trap_rows = frame[frame['TRAP'] == trap]
        if trap_rows['LONGITUDE'].nunique(dropna=False) > 1:
            print(long_label, trap)
        elif trap_rows['LATITUDE'].nunique(dropna=False) > 1:
            print(lat_label, trap)

train long T035


In [17]:
# The distinct longitudes recorded for trap T035
mosquito_train.loc[mosquito_train['TRAP'] == 'T035', 'LONGITUDE'].unique()

array([-87.67772   , -87.63787261])

In [18]:
# Checking if the dates overlap between the two T035 locations.
# Longitudes are matched within a small tolerance: the literals below are
# hand-typed and may not equal the stored floats bit-for-bit. An exact `==`
# could then select no rows and make the check pass without testing anything.
t035_rows = mosquito_train[mosquito_train['TRAP'] == 'T035']
first_longitude = -87.677720
second_longitude = -87.637872614
tolerance = 1e-6
first_dates = t035_rows.loc[(t035_rows['LONGITUDE'] - first_longitude).abs() < tolerance, 'TEST DATE']
second_dates = t035_rows.loc[(t035_rows['LONGITUDE'] - second_longitude).abs() < tolerance, 'TEST DATE']
# Print every test date on which the trap was recorded at both locations
for date in first_dates[first_dates.isin(second_dates)].unique():
    print(date)

In [23]:
# Collapse to one row per test date, season year and coordinate pair by summing the other columns
location_day_keys = ['TEST DATE', 'SEASON YEAR', 'LONGITUDE', 'LATITUDE']
mosquito_train = mosquito_train.groupby(location_day_keys, as_index=False).sum()
mosquito_test = mosquito_test.groupby(location_day_keys, as_index=False).sum()

#### Creating vector distance

In [24]:
# Creating vector distances (VD) using inverse distances
def _vector_distance(frame, label):
    """Return the VD value for every row of `frame`, as a Series on its index.

    For each row, VD is the sum over the other rows sharing its TEST DATE of
    (1 / distance) * VI of that other row. Distance is the straight-line
    distance between the two LONGITUDE/LATITUDE pairs. Pairs at zero distance
    contribute nothing and are printed as `label, row, other_row` so they can
    be inspected.

    Rows are handled one test date at a time, so each row is only compared
    with rows of the same date and the full frame is never re-filtered per row.
    """
    vd = pd.Series(0.0, index=frame.index)
    for _, same_day in frame.groupby('TEST DATE', sort=False):
        rows = same_day.index
        longs = same_day['LONGITUDE'].values
        lats = same_day['LATITUDE'].values
        vis = same_day['VI'].values
        for i, row in enumerate(rows):
            total = 0.0
            for j, row_2 in enumerate(rows):
                if i == j:
                    continue
                # Inverse distance to the other row, weighted by its VI
                distance = ((longs[i] - longs[j]) ** 2 + (lats[i] - lats[j]) ** 2) ** .5
                if distance != 0:
                    total += (1.0 / distance) * vis[j]
                else:
                    print(label, row, row_2)
            vd.loc[row] = total
    return vd


mosquito_train['VD'] = _vector_distance(mosquito_train, 'train')
mosquito_test['VD'] = _vector_distance(mosquito_test, 'test')

#### Creating lags for the mosquito data

In [42]:
# Creating a lag for the last trap data for the VI.
# VI_1 is the VI of the previous row at the same LONGITUDE/LATITUDE; the first
# row of each location has no predecessor and is left as NaN.
# A grouped shift does this in one pass instead of re-filtering the frame and
# re-shifting the group once per row.
location_keys = ['LONGITUDE', 'LATITUDE']
mosquito_train['VI_1'] = mosquito_train.groupby(location_keys)['VI'].shift(1)
mosquito_test['VI_1'] = mosquito_test.groupby(location_keys)['VI'].shift(1)

In [61]:
# If there is no lag data for a trap, use an earlier per-date average vector index, otherwise fill with a 0.
# NOTE(review): shift(7) moves the per-date averages back by 7 test dates
# (rows of the grouped result), not by 7 calendar days - confirm this is the
# intended "prior week" lookup.
for frame in (mosquito_train, mosquito_test):
    # Computed once per frame. Only VI is averaged, so other columns
    # (including non-numeric ones) are never passed to mean().
    lagged_date_mean = frame.groupby('TEST DATE')['VI'].mean().shift(7)
    missing = frame['VI_1'].isna()
    frame.loc[missing, 'VI_1'] = frame.loc[missing, 'TEST DATE'].map(lagged_date_mean)
    # Assign the filled series back; fillna(inplace=True) on a column
    # selection is chained assignment and does not reliably update the frame.
    frame['VI_1'] = frame['VI_1'].fillna(0)

In [65]:
# Creating a lag for the last trap data for the VD.
# VD_1 is the VD of the previous row at the same LONGITUDE/LATITUDE; the first
# row of each location has no predecessor and is left as NaN.
# A grouped shift does this in one pass instead of re-filtering the frame and
# re-shifting the group once per row.
location_keys = ['LONGITUDE', 'LATITUDE']
mosquito_train['VD_1'] = mosquito_train.groupby(location_keys)['VD'].shift(1)
mosquito_test['VD_1'] = mosquito_test.groupby(location_keys)['VD'].shift(1)

In [66]:
# If there is no lag data for a trap, use an earlier per-date average vector distance, otherwise fill with a 0.
# NOTE(review): shift(7) moves the per-date averages back by 7 test dates
# (rows of the grouped result), not by 7 calendar days - confirm this is the
# intended "prior week" lookup.
for frame in (mosquito_train, mosquito_test):
    # Computed once per frame. Only VD is averaged, so other columns
    # (including non-numeric ones) are never passed to mean().
    lagged_date_mean = frame.groupby('TEST DATE')['VD'].mean().shift(7)
    missing = frame['VD_1'].isna()
    frame.loc[missing, 'VD_1'] = frame.loc[missing, 'TEST DATE'].map(lagged_date_mean)
    # Assign the filled series back; fillna(inplace=True) on a column
    # selection is chained assignment and does not reliably update the frame.
    frame['VD_1'] = frame['VD_1'].fillna(0)

# Exporting Data

In [69]:
# Write the engineered train/test frames to ./assets.
# NOTE: these writes are live (not commented out), so running this cell
# replaces any existing files at these paths.
exports = (
    ('./assets/weather_train', weather_train),
    ('./assets/weather_test', weather_test),
    ('./assets/mosquito_train', mosquito_train),
    ('./assets/mosquito_test', mosquito_test),
)
for path, frame in exports:
    frame.to_csv(path)