In [2]:
import pandas as pd

In [3]:
import numpy as np

In [4]:
import matplotlib.pyplot as plt

In [5]:
from sklearn.experimental import enable_iterative_imputer

In [6]:
from sklearn.impute import IterativeImputer

In [7]:
import math

#### read in data

In [8]:
# Load the raw training features. The file is comma-separated, which is the
# delimiter read_csv uses when none is given.
df_trainf = pd.read_csv('dengue_features_train.csv')

In [9]:
# Display the column labels of the raw training features.
df_trainf.columns

Index(['city', 'year', 'weekofyear', 'week_start_date', 'ndvi_ne', 'ndvi_nw',
       'ndvi_se', 'ndvi_sw', 'precipitation_amt_mm', 'reanalysis_air_temp_k',
       'reanalysis_avg_temp_k', 'reanalysis_dew_point_temp_k',
       'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k',
       'reanalysis_precip_amt_kg_per_m2',
       'reanalysis_relative_humidity_percent', 'reanalysis_sat_precip_amt_mm',
       'reanalysis_specific_humidity_g_per_kg', 'reanalysis_tdtr_k',
       'station_avg_temp_c', 'station_diur_temp_rng_c', 'station_max_temp_c',
       'station_min_temp_c', 'station_precip_mm'],
      dtype='object')

In [10]:
# Count the missing values in each column of the raw training features.
df_trainf.isnull().sum()

city                                       0
year                                       0
weekofyear                                 0
week_start_date                            0
ndvi_ne                                  194
ndvi_nw                                   52
ndvi_se                                   22
ndvi_sw                                   22
precipitation_amt_mm                      13
reanalysis_air_temp_k                     10
reanalysis_avg_temp_k                     10
reanalysis_dew_point_temp_k               10
reanalysis_max_air_temp_k                 10
reanalysis_min_air_temp_k                 10
reanalysis_precip_amt_kg_per_m2           10
reanalysis_relative_humidity_percent      10
reanalysis_sat_precip_amt_mm              13
reanalysis_specific_humidity_g_per_kg     10
reanalysis_tdtr_k                         10
station_avg_temp_c                        43
station_diur_temp_rng_c                   43
station_max_temp_c                        20
station_mi

#### Dropping `city` for the sake of modeling

In [11]:
# Work on an independent copy of the features without the 'city' column,
# leaving df_trainf itself untouched.
dfdropped = df_trainf.drop(columns=['city']).copy()

#### Average temperature conversion (°C ↔ K)

In [12]:
def celsius_to_kelvin(celsius):
    """Convert a temperature from degrees Celsius to kelvin.

    Args:
        celsius: Temperature in degrees Celsius (any value that supports
            adding a float).

    Returns:
        The input shifted up by 273.15.
    """
    kelvin_offset = 273.15
    return celsius + kelvin_offset

In [13]:
def kelvin_to_celsius(kelvin):
    """Convert a temperature from kelvin to degrees Celsius.

    Args:
        kelvin: Temperature in kelvin (any value that supports
            subtracting a float).

    Returns:
        The input shifted down by 273.15.
    """
    kelvin_offset = 273.15
    return kelvin - kelvin_offset

In [15]:
# Fill gaps in either average-temperature column from the other one, converting
# units on the way. This is done with vectorised fillna calls rather than a
# row-by-row iterrows loop: it avoids element-wise writes into Series pulled
# out of the frame (chained assignment) and a loop variable that shadowed the
# builtin `iter`.
# NOTE(review): this treats the station and reanalysis averages as
# interchangeable measurements — confirm that is acceptable for the analysis.
avg_c_raw = dfdropped['station_avg_temp_c']
avg_k_raw = dfdropped['reanalysis_avg_temp_k']

# Both fills read the untouched originals, so a value is only ever derived from
# an observed reading; rows where both columns are NaN remain NaN.
s_avg_c = avg_c_raw.fillna(kelvin_to_celsius(avg_k_raw))  # K is valid and C is not
s_avg_k = avg_k_raw.fillna(celsius_to_kelvin(avg_c_raw))  # C is valid and K is not

dfdropped['station_avg_temp_c'] = s_avg_c
dfdropped['reanalysis_avg_temp_k'] = s_avg_k

## Imputation

### Using scikit-learn's `IterativeImputer` with a `BayesianRidge` estimator

In [16]:
from sklearn.linear_model import BayesianRidge

In [17]:
# Iterative imputer that models each feature with a Bayesian ridge regression,
# capped at 20 imputation rounds and with progress output switched off.
imp = IterativeImputer(
    estimator=BayesianRidge(),
    max_iter=20,
    verbose=0,
)

In [18]:
# Set the 'week_start_date' column aside before fitting; it is re-attached to
# the imputed frame later on.
dfdroppedWSD = dfdropped.drop(columns=['week_start_date']).copy()

# Fit the imputer; left as the cell's last expression so the fitted estimator
# is displayed.
imp.fit(dfdroppedWSD)

IterativeImputer(add_indicator=False,
                 estimator=BayesianRidge(alpha_1=1e-06, alpha_2=1e-06,
                                         compute_score=False, copy_X=True,
                                         fit_intercept=True, lambda_1=1e-06,
                                         lambda_2=1e-06, n_iter=300,
                                         normalize=False, tol=0.001,
                                         verbose=False),
                 imputation_order='ascending', initial_strategy='mean',
                 max_iter=20, max_value=None, min_value=None,
                 missing_values=nan, n_nearest_features=None, random_state=None,
                 sample_posterior=False, tol=0.001, verbose=0)

In [19]:
# Apply the fitted imputer to the same frame it was fitted on.
imputed_df = imp.transform(X=dfdroppedWSD)

In [20]:
# Wrap the imputed values in a DataFrame again, restoring both the column
# labels and the row index of the frame that was imputed. Keeping the original
# index means index-aligned column assignments from the source data still line
# up row for row even if that index is not a default 0..n-1 range.
imputed_df = pd.DataFrame(
    imputed_df,
    columns=dfdroppedWSD.columns,
    index=dfdroppedWSD.index,
)

In [21]:
# Preview the first ten rows of the imputed features.
imputed_df.head(10)

Unnamed: 0,year,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
0,1990.0,18.0,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,297.742857,292.414286,...,32.0,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0
1,1990.0,19.0,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,298.442857,293.951429,...,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6
2,1990.0,20.0,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,298.878571,295.434286,...,26.1,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4
3,1990.0,21.0,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,299.228571,295.31,...,13.9,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0
4,1990.0,22.0,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,299.664286,295.821429,...,12.2,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8
5,1990.0,23.0,0.170587,0.17485,0.254314,0.181743,9.58,299.63,299.764286,295.851429,...,26.49,79.891429,9.58,17.212857,2.1,28.114286,6.942857,34.4,23.9,39.1
6,1990.0,24.0,0.1129,0.0928,0.205071,0.210271,3.48,299.207143,299.221429,295.865714,...,38.6,82.0,3.48,17.234286,2.042857,27.414286,6.771429,32.2,23.3,29.7
7,1990.0,25.0,0.0725,0.0725,0.151471,0.133029,151.12,299.591429,299.528571,296.531429,...,30.0,83.375714,151.12,17.977143,1.571429,28.371429,7.685714,33.9,22.8,21.1
8,1990.0,26.0,0.10245,0.146175,0.125571,0.1236,19.32,299.578571,299.557143,296.378571,...,37.51,82.768571,19.32,17.79,1.885714,28.328571,7.385714,33.9,22.8,21.1
9,1990.0,27.0,0.108732,0.12155,0.160683,0.202567,14.41,300.154286,300.278571,296.651429,...,28.4,81.281429,14.41,18.071429,2.014286,28.328571,6.514286,33.9,24.4,1.1


In [22]:
# Re-attach the city label that was removed before imputation.
imputed_df['city'] = df_trainf.loc[:, 'city']

In [23]:
# Re-attach the week start date that was removed before imputation.
imputed_df['week_start_date'] = df_trainf.loc[:, 'week_start_date']

In [24]:
# Display the column labels after re-attaching 'city' and 'week_start_date'.
imputed_df.columns

Index(['year', 'weekofyear', 'ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw',
       'precipitation_amt_mm', 'reanalysis_air_temp_k',
       'reanalysis_avg_temp_k', 'reanalysis_dew_point_temp_k',
       'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k',
       'reanalysis_precip_amt_kg_per_m2',
       'reanalysis_relative_humidity_percent', 'reanalysis_sat_precip_amt_mm',
       'reanalysis_specific_humidity_g_per_kg', 'reanalysis_tdtr_k',
       'station_avg_temp_c', 'station_diur_temp_rng_c', 'station_max_temp_c',
       'station_min_temp_c', 'station_precip_mm', 'city', 'week_start_date'],
      dtype='object')

In [25]:
# Display the imputed features indexed by (city, week_start_date).
# NOTE(review): the result is not assigned to a name here, so imputed_df itself
# presumably keeps its existing index — assign the result if the MultiIndex is
# needed in later cells.
imputed_df.set_index(['city','week_start_date'])

Unnamed: 0_level_0,Unnamed: 1_level_0,year,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
city,week_start_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
sj,1990-04-30,1990.0,18.0,0.122600,0.103725,0.198483,0.177617,12.42,297.572857,297.742857,292.414286,...,32.00,73.365714,12.42,14.012857,2.628571,25.442857,6.900000,29.4,20.0,16.0
sj,1990-05-07,1990.0,19.0,0.169900,0.142175,0.162357,0.155486,22.82,298.211429,298.442857,293.951429,...,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6
sj,1990-05-14,1990.0,20.0,0.032250,0.172967,0.157200,0.170843,34.54,298.781429,298.878571,295.434286,...,26.10,82.052857,34.54,16.848571,2.300000,26.714286,6.485714,32.2,22.8,41.4
sj,1990-05-21,1990.0,21.0,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,299.228571,295.310000,...,13.90,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0
sj,1990-05-28,1990.0,22.0,0.196200,0.262200,0.251200,0.247340,7.52,299.518571,299.664286,295.821429,...,12.20,80.460000,7.52,17.210000,3.014286,28.942857,9.371429,35.0,23.9,5.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
iq,2010-05-28,2010.0,21.0,0.342750,0.318900,0.256343,0.292514,55.30,299.334286,300.771429,296.825714,...,45.00,88.765714,55.30,18.485714,9.800000,28.633333,11.933333,35.4,22.4,27.0
iq,2010-06-04,2010.0,22.0,0.160157,0.160371,0.136043,0.225657,86.47,298.330000,299.392857,296.452857,...,207.10,91.600000,86.47,18.070000,7.471429,27.433333,10.500000,34.7,21.7,36.6
iq,2010-06-11,2010.0,23.0,0.247057,0.146057,0.250357,0.233714,58.94,296.598571,297.592857,295.501429,...,50.60,94.280000,58.94,17.008571,7.500000,24.400000,6.900000,32.2,19.2,7.4
iq,2010-06-18,2010.0,24.0,0.333914,0.245771,0.278886,0.325486,59.67,296.345714,297.521429,295.324286,...,62.33,94.660000,59.67,16.815714,7.871429,25.433333,8.733333,31.2,21.0,16.0
