# Data Exploration
(C) 2018 Dariusz Kajtoch

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

%matplotlib inline

sns.set_context('notebook')
plt.rcParams['text.usetex'] = False

In [5]:
# Read the training feature table from the local data directory, then show
# its column labels as the cell output.
data = pd.read_csv(filepath_or_buffer='./data/dengue_features_train.csv')
data.columns

Index(['city', 'year', 'weekofyear', 'week_start_date', 'ndvi_ne', 'ndvi_nw',
       'ndvi_se', 'ndvi_sw', 'precipitation_amt_mm', 'reanalysis_air_temp_k',
       'reanalysis_avg_temp_k', 'reanalysis_dew_point_temp_k',
       'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k',
       'reanalysis_precip_amt_kg_per_m2',
       'reanalysis_relative_humidity_percent', 'reanalysis_sat_precip_amt_mm',
       'reanalysis_specific_humidity_g_per_kg', 'reanalysis_tdtr_k',
       'station_avg_temp_c', 'station_diur_temp_rng_c', 'station_max_temp_c',
       'station_min_temp_c', 'station_precip_mm'],
      dtype='object')

# Data Description
1. Normalized difference vegetation index (NDVI): an indicator of live green vegetation. Values lie in [-1, 1]. Source: NOAA satellite.
    * `ndvi_se` - Pixel southeast of city centroid.
    * `ndvi_sw` - Pixel southwest of city centroid.
    * `ndvi_ne` - Pixel northeast of city centroid.
    * `ndvi_nw` - Pixel northwest of city centroid.
2. Precipitation: rain, snow and other.
    * `precipitation_amt_mm` - Total precipitation [mm].
3. 

* **GHCN** - Global Historical Climatology Network
* **PERSIANN** - Precipitation Estimation from Remotely Sensed Information using Artificial Neural Networks. See [here](https://climatedataguide.ucar.edu/climate-data/persiann-cdr-precipitation-estimation-remotely-sensed-information-using-artificial).
* **NCEP** - National Centers for Environmental Prediction.
* **Diurnal temperature range** - difference between maximal and minimal temperature during the day.

In [9]:
# Number of rows in the training feature table.
print(data.shape[0])

# Per-column summary: count of missing values and count of distinct
# (non-null) values, one row per column of `data`.
(
    data.isna().sum()
    .to_frame('NoNaNs')
    .assign(NoDistinct=data.nunique())
)

1456


Unnamed: 0,NoDistinct,NoNaNs
city,2,0
year,21,0
weekofyear,53,0
week_start_date,1049,0
ndvi_ne,1214,194
ndvi_nw,1365,52
ndvi_se,1395,22
ndvi_sw,1388,22
precipitation_amt_mm,1157,13
reanalysis_air_temp_k,1176,10


In [12]:
# Preview the two reanalysis temperature columns next to the station average
# temperature. The values are on different scales (~298 vs ~26 in the output),
# presumably Kelvin vs. Celsius as the `_k` / `_c` suffixes suggest.
# Only the first rows are shown: the frame has 1456 rows, and dumping all of
# them as cell output would bloat the notebook without adding information.
data[['reanalysis_air_temp_k', 'reanalysis_avg_temp_k', 'station_avg_temp_c']].head(10)

Unnamed: 0,reanalysis_air_temp_k,reanalysis_avg_temp_k,station_avg_temp_c
0,297.572857,297.742857,25.442857
1,298.211429,298.442857,26.714286
2,298.781429,298.878571,26.714286
3,298.987143,299.228571,27.471429
4,299.518571,299.664286,28.942857
5,299.630000,299.764286,28.114286
6,299.207143,299.221429,27.414286
7,299.591429,299.528571,28.371429
8,299.578571,299.557143,28.328571
9,300.154286,300.278571,28.328571
