# DengAI: Predicting Disease Spread

Using environmental data collected by various U.S. Federal Government agencies—from the Centers for Disease Control and Prevention to the National Oceanic and Atmospheric Administration in the U.S. Department of Commerce—can you predict the number of dengue fever cases reported each week in San Juan, Puerto Rico and Iquitos, Peru?

First, we'll take a look at some of the features of the training data.

In [22]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
import plotly.graph_objects as go
import plotly.express as px
import sklearn as sk
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn import linear_model

In [2]:
# Read the four competition CSVs from the local Data/ directory, in this order:
# training features, training labels, test features, submission template.
# NOTE: `df_labels_test` is loaded from the test *features* file, despite its name.
df, df_labels, df_labels_test, submission_format = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Data/dengue_features_train.csv",
        "Data/dengue_labels_train.csv",
        "Data/dengue_features_test.csv",
        "Data/submission_format.csv",
    )
)

# Preview the first five rows of each frame (head() defaults to 5 rows) and
# report the shape of the training features.
display(df.head())
print(df.shape)
display(df_labels.head())
display(df_labels_test.head())

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
0,sj,1990,18,1990-04-30,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,...,32.0,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0
1,sj,1990,19,1990-05-07,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,...,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6
2,sj,1990,20,1990-05-14,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,...,26.1,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4
3,sj,1990,21,1990-05-21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,...,13.9,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0
4,sj,1990,22,1990-05-28,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,...,12.2,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8


(1456, 24)


Unnamed: 0,city,year,weekofyear,total_cases
0,sj,1990,18,4
1,sj,1990,19,5
2,sj,1990,20,4
3,sj,1990,21,3
4,sj,1990,22,6


Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
0,sj,2008,18,2008-04-29,-0.0189,-0.0189,0.102729,0.0912,78.6,298.492857,...,25.37,78.781429,78.6,15.918571,3.128571,26.528571,7.057143,33.3,21.7,75.2
1,sj,2008,19,2008-05-06,-0.018,-0.0124,0.082043,0.072314,12.56,298.475714,...,21.83,78.23,12.56,15.791429,2.571429,26.071429,5.557143,30.0,22.2,34.3
2,sj,2008,20,2008-05-13,-0.0015,,0.151083,0.091529,3.66,299.455714,...,4.12,78.27,3.66,16.674286,4.428571,27.928571,7.785714,32.8,22.8,3.0
3,sj,2008,21,2008-05-20,,-0.019867,0.124329,0.125686,0.0,299.69,...,2.2,73.015714,0.0,15.775714,4.342857,28.057143,6.271429,33.3,24.4,0.3
4,sj,2008,22,2008-05-27,0.0568,0.039833,0.062267,0.075914,0.76,299.78,...,4.36,74.084286,0.76,16.137143,3.542857,27.614286,7.085714,33.3,23.3,84.1


In [3]:
# Append the labels to the main DataFrame for visualization. No `on=` is given,
# so pandas joins on the columns the two frames have in common (inner join).
df = df.merge(df_labels)

Now, let's visualize some of the features of our dataset.

In [1]:
# Distribution of the target variable: a histogram of weekly case counts.
tc_fig = px.histogram(df, x='total_cases', title='Total Cases Distribution')
tc_fig.show()

# Feature column name -> human-readable label used in plot titles and axes.
charts = {
    'reanalysis_sat_precip_amt_mm': 'Precipitation Amount (mm)',
    'reanalysis_avg_temp_k': 'Average Air Temperature (k)',
    'reanalysis_relative_humidity_percent': 'Mean Relative Humidity (%)',
    'reanalysis_dew_point_temp_k': 'Average Dew Point Temperature (k)',
}

# One scatter plot per feature: the feature value is on the x-axis and the
# number of cases on the y-axis. Points are coloured by case count and show the
# city on hover.
for key, label in charts.items():
    data = go.Scatter(
        x=df[key],
        y=df['total_cases'],
        mode='markers',
        text=df['city'],
        marker_color=df['total_cases'],
    )
    layout = go.Layout(
        title={
            'text': label + ' vs Number of Cases',
            'y': 0.9,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top',
        },
        # Axis titles must follow the data: x carries the feature, y the cases.
        xaxis_title=label,
        yaxis_title="Number of Cases",
    )
    fig = go.Figure(data=data, layout=layout)
    fig.show()



NameError: name 'px' is not defined

From what we can see, the data is fairly Gaussian. The dew point data is slightly skewed, but we'll ignore that for now. Next, let's explore how clean the data is.

In [5]:
# Count the rows that contain at least one NaN. This is vectorized: `any(axis=1)`
# flags each row with a missing value, and summing the flags counts them.
# int() keeps the count a plain Python int so the printed ratio formats as a float.
num_nans = int(df.isnull().any(axis=1).sum())
ratio_nan = num_nans / len(df)

print(str(ratio_nan * 100) + "% of rows have some NaN value\n\n")

# Bar chart of the percentage of missing values in each column.
nan_bar = go.Bar(x=df.columns, y=(df.isnull().sum() / df.shape[0]) * 100)
nan_layout = go.Layout(
    title={
        'text': 'Number of NaN values (%) in columns',
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    },
    xaxis_title='Column name',
    yaxis_title='Number of NaN values (%)',
)

fig = go.Figure(data=nan_bar, layout=nan_layout)
fig.show()

17.6510989010989% of rows have some NaN value




As we can see, the percentage of rows with some NaN value is pretty high, so it wouldn't be a great idea to just remove them entirely. Instead, let's try to fill those missing values with some meaningful data.

For this exploration, I've opted to replace missing values with that column's mean value. This is definitely not the only option, though; for example, we could also experiment with using some algorithm to predict the missing values.

In [6]:
# Impute missing values with each numeric column's mean.
# `numeric_only=True` is required because the frame also holds string columns
# (`city`, `week_start_date`): without it, pandas >= 2.0 raises a TypeError when
# it tries to average them. Non-numeric columns are left untouched.
# NOTE(review): the mean is taken over all rows, i.e. across both cities; a
# per-city mean may be more appropriate — confirm.
df.fillna(df.mean(numeric_only=True), inplace=True)

# Verify that no missing values remain and that the shape is unchanged.
print(df.isnull().sum())
print(df.shape)

city                                     0
year                                     0
weekofyear                               0
week_start_date                          0
ndvi_ne                                  0
ndvi_nw                                  0
ndvi_se                                  0
ndvi_sw                                  0
precipitation_amt_mm                     0
reanalysis_air_temp_k                    0
reanalysis_avg_temp_k                    0
reanalysis_dew_point_temp_k              0
reanalysis_max_air_temp_k                0
reanalysis_min_air_temp_k                0
reanalysis_precip_amt_kg_per_m2          0
reanalysis_relative_humidity_percent     0
reanalysis_sat_precip_amt_mm             0
reanalysis_specific_humidity_g_per_kg    0
reanalysis_tdtr_k                        0
station_avg_temp_c                       0
station_diur_temp_rng_c                  0
station_max_temp_c                       0
station_min_temp_c                       0
station_pre

Great. Now, let's try constructing a baseline model using Linear Regression.

In [46]:
# Regression target: the weekly case counts.
# NOTE(review): assumes the rows of `df_labels` line up one-to-one with the rows
# of `df` (the folds below index both positionally) — confirm.
y = df_labels[['total_cases']]

# Build the design matrix in a separate frame so that `df` is not mutated and
# this cell can be re-run safely:
#   * drop the target (keeping it as a feature would leak the answer) and the
#     raw date string, which LinearRegression cannot consume;
#   * one-hot encode `city`, since the model needs numeric inputs.
# `errors='ignore'` and the membership check keep this working if `df` has
# already been preprocessed by an earlier run.
X = df.drop(columns=['total_cases', 'week_start_date'], errors='ignore')
if 'city' in X.columns:
    X = pd.get_dummies(X, columns=['city'], drop_first=True)

# Fixed seed so the shuffled folds (and therefore the scores) are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
reg = linear_model.LinearRegression()

# NOTE: despite the variable name, LinearRegression.score returns the
# coefficient of determination (R^2), not a classification accuracy. The values
# stored here are R^2 * 100 and can be negative for a poor fit.
accuracies = []
for train_index, test_index in kf.split(X):
    d_train, d_test = X.iloc[train_index], X.iloc[test_index]
    t_train, t_test = y.iloc[train_index], y.iloc[test_index]

    reg.fit(d_train, t_train)
    accuracies.append(reg.score(d_test, t_test) * 100)


# One bar per fold; the fold numbers are derived from the number of scores
# rather than hard-coded.
fig = go.Figure(data=[
    go.Bar(name='R² Scores', x=list(range(1, len(accuracies) + 1)), y=accuracies)
])

fig.update_layout(xaxis_title='k-th Fold', yaxis_title='R² score (%)')
fig.show()