In [15]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

%matplotlib inline

In [None]:
# Read the raw train and test feature tables from the working directory.
training_records, testing_records = (
    pd.read_csv(csv_name)
    for csv_name in ('dengue_features_train.csv', 'dengue_features_test.csv')
)

In [17]:
# Split each feature table by city code; once split, the 'city' column carries
# no information, so it is dropped.
training_records_sj = training_records.loc[training_records['city'] == 'sj'].drop(columns='city')
training_records_iq = training_records.loc[training_records['city'] == 'iq'].drop(columns='city')
testing_records_sj = testing_records.loc[testing_records['city'] == 'sj'].drop(columns='city')
testing_records_iq = testing_records.loc[testing_records['city'] == 'iq'].drop(columns='city')

# Stack train rows on top of test rows per city, with a fresh 0..n-1 index.
records_iq = pd.concat(
    [training_records_iq, testing_records_iq],
    ignore_index=True,
)
records_sj = pd.concat(
    [training_records_sj, testing_records_sj],
    ignore_index=True,
)

In [18]:
def _preprocess_city_features(city_records):
    """Return a cleaned copy of one city's stacked train+test feature table.

    Steps, applied identically to every city:
      1. drop three reanalysis columns and 'year';
      2. shift the four remaining reanalysis temperature columns by -273.15
         (Kelvin to Celsius);
      3. add 'station_temp' = row-wise mean of the avg/max/min station
         temperatures after linear interpolation of their gaps;
      4. drop the four raw station temperature columns.

    The input frame is not modified; a new DataFrame is returned.
    """
    dropped_cols = [
        'reanalysis_avg_temp_k',
        'reanalysis_sat_precip_amt_mm',
        'reanalysis_specific_humidity_g_per_kg',
        'year',
    ]
    kelvin_cols = [
        'reanalysis_air_temp_k',
        'reanalysis_dew_point_temp_k',
        'reanalysis_max_air_temp_k',
        'reanalysis_min_air_temp_k',
    ]
    station_temp_cols = ['station_avg_temp_c', 'station_max_temp_c', 'station_min_temp_c']

    cleaned = city_records.drop(columns=dropped_cols)
    cleaned[kelvin_cols] = cleaned[kelvin_cols] - 273.15
    # Interpolate before averaging so a gap in one station column does not
    # skew that row's mean toward the columns that happen to be present.
    cleaned['station_temp'] = cleaned[station_temp_cols].interpolate().mean(axis=1)
    return cleaned.drop(columns=station_temp_cols + ['station_diur_temp_rng_c'])


# NOTE: this rebinds records_iq / records_sj. Re-running this cell without
# first re-running the cell that builds them fails, because the dropped
# columns are already gone.
records_iq = _preprocess_city_features(records_iq)
records_sj = _preprocess_city_features(records_sj)

In [19]:
# Persist the cleaned per-city feature tables. to_csv does not create missing
# parent directories, so make sure ./Files exists (a fresh checkout may not
# have it) before writing.
os.makedirs('./Files', exist_ok=True)
records_iq.to_csv('./Files/PreProcessed-features-iq.csv', index=False)
records_sj.to_csv('./Files/PreProcessed-features-sj.csv', index=False)

In [20]:
# Read the training labels table from the working directory.
training_targets = pd.read_csv(filepath_or_buffer='dengue_labels_train.csv')

In [21]:
# week_start_date exists only in the features table. It is copied across by
# index alignment (both frames carry the default integer index from read_csv),
# which is only correct when both tables list the same (city, year, weekofyear)
# rows in the same order -- verify that instead of assuming it.
label_key_cols = ['city', 'year', 'weekofyear']
labels_match_features = (
    len(training_targets) == len(training_records)
    and (training_targets[label_key_cols].values == training_records[label_key_cols].values).all()
)
if not labels_match_features:
    raise ValueError(
        'dengue_labels_train.csv and dengue_features_train.csv do not list the same '
        '(city, year, weekofyear) rows in the same order; cannot attach week_start_date.'
    )

training_targets = training_targets.drop(columns=['year'])
training_targets['week_start_date'] = training_records['week_start_date']

# Per-city label tables; 'city' is redundant once split.
training_targets_sj = training_targets[training_targets['city'] == 'sj'].drop('city', axis=1)
training_targets_iq = training_targets[training_targets['city'] == 'iq'].drop('city', axis=1)

In [22]:
# Persist the per-city label tables next to the preprocessed features.
for city_labels, labels_path in [
    (training_targets_iq, './Files/PreProcessed-labels-train-iq.csv'),
    (training_targets_sj, './Files/PreProcessed-labels-train-sj.csv'),
]:
    city_labels.to_csv(labels_path, index=False)

In [23]:
# Reload the persisted tables with week_start_date parsed as dates and used as
# the index. Feature tables additionally get their remaining gaps filled by
# interpolate(); label tables are loaded as-is.
_date_index_opts = dict(parse_dates=['week_start_date'], index_col='week_start_date')

records_iq = pd.read_csv('./Files/PreProcessed-features-iq.csv', **_date_index_opts).interpolate()
records_sj = pd.read_csv('./Files/PreProcessed-features-sj.csv', **_date_index_opts).interpolate()

labels_iq = pd.read_csv('./Files/PreProcessed-labels-train-iq.csv', **_date_index_opts)
labels_sj = pd.read_csv('./Files/PreProcessed-labels-train-sj.csv', **_date_index_opts)

In [24]:
def getPredictions(Id, totalRecords, labels, numOfTrain, period, features):
    """Fit a seasonal + trend linear model for one city and predict its test rows.

    The target is modelled as the sum of two linear regressions:
      * a seasonal model on one-hot encoded ``weekofyear``;
      * a trend model on ``period``-row rolling means of ``features``, fitted
        to what remains after subtracting the (smoothed) seasonal fit.

    Parameters
    ----------
    Id : any
        Label used only in the printed training-error message.
    totalRecords : pandas.DataFrame
        Training rows followed by test rows. Must contain ``weekofyear`` and
        every column named in ``features``.
    labels : pandas.DataFrame
        Must contain ``total_cases``, one row per training row, in the same
        order as the first ``numOfTrain`` rows of ``totalRecords``.
    numOfTrain : int
        Number of leading rows of ``totalRecords`` that are training rows.
    period : int
        Rolling-window length (in rows) for the trend features. The first
        ``period`` training rows are excluded from the trend fit because their
        rolling mean is incomplete.
    features : list of str
        Columns of ``totalRecords`` used by the trend model.

    Returns
    -------
    pandas.Series
        Integer predictions for ``totalRecords[numOfTrain:]``, indexed 0..n-1.

    Raises
    ------
    ValueError
        If ``labels`` does not have exactly ``numOfTrain`` rows.
    """
    if len(labels) != numOfTrain:
        raise ValueError(
            'numOfTrain ({}) must equal the number of label rows ({})'.format(
                numOfTrain, len(labels)
            )
        )

    # One-hot encode weekofyear over train+test together so that both splits
    # end up with the same dummy columns.
    weeks = pd.get_dummies(totalRecords['weekofyear'], prefix='w')
    train_time = weeks[:numOfTrain].reset_index(drop=True)
    test_time = weeks[numOfTrain:].reset_index(drop=True)
    train_cases = labels[['total_cases']].reset_index(drop=True)

    # Seasonality model: cases explained by week-of-year alone.
    seasonal_model = LinearRegression()
    seasonal_model.fit(train_time, train_cases)

    seasonal_train = pd.Series(
        seasonal_model.predict(train_time).flatten()
    ).rolling(5, min_periods=1, center=True).mean()

    # What the seasonal fit leaves unexplained is the trend model's target.
    train_trendComponent = train_cases.total_cases - seasonal_train

    trend = totalRecords[features].reset_index(drop=True).rolling(period).mean()

    train_trend = trend[period:numOfTrain]
    test_trend = trend[numOfTrain:]
    train_remainder = train_trendComponent[period:]

    # Trend model: seasonal residual explained by smoothed climate features.
    trend_model = LinearRegression()
    trend_model.fit(train_trend, train_remainder)
    train_pred_trend = pd.Series(trend_model.predict(train_trend).flatten())

    train_mae = mean_absolute_error(
        y_pred=train_pred_trend.values + seasonal_train[period:].values,
        y_true=train_cases['total_cases'][period:].values,
    )
    print('Mean_absolute_error for example '+str(Id) +" - "+ str(train_mae))

    predicted_seasonal = pd.Series(seasonal_model.predict(test_time).flatten())
    predicted_trend = pd.Series(trend_model.predict(test_trend).flatten())

    smoothed = (predicted_trend + predicted_seasonal).rolling(5, min_periods=1, center=True).mean()
    # Round to the nearest whole count before casting: a bare astype(int)
    # truncates toward zero (3.9 -> 3, -0.9 -> 0), which biases the predictions
    # and makes any later .round() by the caller a no-op.
    pred = smoothed.round().astype(int)
    return pred

In [25]:
# Columns whose rolling mean feeds the trend model (same set for both cities).
trend_features = [
    'reanalysis_precip_amt_kg_per_m2',
    'reanalysis_relative_humidity_percent',
    'station_temp',
]
# Rolling-window length in rows; records are weekly, so this is presumably
# "one year including a week 53" -- TODO confirm the intent.
trend_window = 53

# The train/test boundary is the number of labelled rows, so derive it from the
# label tables rather than hard-coding the row counts (520 for iq, 936 for sj).
pred_iq = getPredictions(1, records_iq, labels_iq, len(labels_iq), trend_window, trend_features)
pred_sj = getPredictions(2, records_sj, labels_sj, len(labels_sj), trend_window, trend_features)

Mean_absolute_error for example 1 - 6.209774527003167
Mean_absolute_error for example 2 - 25.275639609689655


In [26]:
# Test rows per city, taken from the raw table so 'city' and 'year' are still present.
test_sj = testing_records[testing_records['city'] == 'sj']
test_iq = testing_records[testing_records['city'] == 'iq']

# Identifier columns, San Juan rows first and Iquitos rows second.
submission_keys = {
    key_col: pd.concat([test_sj[key_col], test_iq[key_col]], ignore_index=True)
    for key_col in ('city', 'year', 'weekofyear')
}
# Predictions stacked in the same city order, rounded, cast to int and floored at zero.
submission_cases = pd.concat([pred_sj, pred_iq], ignore_index=True).round().astype(int).clip(lower=0)

submission = pd.DataFrame({**submission_keys, 'total_cases': submission_cases})

In [27]:
# Write the submission file, confirm on stdout, then preview the first rows.
submission.to_csv(path_or_buf='submission.csv', index=False)
print("Submission saved to submission.csv")
submission.head(5)

Submission saved to submission.csv


Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,0
1,sj,2008,19,0
2,sj,2008,20,0
3,sj,2008,21,0
4,sj,2008,22,0
