In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [70]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder, StandardScaler, Normalizer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error

## 1. Global parameters

## 2. Building blocks functions

In [9]:
def get_dt_col(X , column='week_start_date', format='%Y-%m-%d'):
    """
    Input:
        X
        col = weekofx

    Returns:
        X with X[col].dtype = datetime

    """
    X_new = X.copy()
    X_new[column] = pd.to_datetime(X_new[column], format=format)

    return X_new
    

In [87]:
def get_train_data(features_dir, labels_dir, drop=False):
    """ 
    Input:
        str to csv with train features and labels
        drop: to drop NaNs from feature columns.

    Output:
         X, y
    
    """
    features = pd.read_csv(features_dir)
    labels = pd.read_csv(labels_dir)

    data = features.merge(labels)

    print("Preprocessing of training data started.")
    
    if drop:
        data_clean = data.dropna()
        print("Droped {:.2f}% of rows.".format((1 - data_clean.shape[0]/data.shape[0])*100))
    else:
        data_clean = data.copy()

    #data_clean.loc[:,'log_total_cases'] = np.log1p(data_clean['total_cases'])
    data_clean.loc[:,'log_total_cases'] = data_clean['total_cases']
    data_clean_dt = get_dt_col(data_clean)

    y = data_clean_dt['log_total_cases'].astype(int)
    X = data_clean_dt.drop(columns=['total_cases','log_total_cases'])
    print("Preprocessing of training data finished.\n")
        
    return X, y


In [93]:
def get_train_features(X, y, corr_threshold = 0.3, split = False, skip = False):
    """ 
    Input:
        X
        y
        corr_threshold = 0.3
        split: to split the data by cities

    Output:
        numerical_list, categorical_list, other_list
    """
    data = X.copy()
    data.loc[:,'log_total_cases'] = y

    print("Feature selection in progress.")
    num_feat = data.select_dtypes(include=[np.number])

    correlation_matrix = num_feat.corr()

    cm = np.abs(correlation_matrix[['log_total_cases']]).sort_values(['log_total_cases'], ascending = False)
    mask = cm>corr_threshold
    boundary_index = np.sum(mask).iloc[0]

    ordered_num_features = cm.index[1:].to_list()
    numerical_selected = ordered_num_features[0:boundary_index-1]

    if skip:
        numerical_selected = num_feat.columns.to_list()
        numerical_selected.remove('log_total_cases')
        numerical_selected.pop(0)
    
    if split:
        categorical_cols = []
    else:
        categorical_cols = ['city']
        
    other_cols = []
    
    # Ouput logs
    print("Used {:.2f}% of numerical feature with maximum correlation = {:.4f} and minimum correlation= {:.4f}."
          .format((1 - len(numerical_selected)/num_feat.shape[1])*100, 
          cm['log_total_cases'].iloc[1], cm['log_total_cases'].iloc[boundary_index-1]))
    print("The categorical features are: {}".format(categorical_cols))
    print("The other selected features are: {}.".format(other_cols))
    print("Feature selection has been accomplished. \n")

    return numerical_selected, categorical_cols, other_cols

In [50]:
def get_trained_pipeline(X, y, columns, model, train_size=0.8, validate = True, shuffle = True):
    """ 
    Input:
        X, y
        corr_threshold: Feature to Y correlation threshold for feature selection.

    Output:
        print statement with val mean abs error
        pipeline
        # X_valid, y_valid (for later validation trials) - commented out for now

    """

    X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=1-train_size, shuffle=shuffle)

    numerical_cols, categorical_cols, other_cols = columns[0], columns[1], columns[2]

    my_cols = categorical_cols + numerical_cols + other_cols

    X_train = X_train_full[my_cols].copy()
    X_valid = X_valid_full[my_cols].copy()

    my_pipeline = get_pipeline(model, columns)

    
    if validate:
        print("Training in progress.")
        my_pipeline.fit(X_train, y_train)
        print("Training finished.\n")
        print("Validation error in progress.")
        preds = my_pipeline.predict(X_valid)
        score = mean_absolute_error(y_valid, preds)
        print(' Finished Validation')
        print('MAE:', score)
        print('\n')
    else:
        print("Training in progress.")
        my_pipeline.fit(X[my_cols], y)
        print("Training finished.\n")

    return my_pipeline

In [13]:
def get_pipeline(model, columns):
    """ 
    Input:
        model
        
    Output:
        pipeline
    """

    numerical_cols, categorical_cols = columns[0], columns[1]

    numerical_transformer = Pipeline( steps =[
        ('imputer', KNNImputer()),
        ('scalar', StandardScaler()),
        ('normalizer', Normalizer())
        # ('MinMaxScaler')
    ])
    # Preprocessing for categorical data
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder())
    ])

    # Bundle preprocessing for numerical and categorical data
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])
    
    my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])
    
    return my_pipeline

In [14]:
def get_prediction(X_test, trained_pipeline, exp=False):
    """
    Input:
        X_test
        trained_pipeline

    Output:
        prediction vector

        X_test -> apply dt_col

    """

    X = get_dt_col(X_test)
    prediction = trained_pipeline.predict(X)

    if exp:
        return np.expm1(prediction)
    else:
        return prediction

In [15]:
def get_test_output(X_test, predictions, indexes = ['city', 'year', 'weekofyear'], exponentiate=False, file_dir = '../data/output.csv'):
    """
    Input:
        X_test - 
        predictions - for both cities indexed as X_test requires it and 
                        with the same number of rows as X_test. 
        indexes (List(str)) - By default the columns of X_test for the required format,
                        i.e., ['city', 'year', 'weekofyear'];
        exp (bool) - to exponentiate predictions or not;
                     
        file_dir (str) - directory with the filename of the output csv file;
                         Give an empty string if saving the output is not desired.
        
    Returns:
        output_df (pd.DataFrame) - Dataframe with formatted results
        csv file
    """

    # 1. Exponentiate the predictions if needed.
    if exponentiate:
        result = np.round(np.expm1(predictions)).astype(int)
    else:
        result = np.round(predictions).astype(int)

    # 2. Make the dictionary for the output dataframe format.
    out_dict = {}
    for index in indexes:
        out_dict[index] = X_test[index]

    out_dict['total_cases'] = result
    output_df = pd.DataFrame.from_dict(out_dict).set_index(indexes)
    
    # 3. Save predictions in a csv file ready for submission.
    if file_dir:
        print("Saved predictions in competition file format in path {} . /n".format(file_dir))
        output_df.to_csv(file_dir)
        print('\n')

    return output_df
    

In [16]:
def split_by_city(from_dir = '../data/01_raw/', to_dir = '../data/02_intermediate/'):
    
    features = pd.read_csv(from_dir + 'dengue_features_train.csv')
    labels = pd.read_csv(from_dir + 'dengue_labels_train.csv')
    test = pd.read_csv(from_dir + 'dengue_features_test.csv')

    features[features.city == 'sj'].to_csv(to_dir + 'dengue_features_train_sj.csv')
    features[features.city == 'iq'].to_csv(to_dir + 'dengue_features_train_iq.csv')

    labels[labels.city == 'sj'].to_csv(to_dir + 'dengue_labels_train_sj.csv')
    labels[labels.city == 'iq'].to_csv(to_dir + 'dengue_labels_train_iq.csv')

    test[test.city == 'sj'].to_csv(to_dir + 'dengue_features_test_sj.csv')
    test[test.city == 'iq'].to_csv(to_dir + 'dengue_features_test_iq.csv')

    return None

In [101]:
def bind_city_outputs(raw_dir = '../data/01_raw/', output_dir = '../data/07_model_output/'):

    test_data = pd.read_csv(raw_dir + 'dengue_features_test.csv')

    output_sj = pd.read_csv(output_dir + 'output_sj.csv')
    output_iq = pd.read_csv(output_dir + 'output_iq.csv')

    output = pd.concat([output_sj, output_iq])

    test_data[['city','year','weekofyear']].merge(output).to_csv(output_dir + 'output_two_cities.csv', index=False)

    return None

## 3. Pipeline

In [46]:
corr_threshold = 0

In [None]:
split_by_city()

In [94]:
features_dir = '../data/02_intermediate/dengue_features_train_sj.csv'
labels_dir = '../data/02_intermediate/dengue_labels_train_sj.csv'
test_dir = '../data/02_intermediate/dengue_features_test_sj.csv'
test_output_dir = '../data/07_model_output/output_sj.csv'

X, y = get_train_data(features_dir=features_dir, labels_dir=labels_dir)
columns = get_train_features(X=X, y=y, corr_threshold = corr_threshold, split=True, skip = True)
print(columns)

Preprocessing of training data started.
Preprocessing of training data finished.

Feature selection in progress.
Used 8.33% of numerical feature with maximum correlation = 0.2871 and minimum correlation= 0.0003.
The categorical features are: []
The other selected features are: [].
Feature selection has been accomplished. 

(['year', 'weekofyear', 'ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw', 'precipitation_amt_mm', 'reanalysis_air_temp_k', 'reanalysis_avg_temp_k', 'reanalysis_dew_point_temp_k', 'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k', 'reanalysis_precip_amt_kg_per_m2', 'reanalysis_relative_humidity_percent', 'reanalysis_sat_precip_amt_mm', 'reanalysis_specific_humidity_g_per_kg', 'reanalysis_tdtr_k', 'station_avg_temp_c', 'station_diur_temp_rng_c', 'station_max_temp_c', 'station_min_temp_c', 'station_precip_mm'], [], [])


  return reduction(axis=axis, out=out, **passkwargs)


In [98]:
features_dir = '../data/02_intermediate/dengue_features_train_sj.csv'
labels_dir = '../data/02_intermediate/dengue_labels_train_sj.csv'
test_dir = '../data/02_intermediate/dengue_features_test_sj.csv'
test_output_dir = '../data/07_model_output/output_sj.csv'

X, y = get_train_data(features_dir=features_dir, labels_dir=labels_dir)
columns = get_train_features(X=X, y=y, corr_threshold = corr_threshold, split=True, skip = True)
model = RandomForestRegressor(criterion='absolute_error')

trained_pipeline = get_trained_pipeline(X=X, y=y, columns=columns, model=model, validate=False)


X_test = pd.read_csv(test_dir)
test_prediction = get_prediction(X_test, trained_pipeline, exp=False)
result_df = get_test_output(X_test=X_test,
                            predictions=test_prediction,
                            exponentiate=False,
                            file_dir=test_output_dir)

Preprocessing of training data started.
Preprocessing of training data finished.

Feature selection in progress.
Used 8.33% of numerical feature with maximum correlation = 0.2871 and minimum correlation= 0.0003.
The categorical features are: []
The other selected features are: [].
Feature selection has been accomplished. 

Training in progress.


  return reduction(axis=axis, out=out, **passkwargs)


Training finished.

Saved predictions in competition file format in path ../data/07_model_output/output_sj.csv . /n




In [99]:
features_dir = '../data/02_intermediate/dengue_features_train_iq.csv'
labels_dir = '../data/02_intermediate/dengue_labels_train_iq.csv'
test_dir = '../data/02_intermediate/dengue_features_test_iq.csv'
test_output_dir = '../data/07_model_output/output_iq.csv'

X, y = get_train_data(features_dir=features_dir, labels_dir=labels_dir)
columns = get_train_features(X=X, y=y, corr_threshold = corr_threshold, split=True, skip = True)
model = RandomForestRegressor(criterion='absolute_error')

trained_pipeline = get_trained_pipeline(X=X, y=y, columns=columns, model=model, validate=False)


X_test = pd.read_csv(test_dir)
test_prediction = get_prediction(X_test, trained_pipeline, exp=False)
result_df = get_test_output(X_test=X_test,
                            predictions=test_prediction,
                            exponentiate=False,
                            file_dir=test_output_dir)

Preprocessing of training data started.
Preprocessing of training data finished.

Feature selection in progress.
Used 8.33% of numerical feature with maximum correlation = 0.2365 and minimum correlation= 0.0096.
The categorical features are: []
The other selected features are: [].
Feature selection has been accomplished. 

Training in progress.


  return reduction(axis=axis, out=out, **passkwargs)


Training finished.

Saved predictions in competition file format in path ../data/07_model_output/output_iq.csv . /n




In [102]:
bind_city_outputs()