In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder, StandardScaler, Normalizer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error

## 1. Global parameters

In [24]:
features_dir = '../data/01_raw/dengue_features_train.csv'
labels_dir = '../data/01_raw/dengue_labels_train.csv'
test_dir = '../data/01_raw/dengue_features_test.csv'

test_output_dir = 'output.csv'

corr_threshold = 0.3

## 2. Building blocks functions

In [25]:
def get_dt_col(X , column='week_start_date', format='%Y-%m-%d'):
    """
    Input:
        X
        col = weekofx

    Returns:
        X with X[col].dtype = datetime

    """
    X_new = X.copy()
    X_new[column] = pd.to_datetime(X_new[column], format=format)

    return X_new
    

In [41]:
def get_train_data(features_dir, labels_dir, drop=False):
    """ 
    Input:
        str to csv with train features and labels
        drop: to drop NaNs from feature columns.

    Output:
         X, y
    
    """
    features = pd.read_csv(features_dir)
    labels = pd.read_csv(labels_dir)

    data = features.merge(labels)

    print("Preprocessing of training data started.")
    
    if drop:
        data_clean = data.dropna()
        print("Droped {:.2f}% of rows.".format((1 - data_clean.shape[0]/data.shape[0])*100))
    else:
        data_clean = data.copy()

    data_clean.loc[:,'log_total_cases'] = np.log1p(data_clean['total_cases'])
    data_clean_dt = get_dt_col(data_clean)

    y = data_clean_dt['log_total_cases']
    X = data_clean_dt.drop(columns=['total_cases','log_total_cases'])
    print("Preprocessing of training data finished.\n")
        
    return X, y


In [42]:
def get_train_features(X, y, corr_threshold = 0.3, split = False):
    """ 
    Input:
        X
        y
        corr_threshold = 0.3
        split: to split the data by cities

    Output:
        numerical_list, categorical_list, other_list
    """
    data = X.copy()
    data.loc[:,'log_total_cases'] = y

    print("Feature selection in progress.")
    num_feat = data.select_dtypes(include=[np.number])

    correlation_matrix = num_feat.corr()

    cm = np.abs(correlation_matrix[['log_total_cases']]).sort_values(['log_total_cases'], ascending = False)
    mask = cm>corr_threshold
    boundary_index = np.sum(mask).iloc[0]

    ordered_num_features = cm.index[1:].to_list()
    numerical_selected = ordered_num_features[0:boundary_index-1]
    
    if split:
        categorical_cols = []
    else:
        categorical_cols = ['city']
        
    other_cols = ['week_start_date']
    
    # Ouput logs
    print("Used {:.2f}% of numerical feature with maximum correlation = {:.4f} and minimum correlation= {:.4f}."
          .format((1 - len(numerical_selected)/num_feat.shape[1])*100, 
          cm['log_total_cases'].iloc[1], cm['log_total_cases'].iloc[boundary_index-1]))
    print("The categorical features are: {}".format(categorical_cols))
    print("The other selected features are: {}.".format(other_cols))
    print("Feature selection has been accomplished. \n")

    return numerical_selected, categorical_cols, other_cols

In [43]:
def get_trained_pipeline(X, y, columns, model, train_size=0.8, validate = True, shuffle = True):
    """ 
    Input:
        X, y
        corr_threshold: Feature to Y correlation threshold for feature selection.

    Output:
        print statement with val mean abs error
        pipeline
        # X_valid, y_valid (for later validation trials) - commented out for now

    """

    X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=1-train_size, shuffle=shuffle)

    numerical_cols, categorical_cols, other_cols = columns[0], columns[1], columns[2]

    my_cols = categorical_cols + numerical_cols + other_cols

    X_train = X_train_full[my_cols].copy()
    X_valid = X_valid_full[my_cols].copy()

    my_pipeline = get_pipeline(model, columns)
    print("Training in progress.")
    my_pipeline.fit(X_train, y_train)
    print("Training finished.\n")
    
    if validate:
        print("Validation error in progress.")
        preds = my_pipeline.predict(X_valid)
        score = mean_absolute_error(y_valid, preds)
        print(' Finished Validation')
        print('MAE:', score)
        print('\n')

    return my_pipeline

In [44]:
def get_pipeline(model, columns):
    """ 
    Input:
        model
        
    Output:
        pipeline
    """

    numerical_cols, categorical_cols = columns[0], columns[1]

    numerical_transformer = Pipeline( steps =[
        ('imputer', SimpleImputer()),
        ('scalar', StandardScaler()),
        ('normalizer', Normalizer())
        # ('MinMaxScaler')
    ])
    # Preprocessing for categorical data
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder())
    ])

    # Bundle preprocessing for numerical and categorical data
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])
    
    my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])
    
    return my_pipeline

In [45]:
def get_prediction(X_test, trained_pipeline, exp=False):
    """
    Input:
        X_test
        trained_pipeline

    Output:
        prediction vector

        X_test -> apply dt_col

    """

    X = get_dt_col(X_test)
    prediction = trained_pipeline.predict(X)

    if exp:
        return np.expm1(prediction)
    else:
        return prediction

In [46]:
def get_test_output(X_test, predictions, indexes = ['city', 'year', 'weekofyear'], exponentiate=False, file_dir = '../data/output.csv'):
    """
    Input:
        X_test - 
        predictions - for both cities indexed as X_test requires it and 
                        with the same number of rows as X_test. 
        indexes (List(str)) - By default the columns of X_test for the required format,
                        i.e., ['city', 'year', 'weekofyear'];
        exp (bool) - to exponentiate predictions or not;
                     
        file_dir (str) - directory with the filename of the output csv file;
                         Give an empty string if saving the output is not desired.
        
    Returns:
        output_df (pd.DataFrame) - Dataframe with formatted results
        csv file
    """

    # 1. Exponentiate the predictions if needed.
    if exponentiate:
        result = np.round(np.expm1(predictions)).astype(int)
    else:
        result = np.round(predictions).astype(int)

    # 2. Make the dictionary for the output dataframe format.
    out_dict = {}
    for index in indexes:
        out_dict[index] = X_test[index]

    out_dict['total_cases'] = result
    output_df = pd.DataFrame.from_dict(out_dict).set_index(indexes)
    
    # 3. Save predictions in a csv file ready for submission.
    if file_dir:
        print("Saved predictions in competition file format in path {} . /n".format(file_dir))
        output_df.to_csv(file_dir)
        print('\n')

    return output_df
    

## 3. Pipeline

In [47]:

X, y = get_train_data(features_dir=features_dir, labels_dir=labels_dir)
columns = get_train_features(X=X, y=y, corr_threshold = corr_threshold)
model = LinearRegression()

trained_pipeline = get_trained_pipeline(X=X, y=y, columns=columns, model=model)


X_test = pd.read_csv(test_dir)
test_prediction = get_prediction(X_test, trained_pipeline, exp=False)
result_df = get_test_output(X_test=X_test,
                            predictions=test_prediction,
                            exponentiate=True,
                            file_dir=test_output_dir)


Preprocessing of training data started.
Preprocessing of training data finished.

Feature selection in progress.
Used 56.52% of numerical feature with maximum correlation = 0.5640 and minimum correlation= 0.3234.
The categorical features are: ['city']
The other selected features are: ['week_start_date'].
Feature selection has been accomplished. 

Training in progress.
Training finished.

Validation error in progress.
 Finished Validation
MAE: 0.730682949005616


Saved predictions in competition file format in path output.csv . /n




  return reduction(axis=axis, out=out, **passkwargs)
