In [2]:
import os

import numpy as np
import pandas as pd

from resources.models import random_forest, xgboost, loaded_model
from resources.functions.cross_feature import cross_feature
from resources.functions.scale_data import normalize_df, standardize_df
from resources.functions.get_outliers import get_outlier_per_feature, get_outlier_index_list
from resources.functions.model_functions import get_model_predictions, get_stacked_model
from resources.properties import PATH_TRAIN, PATH_TEST

## Cleaning Pipeline:
1. Drop rows that contain null values.
2. Separate numeric and categorical features.
3. Scale and/or perform feature crossing on the numeric features.
4. Get the outlier index list from the numeric features.
5. Apply `get_dummies` to the categorical features.
6. Concatenate the numeric and categorical features.
7. Remove outliers using Z-score.

In [3]:
def prepare_data(df_train: pd.DataFrame, X_test: pd.DataFrame, file_name_to_save: str, drop_null=True,
                 normal=False, stand=False, feature_crossing=False, drop_outliers=True, get_dummies=True):
    """Clean the training and test tables and save the cleaned training set as CSV.

    Pipeline:
      1. optionally drop training rows containing nulls and renumber both frames,
      2. split features into numeric and non-numeric columns,
      3. optionally pass the numeric columns through normalize_df, standardize_df
         and/or cross_feature,
      4. collect the outlier row labels of the numeric training columns,
      5. optionally one-hot encode the non-numeric columns,
      6. concatenate encoded and numeric columns again,
      7. optionally drop the outlier rows from the training set,
      8. give the test features exactly the columns (and column order) of the
         training features,
      9. write the cleaned training set to 'cleaned-data/<file_name_to_save>.csv'.

    Parameters
    ----------
    df_train : pd.DataFrame
        Training table; must contain a 'Target' column.
    X_test : pd.DataFrame
        Test features (no 'Target' column is read from it).
    file_name_to_save : str
        Base name (no folder, no extension) of the CSV to write.
    drop_null : bool
        When true, training rows with any null are dropped and the row index of
        both frames is replaced by 0..n-1. Test rows are never dropped.
    normal, stand, feature_crossing : bool
        Apply normalize_df / standardize_df / cross_feature (in that order) to
        the numeric columns of both frames.
    drop_outliers : bool
        When true, drop the training rows reported by get_outlier_index_list.
    get_dummies : bool
        When true, encode the non-numeric columns with pd.get_dummies.

    Returns
    -------
    dict
        'DataFrame' -> cleaned training frame (features plus 'Target'),
        'file_name' -> path of the CSV that was written,
        'X_test'    -> prepared test features, column-aligned with the training features.
    """
    # 1. Dropping null values
    if drop_null:
        df_train = df_train.dropna(axis=0, how='any')
        # Discard the old row labels (the 'ID' index) and renumber from 0.
        # errors='ignore' also covers frames where 'ID' is absent or is a plain column.
        df_train = df_train.reset_index(drop=True).drop(columns='ID', errors='ignore')
        X_test = X_test.reset_index(drop=True).drop(columns='ID', errors='ignore')

    # 2. Separating target, numeric and categorical features
    y_train = df_train.Target
    X_train = df_train.drop(columns='Target')
    X_train_numer = X_train.select_dtypes(include=[np.number])
    X_train_categ = X_train.select_dtypes(exclude=[np.number])
    X_test_numer = X_test.select_dtypes(include=[np.number])
    X_test_categ = X_test.select_dtypes(exclude=[np.number])

    # 3. Normalize, standardize and/or cross features
    # NOTE(review): train and test are handed to the scaling helpers separately;
    # whether the test set ends up scaled with its own statistics depends on those
    # helpers and cannot be seen from here -- confirm.
    if normal:
        X_train_numer = normalize_df(X_train_numer)
        X_test_numer = normalize_df(X_test_numer)
    if stand:
        X_train_numer = standardize_df(X_train_numer)
        X_test_numer = standardize_df(X_test_numer)
    if feature_crossing:
        X_train_numer = cross_feature(X_train_numer)
        X_test_numer = cross_feature(X_test_numer)

    # 4. Outlier row labels, computed on the (possibly transformed) numeric training columns
    outlier_index_list_train = get_outlier_index_list(get_outlier_per_feature, X_train_numer)

    # 5. Dummies
    if get_dummies:
        X_train_categ = pd.get_dummies(X_train_categ)
        X_test_categ = pd.get_dummies(X_test_categ)

    # 6. Concatenation (categorical columns first, then numeric ones)
    X_train = pd.concat([X_train_categ, X_train_numer], axis=1)
    df_train = pd.concat([X_train, y_train], axis=1)
    X_test = pd.concat([X_test_categ, X_test_numer], axis=1)

    # 7. Removing outliers (training set only)
    if drop_outliers:
        df_train = df_train.drop(index=outlier_index_list_train)

    # 8. Aligning X_test with the training features.
    # Train and test are encoded independently, so a category seen only in one of
    # them (e.g. 'PanelG_D') yields mismatched columns. Columns missing from the test
    # set are added filled with 0, columns unknown to the training set are dropped,
    # and the training column order is adopted.
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

    # 9. Saving data (the folder is created on first use)
    output_path = f'cleaned-data/{file_name_to_save}.csv'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df_train.to_csv(output_path)

    return {
        'DataFrame': df_train,
        'file_name': output_path,
        'X_test': X_test,
    }

## Datasets

In [4]:
# Raw training table; the first CSV column becomes the row index
df_train = pd.read_csv(filepath_or_buffer=PATH_TRAIN, index_col=0)

# Raw test table, indexed the same way
X_test = pd.read_csv(filepath_or_buffer=PATH_TEST, index_col=0)

In [5]:
# Base name (no folder, no extension) of the cleaned training CSV that prepare_data writes
file_name_to_save = 'dum-zscore-cross-stan'
prepared_data = prepare_data(
    df_train=df_train,
    X_test=X_test,
    file_name_to_save=file_name_to_save,
    drop_null=True,
    normal=False,
    stand=True,
    feature_crossing=True,
    drop_outliers=True,
    get_dummies=True,
)

In [6]:
# Read back the cleaned training CSV (same path prepare_data saved to).
# NOTE(review): this rebinds df_train, replacing the raw table loaded earlier.
path_train = f'cleaned-data/{file_name_to_save}.csv'
df_train = pd.read_csv(filepath_or_buffer=path_train, index_col=0)

## Training the models:
After studying the performance of several models on the training data, the following were chosen:
- XGboost
- Random Forest
- Random Forest Optimized by grid_search
- Stacking Classifier

In [8]:
# models chosen to run get_model_predictions, they were already initialized on models.py
models = {
    'XGboost': xgboost,
    'RandomForest': random_forest,
    'RandomForestOptimized': loaded_model,
    'Stacking': get_stacked_model(),
}

In [10]:
# Unpack the result of the prepare_data call above into test features,
# training features and training labels.
# NOTE(review): X_test is rebound here to the prepared frame, replacing the raw
# table loaded earlier; re-running the prepare_data cell afterwards would feed it
# the already-prepared frame.
X_test = prepared_data['X_test']
X_train = prepared_data['DataFrame'].drop(columns='Target')
y_train = prepared_data['DataFrame']['Target']

predictions = get_model_predictions(models, X_train, y_train, X_test)



In [14]:
# Show the model names for which get_model_predictions returned an entry
predictions.keys()

dict_keys(['XGboost', 'RandomForest', 'RandomForestOptimized', 'Stacking'])

In [11]:
# Build the submission table from predictions[model_name]['predictions'].
# The test CSV is read again only to recover its original index (the IDs),
# which labels the rows of the submission.
raw_X_test = pd.read_csv(filepath_or_buffer=PATH_TEST, index_col=0)

# 'RandomForestOptimized' is the model selected from the model_comparator results;
# change the key to submit another model's predictions.
solution = pd.DataFrame(
    predictions['RandomForestOptimized']['predictions'],
    columns=['Target'],
    index=raw_X_test.index,
)

NameError: name 'path_test' is not defined

In [None]:
# Name under which the selected model's predictions are stored
output_name = 'test-1'
output_path = f'outputs/{output_name}.csv'
# Create the target folder on first use; to_csv does not create missing directories
os.makedirs(os.path.dirname(output_path), exist_ok=True)
solution.to_csv(output_path)