In [7]:
import pickle

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import label_binarize, LabelBinarizer
from sklearn.tree import DecisionTreeClassifier

from resources.functions.model_functions import get_stacked_model, get_grid, get_grid_predictions
from resources.properties import PATH_TRAIN, PATH_TEST, RANDOM_STATE

In [8]:
# Hyper-parameter search spaces, one dict per model family.
# Keys prefixed with `final_estimator__` use scikit-learn's nested-parameter
# syntax, so they only apply to an estimator that exposes a `final_estimator`
# parameter.
tree_params = {
    # scikit-learn requires max_leaf_nodes > 1, so the range starts at 2.
    'final_estimator__max_leaf_nodes': list(range(2, 100)),
    # An integer min_samples_split must be >= 2 (floats are read as a fraction
    # of the samples), so the invalid integer 1 is left out.
    'final_estimator__min_samples_split': [0.5, 2, 3, 4],
    'final_estimator__min_samples_leaf': range(1, 10),
    'final_estimator__criterion': ['gini', 'entropy'],
    'final_estimator__max_depth': [1, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
}

random_forest_params = {
    'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
    # This key used to appear twice in the literal, so the first list was
    # silently discarded. 'auto' is a legacy alias of 'sqrt' for classifiers
    # (removed in recent scikit-learn releases), hence a single list.
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'min_samples_leaf': [0.5, 1, 2, 4],
    # Integer 1 is rejected by scikit-learn for min_samples_split (see above).
    'min_samples_split': [0.5, 2, 3],
}

# NOTE(review): 'svd', 'cholesky', 'lsqr' and 'sparse_cg' look like Ridge
# solvers while `penalty`/`C` look like LogisticRegression parameters —
# confirm which final estimator this grid targets before using it.
meta_params = {
    'final_estimator__solver': ['newton-cg', 'lbfgs', 'liblinear', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
    'final_estimator__penalty': ['none', 'l1', 'l2'],
    'final_estimator__C': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4],
    'final_estimator__fit_intercept': [True, False],
}

boost_params = {
    "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth": [3, 4, 5, 6, 8, 10],
    "min_child_weight": [1, 3, 5, 7, 10],
    "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.7],
    'subsample': [0.6, 0.8, 1.0],
}

## Consult Scores 

In [9]:
# Load the training set and split it into features and target.
df_train = pd.read_csv(PATH_TRAIN, index_col=0)

y = df_train.Target.values
X = df_train.drop(columns='Target').values

# Binarized copy of the target, used by the tree-based grids below.
lb = LabelBinarizer()
y_binarize = lb.fit_transform(y)

# Materialise the stratified folds once. `.split()` returns a one-shot
# generator, so a grid holding it could only be fitted a single time
# (assuming get_grid forwards `cv` to a scikit-learn search object).
# A list can be reused, and all grids share identical folds.
cv_splits = list(
    StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE).split(X, y)
)

# `tree_params` is written for a nested `final_estimator`. A bare
# DecisionTreeClassifier has no such parameter, so strip the prefix for it.
# NOTE(review): assumes get_grid passes `params` straight to the search — confirm.
_NESTED_PREFIX = 'final_estimator__'
plain_tree_params = {
    (name[len(_NESTED_PREFIX):] if name.startswith(_NESTED_PREFIX) else name): candidates
    for name, candidates in tree_params.items()
}

models = {
    'DecisionTree': get_grid(
        # Seeded with the shared RANDOM_STATE, like the other estimators.
        model=DecisionTreeClassifier(random_state=RANDOM_STATE),
        cv=cv_splits,
        params=plain_tree_params,
    ),
    'RandomForestClassifier': get_grid(
        model=RandomForestClassifier(
            n_estimators=500,
            criterion="gini",
            max_depth=10,
            # "sqrt" is what the legacy "auto" alias meant for classifiers.
            max_features="sqrt",
            min_samples_leaf=0.005,
            min_samples_split=0.005,
            n_jobs=-1,
            random_state=RANDOM_STATE,
        ),
        cv=cv_splits,
        params=random_forest_params,
    ),
    'Stacked': get_grid(
        model=get_stacked_model(),
        cv=cv_splits,
        params=tree_params,
    ),
}


# Choose a model to tune by uncommenting the respective code line!
# NOTE(review): 'Stacked' is fitted on the raw labels `y`, while the prediction
# cell applies `lb.inverse_transform` — confirm that path before using it.

# # DecisionTreeClassifier:
# grid_model = models['DecisionTree'].fit(X, y_binarize)

# # RandomForestClassifier:
# grid_model = models['RandomForestClassifier'].fit(X, y_binarize)

# # Stacked:
# grid_model = models['Stacked'].fit(X, y)

In [None]:
# NOTE(review): grid_tree / grid_rfc / grid_stacked are not assigned in any
# visible cell; the tuning cell stores its fitted search in `grid_model`.
# print('DecisionTreeClassifier accuracy: ', grid_tree.best_score_)
# print('RandomForestClassifier: ', grid_rfc.best_score_)
# print('Stacked accuracy: ', grid_stacked.best_score_)

In [None]:
# Show the winning estimator and its parameters for the search fitted in the
# tuning cell (which assigns `grid_model`; `grid_stacked` was never defined).
grid_model.best_estimator_, grid_model.best_params_

## Get Predictions

In [None]:
# Load the cleaned training frame (for its column layout) and the raw test set.
path_train = 'cleaned-data/nor-dum-zscore-1.csv'
df_train = pd.read_csv(path_train, index_col=0)

X_test = pd.read_csv(PATH_TEST, index_col=0)

y_train = df_train.Target
X_train = df_train.drop(columns='Target')

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20)

# Separating numeric and categorical features:
X_test_numer = X_test.select_dtypes(include=[np.number])
X_test_categ = X_test.select_dtypes(exclude=[np.number])

# Normalizing:
# X_test_numer = normalize_df(X_test_numer)

# Dummies:
X_test_categ = pd.get_dummies(X_test_categ)

# Concatenation:
X_test = pd.concat([X_test_categ, X_test_numer], axis=1)

# Adjusting X_test: align its columns with the training layout.
# The previous hard-coded insert of 'PanelG_D' handled exactly one missing
# dummy and raised if the column already existed. Reindexing adds every
# train-only column as zeros, in the training column order, and drops
# columns the training frame does not have.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Predict on the prepared test set with the search object fitted earlier,
# then map the predictions back to the original labels through `lb`.
raw_predictions = get_grid_predictions(grid_model, X_test)
y_pred_grid = lb.inverse_transform(raw_predictions)

In [None]:
# Wrap the predictions in a single-column DataFrame that reuses the index of
# the raw test file. `path_test` was never defined; the imported constant
# PATH_TEST is the test-set location used elsewhere in this notebook.
raw_X_test = pd.read_csv(PATH_TEST, index_col=0)
grid_solution = pd.DataFrame(
    data=y_pred_grid,
    index=raw_X_test.index,
    columns=['Target'],
)

In [None]:
# Write the predictions to a CSV file under outputs/.
# NOTE(review): `file_name` is not assigned in any visible cell — set it first.
output_csv = f'outputs/{file_name}.csv'
grid_solution.to_csv(output_csv)

In [None]:
# Persist the fitted search object so it can be reloaded later.
# `grid_tree` was never defined; the tuning cell assigns `grid_model`.
# The context manager closes the file handle, which a bare open() leaked.
# NOTE(review): `model_name` is not assigned in any visible cell — set it first.
with open(model_name, 'wb') as model_file:
    pickle.dump(grid_model, model_file)