In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.graph_objects as go
import plotly.express as px

# Apply the style sheet first: "fivethirtyeight" defines its own font.size, so
# calling it after the rcParams assignments would silently discard the font
# override below.
plt.style.use("fivethirtyeight")
plt.rcParams["figure.figsize"] = (6, 6)  # Change matplotlib Box Size
plt.rcParams["font.size"] = 12  # Change matplotlib Font Size

In [None]:
import sklearn
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

In [None]:
!pip install flaml

In [None]:
!pip install -U ipywidgets

In [None]:
from flaml import AutoML

In [None]:
# Load the three competition files: training features, training targets, test features.
df_feat_train = pd.read_csv('/kaggle/input/data-challenge-supervised-ml202208/features_train.csv')
df_targ_train = pd.read_csv('/kaggle/input/data-challenge-supervised-ml202208/target_train.csv')
df_feat_test = pd.read_csv('/kaggle/input/data-challenge-supervised-ml202208/features_test.csv')

In [None]:
# Keep the untouched test frame (including 'Id') for building submission files later.
X_test_1 = df_feat_test

# The 'Id' column is dropped from both feature matrices before modelling.
X_train = df_feat_train.drop(columns=['Id'])
X_test = X_test_1.drop(columns=['Id'])

# Training labels come from the 'Expected' column of the target file.
y_train = df_targ_train['Expected']

In [None]:
# Shapes of the test features, training features and training labels, in that order.
tuple(part.shape for part in (X_test, X_train, y_train))

In [None]:
# Summary statistics for every training feature column.
X_train.describe()

In [98]:
# Index range, column count, dtypes and memory usage of the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 120 entries, 0 to 119
dtypes: float64(120)
memory usage: 9.2 MB


In [None]:
# Distinct per-column missing-value counts; a lone 0 means no column has missing values.
X_train.isna().sum().unique()

In [None]:
# Show any integer-typed columns (an empty frame means there are none).
X_train.select_dtypes(include="int64")

In [None]:
# Clustered heatmap of the pairwise feature correlations.
feature_corr = X_train.corr()
sns_plot = sns.clustermap(feature_corr, cmap="rocket_r")

In [None]:
# Rank feature pairs by correlation. Only the strict upper triangle is kept, so
# each pair appears exactly once and the trivial self-correlations (always 1.0)
# are excluded. De-duplicating on the coefficient value instead would keep a
# self-correlation row and drop distinct pairs that share the same coefficient.
pairwise_corr = X_train.corr()
upper_triangle = np.triu(np.ones(pairwise_corr.shape, dtype=bool), k=1)
pairwise_corr.where(upper_triangle).unstack().dropna().sort_values(ascending=False)

In [None]:
#!pip install catboost

In [None]:
from catboost import CatBoostClassifier, Pool

# Base CatBoost classifier; it is used next to select a feature subset.
# NOTE(review): early_stopping_rounds presumably only takes effect when an
# eval_set is supplied at fit time — confirm against the CatBoost docs.
model_Cat = CatBoostClassifier(
    n_estimators=200,
    learning_rate=0.06692273485930686,
    early_stopping_rounds=10,
    random_seed=10242048,  # fixed seed so reruns use the same randomisation
    thread_count=-1,
    verbose=False,
)

In [None]:
# Let CatBoost pick 16 columns out of all available ones. The candidate list is
# derived from the frame itself instead of a hard-coded column count, so the
# cell keeps working if the number of input features changes.
# NOTE(review): selection is done on the full training set, so the CV scores
# computed later on X_train_red are likely optimistic — confirm this is acceptable.
feats = model_Cat.select_features(
    X_train, y_train,
    features_for_select=list(range(X_train.shape[1])),
    num_features_to_select=16,
)

In [None]:
# Reduce the training frame to the selected column positions and show its shape.
selected_idx = feats['selected_features']
X_train_red = X_train.iloc[:, selected_idx]
X_train_red.shape

In [None]:
# Fresh FLAML AutoML instance for the first (non-ensemble) search.
automl = AutoML()

In [None]:
# Search settings shared by both FLAML runs in this notebook.
automl_settings = dict(
    time_budget=1200,  # 20 mins to try and select the best model
    metric='macro_f1',
    task='classification',
    log_file_name='mylog.log',
    eval_method='cv',
    n_splits=5,
)

In [None]:
# Run the AutoML search on the reduced features; labels are passed as a NumPy array.
automl.fit(
    X_train=X_train_red,
    y_train=y_train.values,
    **automl_settings,
)

In [None]:
# Display the best_estimator attribute of the finished search
# (presumably the name of the winning learner — verify against FLAML docs).
automl.best_estimator

In [None]:
# Display the best_config attribute of the finished search
# (presumably the winning hyperparameter configuration — verify against FLAML docs).
automl.best_config

In [None]:
# Parameters reported by the model object held by the AutoML instance.
automl.model.get_params()

In [None]:
# In-sample check: these metrics are computed on the same rows the model was
# trained on, so they overstate how well it generalises.
predictions = automl.predict(X_train_red)
cf = confusion_matrix(y_train, predictions)
print(classification_report(y_train, predictions))
# fmt="d" prints the raw integer counts; the default annotation format
# abbreviates large counts to scientific notation (e.g. 4.5e+03).
sns.heatmap(cf, annot=True, fmt="d");

# Use FLAML ensemble approach

In [None]:
# Second FLAML search: same settings as before, but with ensemble=True.
automl_ens = AutoML()
automl_ens.fit(
    X_train=X_train_red,
    y_train=y_train.values,
    ensemble=True,
    **automl_settings,
)

In [None]:
# Display the best_estimator attribute of the ensemble search
# (presumably the name of the best single learner — verify against FLAML docs).
automl_ens.best_estimator

In [None]:
# Display the model object produced by the ensemble search.
automl_ens.model

In [None]:
# In-sample check of the ensemble: metrics are computed on the training rows,
# so they overstate how well it generalises.
# Predict through the AutoML wrapper (as done for the single-model run) rather
# than the raw .model, so any input/label handling done by FLAML is applied.
predictions = automl_ens.predict(X_train_red)
cf = confusion_matrix(y_train, predictions)
print(classification_report(y_train, predictions))
# fmt="d" prints the raw integer counts; the default annotation format
# abbreviates large counts to scientific notation (e.g. 4.5e+03).
sns.heatmap(cf, annot=True, fmt="d");

## Continue with CatBoost algorithm

In [99]:
# CatBoost classifier for the reduced feature set: up to 2500 trees with a
# small learning rate. This rebinds the name model_Cat.
# NOTE(review): early_stopping_rounds presumably only takes effect when an
# eval_set is supplied at fit time — confirm against the CatBoost docs.
model_Cat = CatBoostClassifier(
    n_estimators=2500,
    learning_rate=0.04171721859304757,
    early_stopping_rounds=13,
    random_seed=10242048,  # fixed seed so reruns use the same randomisation
    thread_count=-1,
    verbose=False,
)

In [100]:
# Fit on the full reduced training set; this fitted model is the one used for
# the CatBoost test predictions further down.
model_Cat.fit(X_train_red, y_train)

<catboost.core.CatBoostClassifier at 0x7b114e0323e0>

In [101]:
# 5-fold cross-validated macro-F1 of the CatBoost configuration: per-fold scores, then their mean.
cv_score = cross_val_score(
    model_Cat, X_train_red, y_train, scoring='f1_macro', cv=5
)
print(cv_score)
print(cv_score.mean())

[0.6900797  0.68053531 0.68903492 0.71367006 0.70220594]
0.6951051868677635


In [None]:
# Feature importances of the fitted model, computed against the training data.
train_pool = Pool(X_train_red, label=y_train)
list_feat_imp = model_Cat.get_feature_importance(data=train_pool)

In [None]:
# Histogram of the feature-importance values.
fig, ax = plt.subplots()
ax.hist(list_feat_imp);

In [None]:
# Importance values greater than 1.
list_feat_imp[list_feat_imp>1]

## Finetune CatBoost using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

# Bare CatBoost estimator; the grid below supplies the tuned hyperparameters.
catboost_tune = CatBoostClassifier(
    thread_count=-1,
    verbose=False,
    random_seed=10242048,
)

# Only early_stopping_rounds is actually varied; every other entry has a single value.
grid_search = {
    'early_stopping_rounds': [10, 13],
    'min_data_in_leaf': [500],
    'learning_rate': [0.175],
    'n_estimators': [150],
    'l2_leaf_reg': [4],
}

# 3-fold grid search scored on macro-F1.
# NOTE(review): n_jobs=-1 here combined with thread_count=-1 on the estimator
# may oversubscribe the CPU — confirm this is intended.
grid_search_obj = GridSearchCV(
    catboost_tune,
    grid_search,
    scoring='f1_macro',
    cv=3,
    n_jobs=-1,
    verbose=5,
)



In [None]:
# Run the grid search on the reduced training features.
grid_search_obj.fit(X_train_red,y_train)

In [None]:
# In-sample evaluation of the best grid-search estimator (scored on the rows it was fit on).
best_grid_model = grid_search_obj.best_estimator_
predictioncat = best_grid_model.predict(X_train_red)
print(confusion_matrix(y_train, predictioncat))
print(classification_report(y_train, predictioncat))

In [None]:
# Show full cell contents so the hyperparameter dicts are not truncated.
pd.set_option('display.max_colwidth', None)

# Summarise the grid search: rank, parameters, mean/std score and per-fold scores.
cv_results = grid_search_obj.cv_results_
cv_result_columns = {
    'Model Rank': cv_results['rank_test_score'],
    'Model Hyperparams': cv_results['params'],
    'Avg CV F1-macro': cv_results['mean_test_score'],
    'Std Dev CV F1-macro': cv_results['std_test_score'],
}
# One column per CV fold, driven by the number of splits actually used, so the
# table stays correct if the `cv` setting of the grid search changes.
for fold in range(grid_search_obj.n_splits_):
    cv_result_columns[f'CV Fold {fold + 1} F1-macro'] = cv_results[f'split{fold}_test_score']

cv_result_df = pd.DataFrame(cv_result_columns)

cv_result_df.sort_values(by=['Model Rank'], ascending=True)

## Use Hyperopt for Bayesian hyperparameter tuning

In [None]:
#!pip install hyperopt

In [None]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

In [None]:
# Discrete hyperopt search space: each parameter is chosen from a short list of candidates.
space = dict(
    learning_rate=hp.choice('learning_rate', [0.04, 0.06]),
    n_estimators=hp.choice('n_estimators', [600, 860, 1000]),
    l2_leaf_reg=hp.choice('l2_leaf_reg', [30, 100]),
)

def objective(space):
    """Score one sampled configuration for hyperopt.

    Builds a CatBoostClassifier from the sampled ``learning_rate``,
    ``n_estimators`` and ``l2_leaf_reg`` (all other settings are fixed) and
    evaluates it with 5-fold cross-validated macro-F1 on ``X_train_red``.

    Returns a dict with the negated mean macro-F1 as ``loss`` and
    ``STATUS_OK`` as ``status``.
    """
    sampled = {
        name: space[name]
        for name in ('learning_rate', 'n_estimators', 'l2_leaf_reg')
    }
    cat_model_opt = CatBoostClassifier(
        min_data_in_leaf=300,
        early_stopping_rounds=13,
        random_seed=10242048,
        verbose=False,
        thread_count=-1,
        **sampled,
    )

    fold_scores = cross_val_score(cat_model_opt, X_train_red, y_train, cv=5, scoring='f1_macro')

    # fmin minimises the loss, so the macro-F1 to be maximised is returned negated.
    return {'loss': -fold_scores.mean(), 'status': STATUS_OK}

# Run 10 evaluations with the Tree-structured Parzen Estimator and display the
# best point found.
# NOTE(review): no rstate is passed to fmin, so the search is presumably not
# reproducible between runs — confirm whether a fixed seed is wanted.
trials = Trials()
best = fmin(
    objective,
    space,
    algo=tpe.suggest,  # Tree parzen estimator
    max_evals=10,
    trials=trials,
)
best

In [None]:
# fmin reports hp.choice parameters as indices into their option lists. Map the
# indices back to real values with space_eval on the same `space` object that
# was searched; hand-maintained index->value tables drift out of sync with the
# search space and then silently select values that were never evaluated.
from hyperopt import space_eval

best_params = space_eval(space, best)

# Refit a CatBoost model with the tuned values on the full reduced training set.
# NOTE(review): min_data_in_leaf and early_stopping_rounds differ from the
# values used inside objective() (300 and 13) — confirm this is intentional.
optimal_cat = CatBoostClassifier(
    learning_rate=best_params['learning_rate'],
    l2_leaf_reg=best_params['l2_leaf_reg'],
    n_estimators=best_params['n_estimators'],
    min_data_in_leaf=500,
    early_stopping_rounds=5,
    random_seed=10242048,
    verbose=False,
    thread_count=-1,
).fit(X_train_red, y_train)

In [None]:
# In-sample evaluation of the tuned CatBoost model (scored on the rows it was fit on).
prediction_best_cat = optimal_cat.predict(X_train_red)
for report in (
    confusion_matrix(y_train, prediction_best_cat),
    classification_report(y_train, prediction_best_cat),
):
    print(report)

In [None]:
# Apply the same column selection to the test features.
X_test_red = X_test.iloc[:, feats['selected_features']]
# Show the shape of the frame just built (not the training frame).
X_test_red.shape

In [None]:
# Test-set predictions from the first (non-ensemble) FLAML search.
y_pred_automl = automl.predict(X_test_red)

In [None]:
# Submission frame: indexed by the test 'Id', with a single 'Predicted' column.
df_pred_automl = pd.DataFrame(
    {'Predicted': y_pred_automl.ravel()},
    index=pd.Index(X_test_1['Id'], name='Id'),
)

In [None]:
# Write the FLAML predictions to a CSV file in the working directory.
df_pred_automl.to_csv('df_pred_automl_2.csv')

In [102]:
# Test-set predictions from model_Cat.
y_pred_cat = model_Cat.predict(X_test_red)

In [103]:
# Submission frame: indexed by the test 'Id', with a single 'Predicted' column.
df_pred_cat = pd.DataFrame(
    {'Predicted': y_pred_cat.ravel()},
    index=pd.Index(X_test_1['Id'], name='Id'),
)

In [104]:
# Write the CatBoost predictions to a CSV file in the working directory.
df_pred_cat.to_csv('df_pred_cat4.csv')

In [None]:
# Test-set predictions from the hyperopt-tuned CatBoost model.
red_prediction_2 = optimal_cat.predict(X_test_red)

In [None]:
# Element-wise agreement between the two CatBoost models' test predictions.
# The baseline is y_pred_cat (from model_Cat); the name `red_prediction` is not
# defined anywhere in this notebook and raises NameError on a fresh run.
# NOTE(review): confirm y_pred_cat is the intended baseline for this comparison.
comp = pd.DataFrame(y_pred_cat.ravel() == red_prediction_2.ravel())

In [None]:
# Count of True entries, i.e. rows where the compared predictions agree.
comp.sum()