In [6]:
from pycaret.classification import *
import pandas as pd
import numpy as np  
from scipy import stats
import seaborn as sns
import re

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, FunctionTransformer, LabelEncoder, OneHotEncoder
from sklearn.metrics import matthews_corrcoef
from sklearn.ensemble import IsolationForest

In [4]:
# Install PyCaret with conda; `--yes` answers the confirmation prompt automatically.
# NOTE(review): this cell sits below the import cell, so a fresh "Run All" reaches
# `from pycaret.classification import *` before this install runs — consider moving it up.
!conda install pycaret --yes

Channels:
 - rapidsai
 - nvidia
 - conda-forge
 - defaults
 - pytorch
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done

# All requested packages already installed.



In [7]:
# Read the raw train and test splits from the Kaggle input directory.
df_train = pd.read_csv(
    filepath_or_buffer='/kaggle/input/playground-series-s4e8/train.csv',
)
df_test = pd.read_csv(
    filepath_or_buffer='/kaggle/input/playground-series-s4e8/test.csv',
)

In [8]:
# Cleaned working frames. `drop` returns a new DataFrame, so the raw `df_train`
# stays untouched; only the training frame loses its 'id' column here.
df_train_cleaned = df_train.drop(columns=['id'])
df_test_cleaned = df_test.copy()

# Name of the label column
target_column = 'class'

# The single ordinal feature, and the category order handed to the ordinal encoder
ordinal_columns = np.array(['gill-spacing'])
gill_spacing_order = [['f', 'Unknown', 'c', 'd']]

# Nominal categorical features: every object-dtype column except the target
# and the ordinal 'gill-spacing' column
categorical_columns = (
    df_train_cleaned
    .select_dtypes(include=['object'])
    .columns
    .drop([target_column, 'gill-spacing'])
)

# Numerical features: every non-object column (the target is removed as well
# if it happens to be numerical)
numerical_columns = (
    df_train_cleaned
    .select_dtypes(exclude=['object'])
    .columns
    .drop(target_column, errors='ignore')
)

In [9]:
# Define a function to identify and replace infrequent categories
def replace_infrequent_categories(df, column, threshold=70):
    """Replace rarely occurring values of one column with the label "Unknown".

    A value counts as infrequent when it occurs at most ``threshold`` times in
    ``df[column]``. Missing values are not counted by ``value_counts`` and are
    left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. The column is overwritten on this object (in place).
    column : str
        Name of the column to clean; intended for object (string) columns.
    threshold : int, default 70
        Largest number of occurrences that is still considered infrequent.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    value_counts = df[column].value_counts()
    infrequent = value_counts[value_counts <= threshold].index
    # Vectorised replacement: avoids calling a Python lambda once per row,
    # which is very slow on frames with millions of rows.
    df[column] = df[column].mask(df[column].isin(infrequent), "Unknown")
    return df

# Collapse infrequent categories to "Unknown" in every categorical column and in the
# ordinal column as well.
# The set of categories to keep is learned from the TRAINING frame only and then
# applied to the test frame, so both frames end up with the same vocabulary. Counting
# frequencies separately on the test frame could keep or collapse different
# categories than in training, giving the encoders inconsistent inputs.
for col in [*categorical_columns, ordinal_columns[0]]:
    df_train_cleaned = replace_infrequent_categories(df_train_cleaned, col)

    # After the call above the training column holds only the retained categories
    # (plus "Unknown" if anything was collapsed); missing values are still NaN.
    kept_categories = df_train_cleaned[col].dropna().unique()

    # Test values outside the retained set become "Unknown"; missing values stay
    # missing so that the later imputation step treats both frames alike.
    test_values = df_test_cleaned[col]
    df_test_cleaned[col] = test_values.where(
        test_values.isna() | test_values.isin(kept_categories), "Unknown"
    )

In [10]:
# Skewness (overall asymmetry of the distribution) of each numerical column,
# computed on the non-missing values only.
def _skew_ignoring_missing(values):
    """Return the skewness of one column after dropping its missing values."""
    return stats.skew(values.dropna())

df_train_cleaned[numerical_columns].apply(_skew_ignoring_missing)

cap-diameter    3.972607
stem-height     1.926681
stem-width      1.235426
dtype: float64

In [11]:
# Median of every numerical feature, taken from the training frame only
medians = df_train_cleaned[numerical_columns].median()

# Fill missing numerical values in both frames with those training medians
for split in (df_train_cleaned, df_test_cleaned):
    split[numerical_columns] = split[numerical_columns].fillna(medians)

In [12]:
# Every value that is still missing in either frame becomes the label 'Unknown'
df_train_cleaned = df_train_cleaned.fillna(value="Unknown")
df_test_cleaned = df_test_cleaned.fillna(value="Unknown")

In [13]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the target variable.
# LabelEncoder expects a 1-D array of labels, so pass the column as a Series rather
# than as a single-column DataFrame (which triggers a DataConversionWarning).
train_encoded_target = label_encoder.fit_transform(df_train_cleaned['class'])

# Convert categorical columns to 'category' dtype
df_train_cleaned[categorical_columns] = df_train_cleaned[categorical_columns].astype('category')
df_test_cleaned[categorical_columns] = df_test_cleaned[categorical_columns].astype('category')

# Convert ordinal columns to 'category' dtype
df_train_cleaned[ordinal_columns] = df_train_cleaned[ordinal_columns].astype('category')
df_test_cleaned[ordinal_columns] = df_test_cleaned[ordinal_columns].astype('category')

# Define the numerical pipeline: standardize, then cast the result to float32
numerical_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('convert_to_float32', FunctionTransformer(lambda x: x.astype(np.float32)))
])

# Define the ordinal pipeline: encode 'gill-spacing' using the explicit category
# order in `gill_spacing_order`; values outside that list are encoded as -1
ordinal_pipeline = Pipeline(steps=[
    ('ordinal', OrdinalEncoder(dtype=np.int32, handle_unknown='use_encoded_value', unknown_value=-1, categories=gill_spacing_order))
])

# Define the categorical pipeline: one-hot encode, dropping the first category of
# each feature
categorical_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='first', dtype=np.int32, handle_unknown='ignore'))
])

# Combine the numerical, ordinal and categorical pipelines.
# Columns not listed here (e.g. 'class', and 'id' in the test frame) are not part of
# the output, because ColumnTransformer drops unlisted columns by default.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_columns),
        ('ord', ordinal_pipeline, ordinal_columns),
        ('cat', categorical_pipeline, categorical_columns)
    ]
)


# Apply the transformations: fit on the training frame only, then reuse the fitted
# preprocessor on the test frame
df_train_encoded = preprocessor.fit_transform(df_train_cleaned)
df_test_encoded = preprocessor.transform(df_test_cleaned)


def _to_dense(matrix):
    """Return `matrix` as a dense array, whether it is sparse or already dense."""
    # ColumnTransformer only returns a sparse matrix when the stacked output is
    # sparse enough (its `sparse_threshold`); otherwise it returns a dense ndarray,
    # which has no `.toarray()` method.
    return matrix.toarray() if hasattr(matrix, "toarray") else np.asarray(matrix)


# Ensure outputs are dense arrays
train_encoded_dense = _to_dense(df_train_encoded)
test_encoded_dense = _to_dense(df_test_encoded)

# Get feature names, in the same order as the transformers listed above
numerical_feature_names = numerical_columns  # Assuming numerical columns do not change names
ordinal_feature_names = ordinal_columns
categorical_feature_names = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_columns)

# Combine the feature names
all_feature_names = np.concatenate([numerical_feature_names, ordinal_feature_names, categorical_feature_names])

# Debugging: Print the number of feature names
print("Number of Features:", len(all_feature_names))

# Convert the transformed dense arrays back into DataFrames
df_train_preprocessed = pd.DataFrame(train_encoded_dense, columns=all_feature_names)
df_test_preprocessed = pd.DataFrame(test_encoded_dense, columns=all_feature_names)

Number of Features: 120


In [14]:
# `train_data` is the same object as `df_train_preprocessed` (no copy is made), so
# adding the encoded target below also adds a 'class' column to that frame.
train_data = df_train_preprocessed
train_data['class'] = train_encoded_target
test_data = df_test_preprocessed

print('Shape of train data is : ' , train_data.shape)

# Extract test_ids for later use
test_ids = df_test_cleaned['id']

numerical_features = train_data.select_dtypes(include=[np.number])
categorial_features = train_data.select_dtypes(include=object)

# Summary statistics of the numerical columns. Kept as the LAST expression of the
# cell so the notebook actually displays it; in the middle of the cell the result
# was computed and then silently discarded.
train_data.describe(include=[np.number]).transpose()

Shape of train data is :  (3116945, 121)


In [16]:
# Initialise the PyCaret classification experiment on the preprocessed training data.
# `session_id` fixes the random seed of the experiment so that results can be
# reproduced after a kernel restart.
# NOTE(review): with `use_gpu=True` the recorded output of this cell shows LightGBM
# reporting that its GPU tree learner is not enabled in this build — presumably
# PyCaret falls back to CPU for that model; confirm.
clf1 = setup(
    data=train_data,
    target='class',
    train_size=0.8,
    normalize=True,
    normalize_method='minmax',
    remove_multicollinearity=False,
    remove_outliers=True,
    fold=5,
    verbose=False,
    use_gpu=True,
    session_id=42,
)

[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1


In [None]:
# Train and cross-validate the available models; keep the one PyCaret ranks best
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)


Processing:   0%|          | 0/69 [00:00<?, ?it/s]

## Hyperparameter tuning

In [None]:
# Hyperparameter search starting from the best model found above
tuned_model = tune_model(estimator=best_model)

## Plotting Model's performance

In [None]:
# Confusion matrix of the tuned model
plot_model(tuned_model, plot='confusion_matrix')

In [None]:
# Learning curve of the tuned model
plot_model(tuned_model, plot='learning')

In [None]:
# ROC curve of the tuned model
plot_model(tuned_model, plot='auc')

## Decision Boundary

In [None]:
# Decision boundary of the tuned model, drawn on the training data
plot_model(tuned_model, plot='boundary', use_train_data=True)

## Feature Importance

In [None]:
# Feature importance of the tuned model
plot_model(tuned_model, plot='feature')

## Blending Multiple Models

In [None]:
# Train the three base models that will be blended
lightgbm = create_model('lightgbm')
catboost = create_model('catboost')
gbc = create_model('gbc')

# Blend the three models created above into one ensemble
blend = blend_models(
    estimator_list=[lightgbm, gbc, catboost],
)

## Plotting confusion matrix for Blended Model

In [None]:
# Confusion matrix of the blended ensemble
plot_model(blend, plot='confusion_matrix')

**Observation :**
By blending the top 3 models, the precision has improved; however, we observe a deterioration in the recall.

# Summary
* PyCaret provides several **data preprocessing and preparation** exercises like imputing missing values, handling outliers, standardizing/normalizing variables, one-hot/ordinal/cardinal encoding, target imbalance and feature selection.
* It provides a robust framework not only to **build multiple ML models** but also evaluate them using a wide set of metrics.
* It offers visuals/plots to **evaluate the model performance** as well as interpret the models.
* Additionally, we can also perform **hyperparameter tuning** and **blend multiple models** to attain highly accurate models.

# License
References :
* https://pycaret.org
* https://pycaret.readthedocs.io/en/stable/api/classification.html

Copyright (c) 2022 chakrabortyarnab

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.