In [None]:
# Data handling and numerics
import pandas as pd
import numpy as np
# Plotting
from matplotlib import pyplot as plt
from matplotlib_inline import backend_inline
# Ask the inline backend to render figures as SVG
backend_inline.set_matplotlib_formats('svg')
import seaborn as sns
# scikit-learn: splitting, scaling, models, class weighting and evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.utils import compute_class_weight
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
# scikit-optimize: Bayesian hyperparameter search
from skopt import BayesSearchCV
import warnings
# NOTE(review): this silences every warning for the whole session, including
# deprecation and convergence warnings -- consider narrowing the filter.
warnings.filterwarnings('ignore')

## Read the data

In [None]:
# The CSV is read without a header row, so the column labels are supplied here.
columns = ['Id', 'RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type']
glass_type = (
    pd.read_csv(
        "https://raw.githubusercontent.com/FlipRoboTechnologies/ML-Datasets/main/Glass%20Identification/Glass%20Identification.csv",
        header=None,
    )
    .set_axis(columns, axis=1)  # label the positional columns
)
glass_type

In [None]:
# Collapse the multi-class `Type` into a binary target:
# Type < 5 -> 0, everything else -> 1.
is_low_type = glass_type['Type'].lt(5)
glass_type['Type_bin'] = np.where(is_low_type, 0, 1)
glass_type

## EDA

In [None]:
# Print the frame summary (column dtypes, non-null counts, memory usage).
glass_type.info()

In [None]:
# Remove the row identifier and the original multi-class label; only the
# chemical features and the binary target `Type_bin` are kept.
# Rebinding (instead of inplace=True) together with errors='ignore' makes this
# cell safe to re-run: a second execution no longer raises KeyError because
# the columns are already gone.
glass_type = glass_type.drop(columns=['Id', 'Type'], errors='ignore')
glass_type

In [None]:
# EDA copy of the data: the numeric target `Type_bin` is replaced by a
# human-readable label column `Type_text` (0 -> 'Window Glass',
# anything else -> 'Non-Window Glass'), which ends up as the last column.
glass_type_eda = (
    glass_type
    .assign(
        Type_text=lambda frame: np.where(
            frame['Type_bin'] == 0, 'Window Glass', 'Non-Window Glass'
        )
    )
    .drop(columns=['Type_bin'])
)
glass_type_eda

In [None]:
# Share of each class label (normalize=True returns proportions, not counts).
glass_type_eda.Type_text.value_counts(normalize=True)

### Univariate Analysis

In [None]:
# Summary statistics for the EDA frame's columns.
glass_type_eda.describe()

In [None]:
# One 50-bin histogram per column, skipping the last column (Type_text).
for col in glass_type_eda.columns[:-1]:
    fig, ax = plt.subplots(figsize=(12, 6))
    glass_type_eda[col].plot.hist(bins=50, ax=ax)
    ax.set_xlabel(col)
    plt.show()

### Bivariate Analysis

In [None]:
# Pairwise correlations between all columns except the text label,
# drawn on a fixed [-1, 1] colour scale with the coefficients annotated.
feature_corr = glass_type_eda.drop(columns=['Type_text']).corr()

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(
    data=feature_corr,
    vmin=-1,
    vmax=1,
    cmap='coolwarm',
    annot=True,
    ax=ax,
)
plt.show()

In [None]:
# Per-class density of every column except the last (Type_text), one figure each.
for col in glass_type_eda.columns[:-1]:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.kdeplot(data=glass_type_eda, x=col, hue='Type_text', ax=ax)
    ax.set_xlabel(col)
    plt.show()

In [None]:
# Bar chart of the per-class mean of each column (all but the last, Type_text);
# the red horizontal line marks the overall mean of that column.
for col in glass_type_eda.columns[:-1]:
    fig, ax = plt.subplots(figsize=(12, 6))
    class_means = glass_type_eda.groupby('Type_text')[col].mean()
    class_means.plot.bar(ax=ax)
    overall_mean = glass_type_eda[col].mean()
    ax.axhline(y=overall_mean, color='red')
    ax.set_xlabel(col)
    plt.show()

### Data Preparation

In [None]:
# Separate the binary target from the feature columns.
y = glass_type['Type_bin']
X = glass_type.drop(columns=['Type_bin'])

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=42,
)

# Fit the min-max scaler on the training split only, then apply the same
# fitted transform to the test split.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Model Building

In [None]:
# Search over RandomForest seeds for the one whose 10-fold CV ROC-AUC on the
# training split is closest to the ROC-AUC on the held-out test split.
#
# NOTE(review): choosing the seed with the help of test-set scores leaks test
# information into model selection; the reported test score is optimistic.
# NOTE(review): the model is built with class_weight='balanced' AND fitted with
# balanced sample_weight, so the final fit is weighted twice while the CV
# estimate (which receives no sample_weight) is weighted once -- confirm intent.

# Seed-independent quantities are computed once, outside the loop.
# The class weights come from the training labels only, so no test-set
# information enters the weighting.
cw = compute_class_weight(class_weight='balanced',
                          classes=np.unique(y_train),
                          y=y_train)
sample_weight = np.where(y_train == 0, cw[0], cw[-1])
cv = 10

# The results container must be created before the loop: re-creating it inside
# the loop body would discard every seed but the last one.
score_dict = {}
for rs in range(1, 1001):
    model = RandomForestClassifier(random_state=rs,
                                   class_weight='balanced')
    # Mean ROC-AUC across the CV folds of the training split.
    cv_scores = cross_val_score(estimator=model,
                                X=X_train_scaled,
                                y=y_train,
                                cv=cv,
                                scoring='roc_auc',
                                n_jobs=-1).mean()
    model.fit(X=X_train_scaled,
              y=y_train,
              sample_weight=sample_weight)
    # Probability of the positive class (label 1) on the test split.
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
    test_score = roc_auc_score(y_true=y_test,
                               y_score=y_pred_proba)
    score_diff = abs(test_score - cv_scores)
    score_dict[rs] = [cv_scores, test_score, score_diff]

# Seed with the smallest |test - CV| gap.
best_rs = min(score_dict, key=lambda seed: score_dict[seed][-1])

print(f'Best Random State: {best_rs}')
print(f'Best CV Score: {score_dict[best_rs][0]}')
print(f'Best Test Score: {score_dict[best_rs][1]}')
print(f'Least Score difference: {score_dict[best_rs][-1]}')

### Hyperparameter Tuning

In [None]:
# Bayesian hyperparameter search for the RandomForest using the selected seed.
model = RandomForestClassifier(random_state=best_rs,
                               class_weight='balanced')

# Candidate values for each hyperparameter.
n_estimators = np.linspace(start=100, stop=2000, num=10).astype(np.int64)
max_features = ['log2', 'sqrt']
max_depth = np.linspace(start=10, stop=100, num=10).astype(np.int64).tolist()
max_depth.append(None)  # None = no depth limit
min_samples_leaf = np.linspace(start=2, stop=10, num=5).astype(np.int64)
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Build the per-sample weights in this cell instead of relying on a variable
# that happens to be left over from an earlier loop. They are derived from the
# training labels only.
# NOTE(review): combined with class_weight='balanced' on the estimator this
# weights the classes twice -- confirm that is intended.
cw = compute_class_weight(class_weight='balanced',
                          classes=np.unique(y_train),
                          y=y_train)
sample_weight = np.where(y_train == 0, cw[0], cw[-1])

# random_state fixes the optimizer's sampling so the search is reproducible
# (42 matches the seed used for the train/test split).
optimizer = BayesSearchCV(estimator=model,
                          search_spaces=random_grid,
                          n_iter=500,
                          cv=10,
                          scoring='roc_auc',
                          n_jobs=-1,
                          random_state=42)
optimizer.fit(X=X_train_scaled,
              y=y_train,
              sample_weight=sample_weight)