In [1]:
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.metrics import matthews_corrcoef, mean_absolute_error, roc_curve, auc, roc_auc_score, recall_score, accuracy_score, classification_report, confusion_matrix, log_loss, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.feature_selection import mutual_info_regression
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_breast_cancer, make_blobs
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBRegressor, XGBClassifier
import xgboost as xgb
import seaborn as sns
sns.set_theme(context='notebook')
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
# from imblearn.over_sampling import SMOTE

import itertools
import os 
# NOTE(review): hard-coded, machine-specific absolute path. Every relative file
# name used later in the notebook (the CSVs that are read and the submission
# that is written) resolves against this working directory.
os.chdir('e:/ml_data/kaggle_data/steel_plate')

In [2]:
# Read the three competition files from the current working directory:
# the submission template, the labelled training set and the unlabelled test set.
df_sub, df_train, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ('sample_submission.csv', 'train.csv', 'test.csv')
)

In [3]:
# Preview the head of the training frame; within that preview, the largest
# value of the X_Minimum column is highlighted in blue.
df_train.head().style.highlight_max(subset=['X_Minimum'], color='blue')

Unnamed: 0,id,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,0,584,590,909972,909977,16,8,5,2274,113,140,1358,0,1,50,0.7393,0.4,0.5,0.0059,1.0,1.0,0.0,1.2041,0.9031,0.699,-0.5,-0.0104,0.1417,0,0,0,1,0,0,0
1,1,808,816,728350,728372,433,20,54,44478,70,111,1687,1,0,80,0.7772,0.2878,0.2581,0.0044,0.25,1.0,1.0,2.6365,0.7782,1.7324,0.7419,-0.2997,0.9491,0,0,0,0,0,0,1
2,2,39,192,2212076,2212144,11388,705,420,1311391,29,141,1400,0,1,40,0.0557,0.5282,0.9895,0.1077,0.2363,0.3857,0.0,4.0564,2.179,2.2095,-0.0105,-0.0944,1.0,0,0,1,0,0,0,0
3,3,781,789,3353146,3353173,210,16,29,3202,114,134,1387,0,1,40,0.7202,0.3333,0.3333,0.0044,0.375,0.931,1.0,2.3222,0.7782,1.4314,0.6667,-0.0402,0.4025,0,0,1,0,0,0,0
4,4,1540,1560,618457,618502,521,72,67,48231,82,111,1692,0,1,300,0.1211,0.5347,0.0842,0.0192,0.2105,0.9861,1.0,2.7694,1.415,1.8808,0.9158,-0.2455,0.9998,0,0,0,0,0,0,1


In [10]:
# Keep the test ids, then remove the identifier column from both frames so it
# is not used as a model feature.
# The cell is guarded so that re-running it after 'id' has already been removed
# neither raises KeyError nor loses the ids captured on the first run.
if 'id' in df_test.columns:
    df_test_ids = df_test['id']
df_train = df_train.drop(columns=['id'], errors='ignore')
df_test = df_test.drop(columns=['id'], errors='ignore')

df_train.head()

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,584,590,909972,909977,16,8,5,2274,113,140,...,-0.5,-0.01,0.142,0,0,0,1,0,0,0
1,808,816,728350,728372,433,20,54,44478,70,111,...,0.742,-0.3,0.949,0,0,0,0,0,0,1
2,39,192,2212076,2212144,11388,705,420,1311391,29,141,...,-0.011,-0.094,1.0,0,0,1,0,0,0,0
3,781,789,3353146,3353173,210,16,29,3202,114,134,...,0.667,-0.04,0.403,0,0,1,0,0,0,0
4,1540,1560,618457,618502,521,72,67,48231,82,111,...,0.916,-0.245,1.0,0,0,0,0,0,0,1


In [4]:
def getFeats(df):
    """Return a feature-engineered copy of ``df``.

    The input frame is left untouched; all work happens on a copy, so calling
    this function has no side effects on the caller's data.

    Added columns
    -------------
    Y_Diff
        ``Y_Maximum - Y_Minimum``.
    Thickness_Normalized
        Min-max scaling of ``Steel_Plate_Thickness`` using the minimum and
        maximum of the frame that was passed in (so two different frames are
        scaled with their own ranges). A constant column yields NaN (0/0).
    Log_Ares_Ratio
        ``1 / (1 + exp(-LogOfAreas + 0.00001))``.

    Modified columns
    ----------------
    Pixels_Areas
        Replaced by ``log(Pixels_Areas + 0.000001)``.

    Dropped columns
    ---------------
    Square_Index, Sum_of_Luminosity, X_Minimum, X_Perimeter, SigmoidOfAreas,
    Edges_X_Index, Y_Minimum, Y_Maximum, TypeOfSteel_A400.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column referenced above.

    Returns
    -------
    pandas.DataFrame
        New frame with the engineered columns appended and the listed columns
        removed.
    """
    # Work on a copy: the original implementation wrote the new columns (and
    # the log-transformed Pixels_Areas) straight into the caller's frame.
    out = df.copy()

    thickness = out['Steel_Plate_Thickness']
    out['Y_Diff'] = (out['Y_Maximum'] - out['Y_Minimum'])
    out['Thickness_Normalized'] = (thickness - thickness.min()) / (thickness.max() - thickness.min())
    out['Log_Ares_Ratio'] = 1 / (1 + np.exp(-out['LogOfAreas'] + 0.00001))
    # Small offset keeps log() finite when an area is exactly zero.
    out['Pixels_Areas'] = np.log(out['Pixels_Areas'] + 0.000001)

    dropCols = ['Square_Index','Sum_of_Luminosity','X_Minimum',
                'X_Perimeter', 'SigmoidOfAreas','Edges_X_Index',
                'Y_Minimum', 'Y_Maximum','TypeOfSteel_A400']
    return out.drop(dropCols, axis=1)

In [5]:
# Label columns: one binary indicator per defect type.
target_cols=['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains','Dirtiness', 'Bumps','Other_Faults']

# Apply the same feature engineering to the training and the test frame.
df_train, df_test = getFeats(df_train), getFeats(df_test)

# Model inputs are every remaining training column that is not a label.
X = df_train.drop(columns=target_cols)
feats = X.columns

In [12]:
def cross_val_train(X,y,df_test,params):
    """Fit one XGBClassifier per stratified fold and collect its predictions.

    For each of the 10 folds a fresh ``XGBClassifier(**params)`` is fitted on
    the training part, with the held-out part passed as ``eval_set``. The
    positive-class probability is then used to

    * score the training and held-out parts with ROC AUC (printed per fold),
    * fill the out-of-fold prediction vector, and
    * accumulate the fold-averaged prediction for ``df_test``.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features. Rows are selected positionally.
    y : array-like
        Binary labels aligned row-for-row with ``X``. Converted to a NumPy
        array so fold indices are always applied positionally, whatever index
        a pandas Series may carry.
    df_test : pandas.DataFrame
        Frame to predict on; must contain every column of ``X``.
    params : dict
        Keyword arguments forwarded to ``XGBClassifier``.

    Returns
    -------
    val_scores : list of float
        Held-out ROC AUC of each fold, in fold order.
    val_preds : numpy.ndarray
        Out-of-fold positive-class probability for every row of ``X``.
    test_preds : numpy.ndarray
        Positive-class probability for every row of ``df_test``, averaged over
        the folds.
    """
    kf = StratifiedKFold(n_splits=10)
    n_folds = kf.get_n_splits()

    # Use the columns of X itself rather than a notebook-level global so the
    # function does not depend on hidden state.
    feature_cols = X.columns
    # The fold indices are positional; indexing a Series with `y[idx]` would be
    # label-based and silently wrong for a non-default index.
    y_arr = np.asarray(y)

    test_preds = np.zeros((len(df_test)))
    val_preds = np.zeros((len(X)))
    val_scores = []

    for fold, (train_ind, valid_ind) in enumerate(kf.split(X, y_arr)):

        X_train, y_train = X.iloc[train_ind], y_arr[train_ind]
        X_valid, y_valid = X.iloc[valid_ind], y_arr[valid_ind]

        # NOTE(review): if `params` enables early stopping, the fold passed as
        # eval_set is also the fold that is scored below, so the reported
        # validation ROC may be optimistic - confirm this is intended.
        model = XGBClassifier(**params)
        model.fit(X_train, y_train, eval_set = [(X_valid, y_valid)], verbose = False)

        # Predict the held-out fold once and reuse it for scoring and OOF.
        valid_proba = model.predict_proba(X_valid)[:, 1]

        train_roc = roc_auc_score(y_true=y_train,y_score=model.predict_proba(X_train)[:, 1])
        val_roc = roc_auc_score(y_true=y_valid,y_score=valid_proba)
        print("Fold:",fold, " Train ROC:",np.round(train_roc,5), " Val ROC:",np.round(val_roc,5))

        test_preds += model.predict_proba(df_test[feature_cols])[:, 1]/n_folds
        val_preds[valid_ind] = valid_proba
        val_scores.append(val_roc)

    return val_scores, val_preds, test_preds

# Keyword arguments handed to every XGBClassifier built in cross_val_train.
params = {
    "booster": "gbtree",
    "verbosity": 0,
    "max_depth": 5,
    "subsample": 0.7,
    "reg_alpha": 0.54,
    "random_state": 18,
    "n_estimators": 1500,
    "gamma": 0.44,
    "min_child_weight": 4,
    "reg_lambda": 0.00001,
    "learning_rate": 0.006,
    "colsample_bytree": 0.38,
    "early_stopping_rounds": 250,
}


In [13]:
# Fit an independent binary model for every label column and collect, per
# label, the fold-averaged test prediction, the out-of-fold prediction and the
# mean held-out ROC AUC.
df_final_pred=pd.DataFrame()
df_final_val=pd.DataFrame()
mean_vals = []
#
for k in target_cols:
    print ("-" * 20,"Start->",k,"<-","-" * 20)
    y = df_train[k]

    val_scores,val_preds,test_preds = cross_val_train(X,y,df_test,params)
    df_final_pred[k] = test_preds
    df_final_val[k] = val_preds

    # Compute the fold average once; it is both stored and reported.
    mean_roc = np.mean(val_scores)
    mean_vals.append(mean_roc)
    print("Mean ROC for",k,":",mean_roc)
    print ("-" * 21,"End->",k,"<-","-" * 21)
    print("# ")


-------------------- Start-> Pastry <- --------------------
Fold: 0  Train ROC: 0.92865  Val ROC: 0.85343
Fold: 1  Train ROC: 0.92885  Val ROC: 0.86403
Fold: 2  Train ROC: 0.92999  Val ROC: 0.87977
Fold: 3  Train ROC: 0.93077  Val ROC: 0.87384
Fold: 4  Train ROC: 0.93124  Val ROC: 0.87785
Fold: 5  Train ROC: 0.91779  Val ROC: 0.86117
Fold: 6  Train ROC: 0.92326  Val ROC: 0.87998
Fold: 7  Train ROC: 0.92727  Val ROC: 0.8858
Fold: 8  Train ROC: 0.92058  Val ROC: 0.86991
Fold: 9  Train ROC: 0.93208  Val ROC: 0.87125
Mean ROC for Pastry : 0.871703880894542
--------------------- End-> Pastry <- ---------------------
# 
-------------------- Start-> Z_Scratch <- --------------------
Fold: 0  Train ROC: 0.98181  Val ROC: 0.95547
Fold: 1  Train ROC: 0.98188  Val ROC: 0.95661
Fold: 2  Train ROC: 0.98195  Val ROC: 0.96376
Fold: 3  Train ROC: 0.98102  Val ROC: 0.96584
Fold: 4  Train ROC: 0.98204  Val ROC: 0.96352
Fold: 5  Train ROC: 0.98211  Val ROC: 0.95909
Fold: 6  Train ROC: 0.98194  Val ROC: 0

In [14]:
# Score the out-of-fold predictions as they came out of the per-label models.
auc_before = roc_auc_score(df_train[target_cols], df_final_val[target_cols])
print(f"Mean ROC Score Before Normalize: {auc_before}")

# Rescale each row so the label probabilities add up to 1. The row total is
# kept in 'sum_val' (computed before the rescaling).
df_final_val['sum_val'] = df_final_val[target_cols].sum(axis=1)
df_final_val[target_cols] = df_final_val[target_cols].div(df_final_val['sum_val'], axis=0)

# Score again on the rescaled predictions.
auc_after = roc_auc_score(df_train[target_cols], df_final_val[target_cols])
print(f"Mean ROC Score After Normalize: {auc_after}")


Mean ROC Score Before Normalize: 0.8903944105196702
Mean ROC Score After Normalize: 0.8915948008332054


In [15]:
# Rescale each test row so the label probabilities add up to 1; 'sum_val'
# keeps the row total as it was before the rescaling.
df_final_pred['sum_val'] = df_final_pred[target_cols].sum(axis=1)
df_final_pred[target_cols] = df_final_pred[target_cols].div(df_final_pred['sum_val'], axis=0)

# Put the ids from the submission template next to the predictions (joined on
# the row index) and write the result to disk.
submission = pd.concat([df_sub[['id']], df_final_pred[target_cols]], axis=1)
submission.to_csv('submission.csv', index=False)
submission.head()


Unnamed: 0,id,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,19219,0.482,0.001,0.003,0.0,0.018,0.146,0.349
1,19220,0.251,0.022,0.007,0.0,0.176,0.233,0.311
2,19221,0.002,0.043,0.055,0.001,0.007,0.345,0.547
3,19222,0.154,0.002,0.001,0.001,0.017,0.385,0.44
4,19223,0.003,0.003,0.001,0.004,0.007,0.655,0.327


In [16]:
# Inspect the per-row totals of the test predictions. 'sum_val' was computed
# before the columns were rescaled, so these are the raw (un-normalised) sums.
df_final_pred['sum_val']

0       1.036
1       0.992
2       0.906
3       0.978
4       1.000
         ... 
12809   0.763
12810   0.943
12811   0.995
12812   1.028
12813   0.959
Name: sum_val, Length: 12814, dtype: float64