In [1]:
import os

import lightgbm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

## Load Data

In [2]:
# Read the SECOM CSV (expected next to the notebook) into the raw frame and
# show its (rows, columns) shape.
secom_csv_path = "uci-secom.csv"
df = pd.read_csv(secom_csv_path)
df.shape

(1567, 592)

In [3]:
# Print the frame's index range, column count, dtype breakdown and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1567 entries, 0 to 1566
Columns: 592 entries, Time to Pass/Fail
dtypes: float64(590), int64(1), object(1)
memory usage: 7.1+ MB


In [4]:
# Summary statistics for the numeric columns. A `count` below the number of
# rows means that column has missing values.
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,581,582,583,584,585,586,587,588,589,Pass/Fail
count,1561.0,1560.0,1553.0,1553.0,1553.0,1553.0,1553.0,1558.0,1565.0,1565.0,...,618.0,1566.0,1566.0,1566.0,1566.0,1566.0,1566.0,1566.0,1566.0,1567.0
mean,3014.452896,2495.850231,2200.547318,1396.376627,4.197013,100.0,101.112908,0.121822,1.462862,-0.000841,...,97.934373,0.500096,0.015318,0.003847,3.067826,0.021458,0.016475,0.005283,99.670066,-0.867262
std,73.621787,80.407705,29.513152,441.69164,56.35554,0.0,6.237214,0.008961,0.073897,0.015116,...,87.520966,0.003404,0.01718,0.00372,3.578033,0.012358,0.008808,0.002867,93.891919,0.49801
min,2743.24,2158.75,2060.66,0.0,0.6815,100.0,82.1311,0.0,1.191,-0.0534,...,0.0,0.4778,0.006,0.0017,1.1975,-0.0169,0.0032,0.001,0.0,-1.0
25%,2966.26,2452.2475,2181.0444,1081.8758,1.0177,100.0,97.92,0.1211,1.4112,-0.0108,...,46.1849,0.4979,0.0116,0.0031,2.3065,0.013425,0.0106,0.0033,44.3686,-1.0
50%,3011.49,2499.405,2201.0667,1285.2144,1.3168,100.0,101.5122,0.1224,1.4616,-0.0013,...,72.2889,0.5002,0.0138,0.0036,2.75765,0.0205,0.0148,0.0046,71.9005,-1.0
75%,3056.65,2538.8225,2218.0555,1591.2235,1.5257,100.0,104.5867,0.1238,1.5169,0.0084,...,116.53915,0.502375,0.0165,0.0041,3.295175,0.0276,0.0203,0.0064,114.7497,-1.0
max,3356.35,2846.44,2315.2667,3715.0417,1114.5366,100.0,129.2522,0.1286,1.6564,0.0749,...,737.3048,0.5098,0.4766,0.1045,99.3032,0.1028,0.0799,0.0286,737.3048,1.0


In [5]:
# Row count per label value in the target column, to check class balance.
df["Pass/Fail"].value_counts()

Pass/Fail
-1    1463
 1     104
Name: count, dtype: int64

In [6]:
# Target column name, and how many rows carry label 1. That count is reused
# below as the per-class sample size when undersampling.
label_name = "Pass/Fail"
label_one_count = df[label_name].eq(1).sum()
label_one_count

104

In [7]:
# Balance the classes by random undersampling: keep every label == 1 row and
# draw an equally sized random subset of the label == -1 rows.
# The draw is seeded so that Restart & Run All reproduces the same subset (and
# therefore the same splits and metrics downstream).
df_label_not_one = df[df[label_name] == -1].sample(
    n=label_one_count, random_state=42
)

df_under = pd.concat([df_label_not_one, df[df[label_name] == 1]])
df_under.shape

(208, 592)

In [8]:
# Confirm that the undersampled frame has the same number of rows per label.
df_under[label_name].value_counts()

Pass/Fail
-1    104
 1    104
Name: count, dtype: int64

In [9]:
# Target vector: the label column. Feature matrix: every remaining column
# except "Time".
y = df_under[label_name]
X = df_under.drop(columns=["Time", label_name])
X.shape, y.shape

((208, 590), (208,))

In [10]:
# Hold out 10% of the balanced data as the test set. Stratifying on y keeps the
# label proportions the same in both splits; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42
)

In [11]:
# Non-numeric feature columns are cast to pandas "category" dtype so LightGBM
# can treat them as categorical features.
cat_col = X.select_dtypes(exclude="number").columns
print(cat_col)

# astype() with a per-column mapping returns new frames, so nothing is assigned
# into the frames handed back by train_test_split (which can trigger pandas'
# SettingWithCopyWarning). An empty mapping leaves all dtypes unchanged.
category_dtypes = {col: "category" for col in cat_col}
X_train = X_train.astype(category_dtypes)
X_test = X_test.astype(category_dtypes)

Index([], dtype='object')


## LightGBM

In [12]:
# LightGBM classifier with shallow trees (max_depth=3) and a small learning
# rate. n_estimators is only an upper bound: training stops once the eval-set
# metric has not improved for `early_stopping_rounds` consecutive rounds.
# NOTE(review): LightGBM >= 4.0 warns on boosting_type="goss" and prefers
# data_sample_strategy="goss" with the default boosting type — confirm against
# the installed version before changing.
model_lgbm = lightgbm.LGBMClassifier(
    boosting_type="goss",
    n_estimators=1000,
    max_depth=3,
    learning_rate=0.01,
    early_stopping_rounds=10,
    n_jobs=-1,
    random_state=42,
)
model_lgbm

In [13]:
# fit & predict
model_lgbm.fit(X_train, y_train, eval_set=[(X_test, y_test)],
        callbacks=[lightgbm.log_evaluation(1)])

[1]	valid_0's binary_logloss: 0.692385
[2]	valid_0's binary_logloss: 0.691416
[3]	valid_0's binary_logloss: 0.69051
[4]	valid_0's binary_logloss: 0.688599
[5]	valid_0's binary_logloss: 0.687789
[6]	valid_0's binary_logloss: 0.685364
[7]	valid_0's binary_logloss: 0.684651
[8]	valid_0's binary_logloss: 0.682327
[9]	valid_0's binary_logloss: 0.681708
[10]	valid_0's binary_logloss: 0.67948
[11]	valid_0's binary_logloss: 0.67895
[12]	valid_0's binary_logloss: 0.675987
[13]	valid_0's binary_logloss: 0.675042
[14]	valid_0's binary_logloss: 0.673189
[15]	valid_0's binary_logloss: 0.6723
[16]	valid_0's binary_logloss: 0.671087
[17]	valid_0's binary_logloss: 0.671555
[18]	valid_0's binary_logloss: 0.669214
[19]	valid_0's binary_logloss: 0.669695
[20]	valid_0's binary_logloss: 0.668045
[21]	valid_0's binary_logloss: 0.666345
[22]	valid_0's binary_logloss: 0.666842
[23]	valid_0's binary_logloss: 0.665143
[24]	valid_0's binary_logloss: 0.665647
[25]	valid_0's binary_logloss: 0.663309
[26]	valid_0's

In [14]:
# Best metric value(s) recorded on each evaluation set during fit, keyed by
# eval-set name (e.g. "valid_0") and then by metric name.
model_lgbm.best_score_

defaultdict(collections.OrderedDict,
            {'valid_0': OrderedDict([('binary_logloss', 0.6095906052753985)])})