In [2]:
# import packages

import pandas as pd
pd.options.display.max_columns = None
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from ctgan import CTGAN
from table_evaluator import TableEvaluator
import warnings
warnings.filterwarnings('ignore')


IPython not installed.


In [3]:
# Load the steel-plate faults table and normalise column names to lower case.
df = pd.read_csv('csv/faults.csv')
df.columns = df.columns.str.lower()

feat_col = [
    'x_minimum', 'x_maximum', 'y_minimum', 'y_maximum', 'pixels_areas',
    'x_perimeter', 'y_perimeter', 'sum_of_luminosity', 'minimum_of_luminosity',
    'maximum_of_luminosity', 'length_of_conveyer', 'typeofsteel_a300',
    'typeofsteel_a400', 'steel_plate_thickness', 'edges_index', 'empty_index',
    'square_index', 'outside_x_index', 'edges_x_index', 'edges_y_index',
    'outside_global_index', 'logofareas', 'log_x_index', 'log_y_index',
    'orientation_index', 'luminosity_index', 'sigmoidofareas',
]
target_cols = ['pastry', 'z_scratch', 'k_scatch', 'stains', 'dirtiness', 'bumps', 'other_faults']

# Integer class code for each fault name: its position in target_cols (0..6).
enc_dict = {name: code for code, name in enumerate(target_cols)}

# NOTE(review): row 391 is removed before the labels are decoded; presumably it
# does not carry exactly one fault flag (from_dummies rejects such rows) -
# confirm against the raw data.
df = df.drop(index=391)

# Collapse the seven one-hot fault columns into a single integer 'anomaly' label.
# .map() yields an integer column directly instead of relying on .replace()
# down-casting an object column, which pandas has deprecated.
df['anomaly'] = pd.from_dummies(df[target_cols]).iloc[:, 0].map(enc_dict)
df = df.drop(columns=target_cols)

# SMOTE is applied only to the classes other than k_scatch (2), bumps (5) and
# other_faults (6); with default settings it raises every class it is given to
# the size of the largest class it is given. The seed makes the synthetic rows
# reproducible across runs.
sm = SMOTE(random_state=42)
kept_as_is = df['anomaly'].isin([2, 5, 6])
df_smot = df[~kept_as_is]
X = df_smot[feat_col]
y = df_smot['anomaly']
X_res, y_res = sm.fit_resample(X, y)

# Re-attach the resampled labels to the resampled features, then append the
# untouched rows of classes 2, 5 and 6.
df_no6 = pd.concat([pd.DataFrame(X_res), pd.DataFrame(y_res)], axis=1)
df_smote = pd.concat([df_no6, df[kept_as_is]], axis=0).reset_index(drop=True)

In [4]:
# Each aggregate column is the row-wise mean of a pair of existing columns.
aggregate_sources = {
    'aggregate1': ['y_minimum', 'y_maximum'],
    'aggregate2': ['x_minimum', 'x_maximum'],
    'aggregate3': ['x_perimeter', 'pixels_areas'],
    'aggregate4': ['y_perimeter', 'x_perimeter'],
    'aggregate5': ['sum_of_luminosity', 'pixels_areas'],
    'aggregate6': ['sum_of_luminosity', 'x_perimeter'],
}

# Columns removed once the aggregates exist: every aggregate input plus
# 'typeofsteel_a400'.
consumed_cols = [
    'y_minimum', 'y_maximum', 'typeofsteel_a400',
    'x_minimum', 'x_maximum',
    'y_perimeter', 'pixels_areas',
    'sum_of_luminosity', 'x_perimeter',
]

# All aggregates are computed from the untouched inputs first, then the inputs
# are dropped in one step, then rows with any remaining missing value are removed.
df_smote = (
    df_smote
    .assign(**{name: df_smote[cols].mean(axis=1) for name, cols in aggregate_sources.items()})
    .drop(columns=consumed_cols)
    .dropna()
)

# Move the label column to the far right by name rather than by hard-coded
# position, so the reorder stays correct if the feature set changes.
df_smote = df_smote[[col for col in df_smote.columns if col != 'anomaly'] + ['anomaly']]

In [5]:
# Rows per class, ordered by integer class code and relabelled with the fault
# names; relies on target_cols listing the names in code order.
anomaly_count = (
    df_smote['anomaly']
    .value_counts()
    .to_frame()
    .sort_index()
    .set_axis(target_cols, axis='index')
)
anomaly_count

Unnamed: 0,count
pastry,190
z_scratch,190
k_scatch,390
stains,190
dirtiness,190
bumps,402
other_faults,673


In [6]:
# Columns CTGAN must model as categorical rather than continuous.
discrete_cols = ['typeofsteel_a300', 'outside_global_index', 'anomaly']

anomaly_count = df_smote['anomaly'].value_counts().sort_index()
largest_class_size = int(anomaly_count.max())

# One CTGAN is trained per class; the synthetic rows are collected here and
# appended in a single concat after the loop instead of growing df_smote on
# every iteration.
synthetic_frames = []

for idx in anomaly_count.index:
    # Class 6 (other_faults) is never augmented.
    if idx == 6:
        continue

    print(f"Processing anomaly category: {idx}")

    sub_df = df_smote[df_smote['anomaly'] == idx]
    if sub_df.empty:
        print(f"Skipping idx {idx}: No samples found.")
        continue

    # Decide how many rows are missing *before* training, so no model is
    # fitted for a class that is already at the target size.
    num_samples = largest_class_size - int(anomaly_count.loc[idx])
    if num_samples <= 0:
        print(f"Skipping idx {idx}: No synthetic data needed.")
        continue

    # NOTE(review): CTGAN training is not seeded here, so the synthetic rows
    # differ from run to run.
    ctgan = CTGAN(batch_size = 200)
    ctgan.fit(sub_df, discrete_columns = discrete_cols)
    synthetic_data = ctgan.sample(num_samples)
    # Optional visual quality check of the generated rows:
    #   TableEvaluator(sub_df[:num_samples], synthetic_data, cat_cols = discrete_cols).visual_evaluation()
    synthetic_frames.append(synthetic_data)

df_smote = pd.concat([df_smote, *synthetic_frames], axis = 0)
df_norm = df_smote.reset_index(drop = True)

# NOTE(review): this rebinds target_cols from the fault names to the integer
# class codes; kept as-is so later state is unchanged, but re-running an
# earlier cell that expects names will now label with integers.
target_cols = anomaly_count.index.tolist()
anomaly_count = df_norm['anomaly'].value_counts().sort_index()
anomaly_count.index = target_cols

anomaly_count


Processing anomaly category: 0
Processing anomaly category: 1
Processing anomaly category: 2
Processing anomaly category: 3
Processing anomaly category: 4
Processing anomaly category: 5


0    673
1    673
2    673
3    673
4    673
5    673
6    673
Name: count, dtype: int64

In [7]:
# Columns that are passed through unscaled and re-attached on the right.
passthrough_cols = ['typeofsteel_a300', 'outside_global_index', 'anomaly']
df_nonbin = df_norm.drop(columns = passthrough_cols)

# Standardise every remaining column with a scaler fitted on the full table.
sc = StandardScaler()
nonbin_norm = sc.fit_transform(df_nonbin)

# Carry the original row index onto the scaled frame: the column-wise concat
# below aligns on index, so a fresh 0..n-1 index would silently misalign rows
# (and introduce NaNs) whenever df_norm's index is not already 0..n-1.
df_nonbin_norm = pd.DataFrame(nonbin_norm, columns = df_nonbin.columns, index = df_nonbin.index)
df_norm = pd.concat([df_nonbin_norm, df_norm[passthrough_cols]], axis = 1)

In [8]:
# Shuffle all rows and renumber them 0..n-1; the fixed seed makes the row
# order (and therefore the exported file) reproducible across runs.
df_norm = df_norm.sample(frac = 1, random_state = 42).reset_index(drop = True)
df_norm

Unnamed: 0,minimum_of_luminosity,maximum_of_luminosity,length_of_conveyer,steel_plate_thickness,edges_index,empty_index,square_index,outside_x_index,edges_x_index,edges_y_index,logofareas,log_x_index,log_y_index,orientation_index,luminosity_index,sigmoidofareas,aggregate1,aggregate2,aggregate3,aggregate4,aggregate5,aggregate6,typeofsteel_a300,outside_global_index,anomaly
0,0.048602,-0.384662,-0.521105,0.513414,1.528428,0.056186,-1.635986,-0.284629,-1.601243,0.575835,0.658473,-0.430656,1.598953,1.495229,-0.448322,1.307212,-0.010928,-0.027044,-0.088432,0.363408,-0.093848,-0.113965,0,1.0,4
1,0.266960,-0.267806,2.150279,0.036918,0.074987,-0.502390,0.213130,-0.454729,0.675457,0.602169,-1.338977,-1.148072,-1.108565,-1.575873,0.592524,0.752272,5.909537,-1.322786,-0.442008,-0.171665,1.044835,-0.408621,1,1.0,0
2,-0.980801,-1.553222,1.958370,0.080236,-1.119448,0.190489,-1.509253,-0.280461,-0.876564,0.562890,0.697081,-0.198271,1.485224,1.430428,-1.886929,1.311386,-0.423782,-1.194935,-0.087549,0.085688,-0.145502,-0.165172,1,1.0,6
3,-2.197368,0.433330,-0.513429,-0.786121,-0.937463,0.209045,-0.819440,1.780948,0.234553,-2.033613,1.676612,1.958538,1.598703,-1.166413,0.262807,1.350700,0.095295,-0.957771,-0.462657,4.121585,-1.257033,5.833420,0,1.0,2
4,-0.076174,0.024334,-0.551811,-0.786121,1.419745,-0.770548,1.560506,-0.161689,1.114701,0.601997,0.010163,0.096039,-0.029635,-0.139206,-0.186018,0.181568,-0.653537,0.226393,-0.296268,-0.350729,-0.278410,-0.294250,0,0.5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4706,0.859647,0.491758,-0.567163,-0.569532,0.105694,-1.905001,2.095264,-0.247523,1.391291,0.591045,-1.302649,-0.341910,-1.380082,-0.732271,0.548896,-1.120479,-0.471190,0.351455,-0.375997,-0.467444,-0.352249,-0.369074,0,0.5,3
4707,0.173378,-0.443090,1.981399,-0.352943,0.164440,0.529215,1.173081,-0.342973,0.816974,0.601997,-1.020215,-0.624199,-0.750320,-0.337305,-0.119113,-0.993645,-0.365944,-0.496092,-0.365154,-0.426471,-0.348820,-0.363010,1,0.0,5
4708,-0.481696,-0.501518,-0.444341,-0.786121,1.576366,0.041476,-0.880516,-0.272126,-0.817087,0.601997,0.062511,-0.346758,0.590891,1.108941,-0.586070,0.655688,-0.562913,0.178165,-0.284198,-0.224493,-0.275883,-0.291796,0,1.0,6
4709,0.235766,-0.326234,1.643638,-0.158012,-0.917327,-0.096378,0.755667,-0.326303,0.152761,0.601997,-0.616297,-0.522545,-0.345846,0.272326,-0.143944,-0.815710,0.716372,-1.090913,-0.351023,-0.390404,-0.335233,-0.349733,1,1.0,5


In [9]:
# Persist the balanced, standardised table. The row index is written as the
# first (unnamed) column, so readers need index_col=0 to get it back as index.
df_norm.to_csv(path_or_buf = 'csv/balanced_normalized_steel_plates.csv')