In [1]:
# import packages

import pandas as pd
pd.options.display.max_columns = None
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from ctgan import CTGAN
from table_evaluator import TableEvaluator
import warnings
warnings.filterwarnings('ignore')


IPython not installed.


In [30]:
# --- Load the raw faults table and derive a single integer class label -------
# Fixed seed so the SMOTE-generated rows are identical on every Restart & Run All.
SEED = 42

df = pd.read_csv('csv/faults.csv')
df.columns = df.columns.str.lower()

# Predictor columns (lower-cased names as they appear after the rename above).
feat_col = [
    'x_minimum', 'x_maximum', 'y_minimum', 'y_maximum', 'pixels_areas',
    'x_perimeter', 'y_perimeter', 'sum_of_luminosity', 'minimum_of_luminosity',
    'maximum_of_luminosity', 'length_of_conveyer', 'typeofsteel_a300',
    'typeofsteel_a400', 'steel_plate_thickness', 'edges_index', 'empty_index',
    'square_index', 'outside_x_index', 'edges_x_index', 'edges_y_index',
    'outside_global_index', 'logofareas', 'log_x_index', 'log_y_index',
    'orientation_index', 'luminosity_index', 'sigmoidofareas',
]

# One-hot fault indicator columns; the position in this list is the class code.
target_cols = ['pastry', 'z_scratch', 'k_scatch', 'stains', 'dirtiness', 'bumps', 'other_faults']
enc_dict = {name: code for code, name in enumerate(target_cols)}

# NOTE(review): row 391 is removed before the one-hot flags are decoded.
# Presumably it is not a valid one-hot row (pd.from_dummies rejects rows with
# zero or several active flags) -- confirm and record the reason here.
df = df.drop(index = 391)

# Collapse the one-hot flags into one column of fault names, then map each name
# to its integer code. Series.map yields a clean integer column and does not
# depend on DataFrame.replace's implicit object -> int downcasting.
df['anomaly'] = pd.from_dummies(df[target_cols]).iloc[:, 0].map(enc_dict)
df = df.drop(columns = target_cols)

# --- Oversample the small classes with SMOTE ---------------------------------
# Classes 2, 5 and 6 are kept out of SMOTE and re-attached unchanged afterwards;
# only the remaining classes are balanced against each other.
LARGE_CLASSES = [2, 5, 6]
is_large = df['anomaly'].isin(LARGE_CLASSES)

sm = SMOTE(random_state = SEED)
df_smot = df[~is_large]
X = df_smot[feat_col]
y = df_smot['anomaly']
X_res, y_res = sm.fit_resample(X, y)

# fit_resample returns a DataFrame and a Series sharing a fresh RangeIndex,
# so a column-wise concat re-joins features and label row by row.
df_no6 = pd.concat([X_res, y_res], axis = 1)
df_smote = pd.concat([df_no6, df[is_large]], axis = 0).reset_index(drop = True)

#df_nonbin = df_res.drop(['typeofsteel_a300', 'typeofsteel_a400', 'outside_global_index', 'anomaly'], axis = 1)#.drop(target_cols, axis = 1)

In [31]:
# Each engineered feature is the row-wise mean of a pair of raw columns.
# All six are computed up front, while every source column is still present.
aggregate_sources = {
    'aggregate1': ['y_minimum', 'y_maximum'],
    'aggregate2': ['x_minimum', 'x_maximum'],
    'aggregate3': ['x_perimeter', 'pixels_areas'],
    'aggregate4': ['y_perimeter', 'x_perimeter'],
    'aggregate5': ['sum_of_luminosity', 'pixels_areas'],
    'aggregate6': ['sum_of_luminosity', 'x_perimeter'],
}
df_smote = df_smote.assign(
    **{name: df_smote[pair].mean(axis = 1) for name, pair in aggregate_sources.items()}
)

# Raw columns that are now represented by an aggregate, plus 'typeofsteel_a400',
# are removed; rows with any missing value are discarded afterwards.
superseded_cols = [
    'y_minimum', 'y_maximum', 'typeofsteel_a400',
    'x_minimum', 'x_maximum',
    'y_perimeter',
    'pixels_areas',
    'sum_of_luminosity', 'x_perimeter',
]
df_smote = df_smote.drop(columns = superseded_cols).dropna()

# Move the label to the last position by name. This replaces a hard-coded list
# of column positions that only worked for one exact column layout.
feature_order = [col for col in df_smote.columns if col != 'anomaly']
df_smote = df_smote[feature_order + ['anomaly']]

In [32]:
# Rows per class after SMOTE, ordered by class code and relabelled with the
# names in target_cols (which must list one name per class present).
anomaly_count = (
    df_smote['anomaly']
    .value_counts()
    .sort_index()
    .to_frame()
    .set_axis(target_cols, axis = 0)
)
anomaly_count

Unnamed: 0,count
pastry,190
z_scratch,190
k_scatch,390
stains,190
dirtiness,190
bumps,402
other_faults,673


In [33]:
# Columns CTGAN is told to treat as discrete rather than continuous.
discrete_cols = ['typeofsteel_a300', 'outside_global_index', 'anomaly']

# Class 6 is never augmented (in the counts shown above it is the largest class).
SKIPPED_CLASS = 6

# Class sizes before augmentation; every other class is topped up to the largest.
anomaly_count = df_smote['anomaly'].value_counts().sort_index()
target_size = anomaly_count.max()

# NOTE(review): CTGAN training is not seeded here, so the synthetic rows differ
# between runs.
synthetic_frames = []
for idx in anomaly_count.index:
    if idx == SKIPPED_CLASS:
        continue

    print(f"Processing anomaly category: {idx}")

    # Decide how many rows are missing *before* training, so no model is fitted
    # for a class that is already at the target size.
    num_samples = int(target_size - anomaly_count.loc[idx])
    if num_samples <= 0:
        print(f"Skipping idx {idx}: No synthetic data needed.")
        continue

    # One model per class, trained only on that class's rows. The labels come
    # from value_counts of this same frame, so sub_df always has rows.
    sub_df = df_smote[df_smote['anomaly'] == idx]
    ctgan = CTGAN(batch_size = 200)
    ctgan.fit(sub_df, discrete_columns = discrete_cols)
    synthetic_frames.append(ctgan.sample(num_samples))

# Append all synthetic rows in one concat instead of re-copying the frame on
# every loop iteration.
df_smote = pd.concat([df_smote, *synthetic_frames], axis = 0)
df_norm = df_smote.reset_index(drop = True)

# Report the balanced class sizes under their fault names. The names are looked
# up through enc_dict; target_cols is deliberately left untouched (it used to be
# overwritten with the integer codes here, which made the relabelling a no-op).
code_to_name = {code: name for name, code in enc_dict.items()}
anomaly_count = (
    df_norm['anomaly']
    .value_counts()
    .sort_index()
    .rename(index = code_to_name)
    .rename_axis(None)
)

anomaly_count


Processing anomaly category: 0
Processing anomaly category: 1
Processing anomaly category: 2
Processing anomaly category: 3
Processing anomaly category: 4
Processing anomaly category: 5


0    673
1    673
2    673
3    673
4    673
5    673
6    673
Name: count, dtype: int64

In [34]:
# Columns excluded from scaling and re-attached unchanged afterwards.
unscaled_cols = ['typeofsteel_a300', 'outside_global_index', 'anomaly']
df_nonbin = df_norm.drop(columns = unscaled_cols)

# Standardise every remaining column with scikit-learn's StandardScaler.
sc = StandardScaler().fit(df_nonbin)
nonbin_norm = sc.transform(df_nonbin)
df_nonbin_norm = pd.DataFrame(data = nonbin_norm, columns = df_nonbin.columns)

# Scaled columns first, then the untouched columns; both sides share the
# RangeIndex of df_norm, so the column-wise concat lines up row by row.
df_norm = pd.concat([df_nonbin_norm, df_norm.loc[:, unscaled_cols]], axis = 1)

In [35]:
# Preview the balanced, standardised frame (the notebook shows only the first
# and last rows of the table).
df_norm

Unnamed: 0,minimum_of_luminosity,maximum_of_luminosity,length_of_conveyer,steel_plate_thickness,edges_index,empty_index,square_index,outside_x_index,edges_x_index,edges_y_index,logofareas,log_x_index,log_y_index,orientation_index,luminosity_index,sigmoidofareas,aggregate1,aggregate2,aggregate3,aggregate4,aggregate5,aggregate6,typeofsteel_a300,outside_global_index,anomaly
0,-0.604549,-1.305323,1.718299,0.071532,-0.950291,-1.279222,-1.209661,-0.402827,-0.685286,0.666365,0.195590,-0.540589,0.623512,1.289749,-0.978644,0.169682,-0.782966,-1.146403,-0.266055,-0.317974,-0.273496,-0.302262,1,1.0,0
1,-0.354472,-0.468180,1.718299,0.071532,1.096834,-0.331661,-1.127233,-0.424273,-0.223836,0.523097,-0.289461,-0.796513,0.289057,1.242770,-0.247564,-0.651387,0.392237,-0.028945,-0.317601,-0.388516,-0.311275,-0.338078,1,1.0,0
2,0.114422,-0.356560,1.259239,0.499372,1.687577,-0.584023,-0.712137,-0.422323,0.311075,0.440062,-0.514156,-0.796513,-0.093413,1.006188,0.086066,-0.892674,-0.117914,0.312604,-0.329712,-0.432185,-0.321350,-0.347645,1,1.0,0
3,0.114422,-0.300751,-0.677416,4.563853,0.993748,0.094673,-1.295701,-0.393079,-0.443150,0.666365,-0.027748,-0.659433,0.641425,1.338787,-0.128771,-0.006798,-0.731923,0.358082,-0.295554,-0.328052,-0.288963,-0.316857,0,1.0,0
4,-1.823673,-0.300751,-0.677416,2.317693,-0.893880,0.144871,-1.589289,-0.248807,-1.353212,0.616888,1.374347,0.130262,2.038893,1.506116,-0.396686,1.378431,-0.665151,1.176685,0.412434,0.552044,0.380859,0.319568,0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4706,1.145988,0.034107,1.639398,-0.976676,-0.140006,2.468656,-0.190928,-0.389728,1.020637,0.333147,-0.622228,-0.003421,-1.025062,-2.727161,-1.159373,-0.633224,1.498731,1.121522,-0.352082,-0.326455,-0.296431,0.093974,1,1.0,5
4707,0.208200,-0.189132,-0.620034,0.157100,0.620076,0.206197,1.761101,-0.117156,0.884070,0.627347,-0.531331,-0.625545,-0.805747,0.563827,-1.013169,1.635937,0.455363,-0.773901,-0.357020,-0.367141,-0.284391,0.162504,0,0.5,5
4708,0.301979,-0.244941,1.438559,-0.356308,-0.047662,0.012887,1.471809,-0.325409,1.104278,-1.974777,-0.501257,-0.510537,-1.353797,0.161058,0.492110,-0.393581,-0.880924,-0.681679,-0.395009,-0.361564,-0.247926,-0.291971,1,1.0,5
4709,-0.542030,-0.468180,-0.871082,0.199884,0.988242,-0.372167,1.024294,-0.464291,0.436190,0.540976,-0.671774,0.013115,0.350285,1.345114,-0.018137,0.615305,1.601341,-0.048598,-0.320398,-0.296400,-0.297248,-0.279594,0,0.0,5


In [36]:
# Persist the balanced, standardised dataset. With pandas' default index=True
# the row index is written as the first, unnamed column of the CSV.
df_norm.to_csv('csv/balanced_normalized_steel_plates.csv')