In [2]:
# import packages

import pandas as pd
pd.options.display.max_columns = None
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from ctgan import CTGAN
from table_evaluator import TableEvaluator
import warnings
warnings.filterwarnings('ignore')


IPython not installed.


In [3]:
# Load the steel-plate faults table and lower-case every column name so the
# rest of the notebook can refer to columns with one consistent spelling.
df = pd.read_csv('csv/faults.csv')
df.columns = df.columns.str.lower()

# Numeric feature columns handed to SMOTE below.
feat_col = ['x_minimum', 'x_maximum', 'y_minimum', 'y_maximum', 'pixels_areas', 'x_perimeter', 'y_perimeter', 'sum_of_luminosity', 'minimum_of_luminosity', 'maximum_of_luminosity', 'length_of_conveyer', 'typeofsteel_a300', 'typeofsteel_a400', 'steel_plate_thickness', 'edges_index', 'empty_index', 'square_index', 'outside_x_index', 'edges_x_index', 'edges_y_index', 'outside_global_index', 'logofareas', 'log_x_index', 'log_y_index', 'orientation_index', 'luminosity_index', 'sigmoidofareas']
# One-hot fault indicator columns (spelling follows the CSV header).
target_cols = ['pastry', 'z_scratch', 'k_scatch', 'stains', 'dirtiness', 'bumps', 'other_faults']
# Integer class code for each fault column, in the same order as target_cols.
enc_dict = {'pastry' : 0,
            'z_scratch' : 1,
            'k_scatch' : 2,
            'stains' : 3,
            'dirtiness' : 4,
            'bumps' : 5,
            'other_faults' : 6}

# Row 391 is removed before the one-hot columns are decoded.
# NOTE(review): presumably this row is not a valid one-hot row - confirm and
# record the reason here.
df = df.drop(index=391)

# Collapse the one-hot columns into a single integer label. `.map` produces an
# integer column directly instead of relying on the implicit object -> int
# downcast of `DataFrame.replace`, which pandas has deprecated.
df['anomaly'] = pd.from_dummies(df[target_cols]).iloc[:, 0].map(enc_dict)
df = df.drop(columns=target_cols)

# Fixed seed so the oversampled rows are identical on every Restart & Run All.
SMOTE_SEED = 42
sm = SMOTE(random_state=SMOTE_SEED)

# Classes 2 (k_scatch), 5 (bumps) and 6 (other_faults) are kept out of SMOTE;
# only the remaining classes are oversampled here.
is_large_class = df['anomaly'].isin([2, 5, 6])
df_smot = df[~is_large_class]
X = df_smot[feat_col]
y = df_smot['anomaly']
X_res, y_res = sm.fit_resample(X, y)

# Re-attach the resampled labels to the resampled features, then append the
# untouched rows of the three excluded classes.
df_no6 = pd.merge(pd.DataFrame(X_res), pd.DataFrame(y_res), right_index = True, left_index = True)
df_smote = pd.concat([df_no6, df[is_large_class]], axis = 0).reset_index(drop = True)

In [4]:
# Each aggregate column is the row-wise mean of the listed source columns.
aggregate_sources = {
    'aggregate1': ['y_minimum', 'y_maximum'],
    'aggregate2': ['x_minimum', 'x_maximum'],
    'aggregate3': ['x_perimeter', 'pixels_areas'],
    'aggregate4': ['y_perimeter', 'x_perimeter'],
    'aggregate5': ['sum_of_luminosity', 'pixels_areas'],
    'aggregate6': ['sum_of_luminosity', 'x_perimeter'],
}
# All aggregates are computed before any source column is removed, because
# several source columns feed more than one aggregate.
df_smote = df_smote.assign(
    **{new_col: df_smote[src_cols].mean(axis = 1) for new_col, src_cols in aggregate_sources.items()}
)

# Remove the source columns of the aggregates, plus typeofsteel_a400, then
# discard any row that still contains a missing value.
superseded_cols = ['y_minimum', 'y_maximum', 'typeofsteel_a400',
                   'x_minimum', 'x_maximum',
                   'y_perimeter', 'pixels_areas',
                   'sum_of_luminosity', 'x_perimeter']
df_smote = df_smote.drop(columns = superseded_cols).dropna()

# Move the label column to the end by name rather than by hard-coded position,
# so the reordering stays correct if the feature set ever changes.
df_smote = df_smote[[col for col in df_smote.columns if col != 'anomaly'] + ['anomaly']]

In [5]:
# Rows per fault class, labelled with the fault names. The labels come from
# enc_dict (code -> name) rather than from a positional index assignment, so
# the table is still labelled correctly if a class is missing or if
# target_cols no longer holds the fault names.
code_to_name = {code: name for name, code in enc_dict.items()}
anomaly_count = (
    df_smote['anomaly']
    .value_counts()
    .sort_index()
    .rename(index = code_to_name)
    .rename_axis(None)
    .to_frame()
)
anomaly_count

Unnamed: 0,count
pastry,190
z_scratch,190
k_scatch,390
stains,190
dirtiness,190
bumps,402
other_faults,673


In [6]:
# Columns CTGAN must model as categorical rather than continuous.
discrete_cols = ['typeofsteel_a300', 'outside_global_index', 'anomaly']
# Fixed seed so the synthetic rows are reproducible across kernel restarts.
CTGAN_SEED = 42

# Class sizes before augmentation; every class is topped up to the largest one.
anomaly_count = df_smote['anomaly'].value_counts().sort_index()
target_size = anomaly_count.max()
majority_class = anomaly_count.idxmax()

# Synthetic rows are collected here and appended once after the loop, instead
# of re-concatenating the growing frame on every iteration.
synthetic_frames = []

for idx in anomaly_count.index:
    # The largest class is the balancing target and needs no synthetic rows.
    if idx == majority_class:
        continue

    print(f"Processing anomaly category: {idx}")

    sub_df = df_smote[df_smote['anomaly'] == idx]
    if sub_df.empty:
        print(f"Skipping idx {idx}: No samples found.")
        continue

    # Decide how many rows are missing before training, so no model is fitted
    # for a class that is already at the target size.
    num_samples = int(target_size - anomaly_count.loc[idx])
    if num_samples <= 0:
        print(f"Skipping idx {idx}: No synthetic data needed.")
        continue

    # One generator per class, trained only on that class's rows.
    ctgan = CTGAN(batch_size = 200)
    ctgan.set_random_state(CTGAN_SEED)
    ctgan.fit(sub_df, discrete_columns = discrete_cols)
    synthetic_frames.append(ctgan.sample(num_samples))

df_smote = pd.concat([df_smote, *synthetic_frames], axis = 0)
df_norm = df_smote.reset_index(drop = True)

# Class sizes after augmentation, indexed by integer class code. The global
# target_cols list of fault names is deliberately left untouched.
anomaly_count = df_norm['anomaly'].value_counts().sort_index()

anomaly_count


Processing anomaly category: 0


Processing anomaly category: 1
Processing anomaly category: 2
Processing anomaly category: 3
Processing anomaly category: 4
Processing anomaly category: 5


0    673
1    673
2    673
3    673
4    673
5    673
6    673
Name: count, dtype: int64

In [7]:
# Columns that are not standardised: the two discrete features and the label.
passthrough_cols = ['typeofsteel_a300', 'outside_global_index', 'anomaly']
df_nonbin = df_norm.drop(columns = passthrough_cols)

# Standardise every remaining column with scikit-learn's StandardScaler.
sc = StandardScaler()
nonbin_norm = sc.fit_transform(df_nonbin)

# Carry the original row index over to the scaled frame. The concat below
# aligns on index labels, so without this any non-default index on df_norm
# (for example after shuffling) would pair scaled features with the wrong labels.
df_nonbin_norm = pd.DataFrame(nonbin_norm, columns = df_nonbin.columns, index = df_nonbin.index)
df_norm = pd.concat([df_nonbin_norm, df_norm[passthrough_cols]], axis = 1)

In [11]:
# Shuffle all rows. The seed is fixed so the row order (and therefore the
# exported CSV) is the same on every run.
SHUFFLE_SEED = 42
df_norm = df_norm.sample(frac = 1, random_state = SHUFFLE_SEED)
df_norm

Unnamed: 0,minimum_of_luminosity,maximum_of_luminosity,length_of_conveyer,steel_plate_thickness,edges_index,empty_index,square_index,outside_x_index,edges_x_index,edges_y_index,logofareas,log_x_index,log_y_index,orientation_index,luminosity_index,sigmoidofareas,aggregate1,aggregate2,aggregate3,aggregate4,aggregate5,aggregate6,typeofsteel_a300,outside_global_index,anomaly
2216,0.724914,0.554846,-0.632347,-0.722612,-0.064125,0.136638,0.364388,-0.060018,-0.150056,-0.225268,0.205958,0.604670,-0.052594,-0.778524,0.640488,0.864554,-0.693124,-0.655433,-0.213257,-0.289144,-0.274824,-0.248258,0,0.0,6
3400,-1.819872,-0.248612,-0.725693,-0.722612,-0.854094,0.655219,-0.768921,2.650135,-0.364733,-0.655623,2.141880,1.985593,-0.070672,-1.410042,1.044918,1.422371,-0.897965,-0.933246,0.088139,0.567252,1.172500,1.984412,0,0.0,2
271,0.096572,-0.765121,-0.632347,-0.060092,-0.495842,0.775067,0.963568,-0.322035,0.084473,0.204579,-0.667092,-0.182519,-0.555304,-0.480287,-0.585843,-0.798372,-0.028362,-0.865759,-0.297088,-0.447410,-0.361762,-0.336454,1,0.0,1
1235,-2.228294,-2.544207,1.737194,-0.060092,-1.116643,-0.581421,0.895576,-0.412169,0.722786,0.634426,-0.575481,-0.528648,-0.399352,0.367185,-3.285756,-0.871201,-0.146875,2.192437,-0.296467,-0.477555,-0.368064,-0.342945,1,1.0,5
4049,0.724914,0.382676,-0.582084,-0.369268,-0.135375,-0.595116,-1.290961,-0.386811,-0.581799,-1.258395,-0.230999,-0.254212,1.523506,1.078531,0.319747,-0.950204,2.303810,0.504352,-0.264767,-0.278872,-0.371519,-0.351257,0,1.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1598,0.222241,-0.650341,-0.682610,2.479565,-0.505826,-0.668669,-0.627162,-0.399592,0.147849,0.634426,-0.230316,-0.639401,0.228198,1.125116,-0.231617,-0.529922,-0.678011,1.250457,-0.276596,-0.424800,-0.338108,-0.312528,0,1.0,6
1209,-0.280433,0.095727,1.751555,-0.280932,-0.365752,-1.505827,1.458989,-0.399592,-0.010401,0.634426,-0.465099,-0.429383,-0.473783,0.086751,-0.299628,-0.864709,-0.532580,-0.727203,-0.289947,-0.462482,-0.352756,-0.327345,1,1.0,5
4064,-0.060513,0.267897,-0.373852,-0.148428,1.955170,0.919282,-1.203247,-0.429349,-2.134638,0.551331,0.707412,-0.693402,0.607506,0.125486,0.036861,0.751496,0.031489,0.260057,-0.084553,-0.295427,-0.381157,-0.360554,0,1.0,4
2986,-0.249016,-0.879901,-0.682610,-0.060092,-0.114569,-0.679828,0.376234,-0.181768,-1.624987,1.008462,-0.724923,-0.267791,1.580745,1.260770,-1.580009,1.873435,0.111128,0.007611,-0.345033,-0.135575,-0.266345,-0.267144,1,0.0,1


In [10]:
# Write the balanced, standardised table to disk. No `index` argument is
# passed, so pandas' default applies and the row index is written as the
# first, unnamed column of the CSV.
output_csv = 'csv/balanced_normalized_steel_plates.csv'
df_norm.to_csv(output_csv)