In [None]:
# import packages

import pandas as pd
pd.options.display.max_columns = None
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from ctgan import CTGAN
from table_evaluator import TableEvaluator
import warnings
warnings.filterwarnings('ignore')


IPython not installed.


In [None]:
# Load the raw faults table and lower-case the header so columns can be
# addressed with the snake_case names used throughout the notebook.
df = pd.read_csv('csv/faults.csv')
df.columns = df.columns.str.lower()

# Input features, in the order they are fed to the resampler.
feat_col = [
    'x_minimum', 'x_maximum', 'y_minimum', 'y_maximum', 'pixels_areas',
    'x_perimeter', 'y_perimeter', 'sum_of_luminosity', 'minimum_of_luminosity',
    'maximum_of_luminosity', 'length_of_conveyer', 'typeofsteel_a300',
    'typeofsteel_a400', 'steel_plate_thickness', 'edges_index', 'empty_index',
    'square_index', 'outside_x_index', 'edges_x_index', 'edges_y_index',
    'outside_global_index', 'logofareas', 'log_x_index', 'log_y_index',
    'orientation_index', 'luminosity_index', 'sigmoidofareas',
]

# One-hot fault indicator columns; their order defines the integer class codes.
target_cols = ['pastry', 'z_scratch', 'k_scatch', 'stains', 'dirtiness', 'bumps', 'other_faults']

# Fault name -> integer code (pastry=0 ... other_faults=6), derived from
# target_cols so the two can never drift apart.
enc_dict = {name: code for code, name in enumerate(target_cols)}

# NOTE(review): the row labelled 391 is discarded before decoding. Presumably it
# does not have exactly one fault flag set (pd.from_dummies raises on such
# rows) - confirm and record the reason.
df = df.drop(index = 391)

# Collapse the one-hot indicators into a single integer 'anomaly' column.
# from_dummies returns a one-column frame of fault names; mapping it explicitly
# yields int64 without relying on replace() silently downcasting object dtype.
df['anomaly'] = pd.from_dummies(df[target_cols]).iloc[:, 0].map(enc_dict).astype('int64')
df = df.drop(columns = target_cols)

# Fixed seed so the oversampled rows are identical on every Restart & Run All.
SMOTE_SEED = 42
sm = SMOTE(random_state = SMOTE_SEED)

# Only classes 0-5 are resampled here; the 'other_faults' rows are set aside
# and appended back unchanged.
is_other = df['anomaly'] == enc_dict['other_faults']
X = df.loc[~is_other, feat_col]
y = df.loc[~is_other, 'anomaly']
X_res, y_res = sm.fit_resample(X, y)

# Features and labels come back row-aligned, so a column-wise concat on the
# shared index reassembles them.
df_no6 = pd.concat([pd.DataFrame(X_res), pd.DataFrame(y_res)], axis = 1)
df_smote = pd.concat([df_no6, df[is_other]], axis = 0).reset_index(drop = True)

In [3]:
# Each aggregate column is the row-wise mean of a pair of raw columns.
# Dict order fixes the order in which the new columns are appended.
aggregate_sources = {
    'aggregate1': ['y_minimum', 'y_maximum'],
    'aggregate2': ['x_minimum', 'x_maximum'],
    'aggregate3': ['x_perimeter', 'pixels_areas'],
    'aggregate4': ['y_perimeter', 'x_perimeter'],
    'aggregate5': ['sum_of_luminosity', 'pixels_areas'],
    'aggregate6': ['sum_of_luminosity', 'x_perimeter'],
}

# Raw columns removed once the aggregates exist.
# NOTE(review): 'typeofsteel_a400' feeds no aggregate and is simply dropped -
# presumably because it mirrors 'typeofsteel_a300'; confirm.
dropped_cols = [
    'y_minimum', 'y_maximum', 'typeofsteel_a400',
    'x_minimum', 'x_maximum',
    'y_perimeter',
    'pixels_areas',
    'sum_of_luminosity', 'x_perimeter',
]

# All aggregates read only raw columns, so they can be computed together before
# any source column is removed; rows with missing values are then discarded.
df_smote = (
    df_smote
    .assign(**{name: df_smote[cols].mean(axis = 1) for name, cols in aggregate_sources.items()})
    .drop(columns = dropped_cols)
    .dropna()
)

# Move the label to the last position by name instead of by a hard-coded list
# of positional indices, so the reorder stays correct if the column set changes.
df_smote = df_smote[[col for col in df_smote.columns if col != 'anomaly'] + ['anomaly']]

In [4]:
# Rows per class, ordered by integer class code, shown as a one-column table.
# The index is relabelled positionally with the fault names, which relies on
# target_cols being listed in class-code order and every class being present.
class_sizes = df_smote['anomaly'].value_counts().sort_index()
anomaly_count = class_sizes.to_frame().set_axis(target_cols, axis = 'index')
anomaly_count

Unnamed: 0,count
pastry,402
z_scratch,402
k_scatch,402
stains,402
dirtiness,402
bumps,402
other_faults,673


In [5]:
# Columns CTGAN is told to model as categorical rather than continuous.
discrete_cols = ['typeofsteel_a300', 'outside_global_index', 'anomaly']

# Class code that is never synthesised; the loop below skips it.
OTHER_FAULTS_CODE = 6

# Seed applied to every synthesizer so fit/sample are repeatable across runs
# (best effort - NOTE(review): confirm determinism on the installed ctgan/torch
# build, especially when training on GPU).
CTGAN_SEED = 42

anomaly_count = df_smote['anomaly'].value_counts().sort_index()

# Every class is topped up to the size of the largest one.
target_size = int(anomaly_count.max())

synthetic_parts = []
for idx in anomaly_count.index:
    if idx == OTHER_FAULTS_CODE:
        continue

    print(f"Processing anomaly category: {idx}")

    # Decide before training: fitting a GAN that will never be sampled is wasted work.
    num_samples = target_size - int(anomaly_count.loc[idx])
    if num_samples <= 0:
        print(f"Skipping idx {idx}: No synthetic data needed.")
        continue

    # idx comes from value_counts() of this same frame, so the slice is never empty.
    sub_df = df_smote[df_smote['anomaly'] == idx]

    # One model per class, trained only on that class's rows.
    ctgan = CTGAN(batch_size = 200)
    ctgan.set_random_state(CTGAN_SEED)
    ctgan.fit(sub_df, discrete_columns = discrete_cols)

    synthetic_data = ctgan.sample(num_samples)
    # Optional visual sanity check of real vs. synthetic rows:
    #table_evaluator =  TableEvaluator(sub_df[:num_samples], synthetic_data, cat_cols= discrete_cols)
    #table_evaluator.visual_evaluation()
    synthetic_parts.append(synthetic_data)

# Append all synthetic rows in one concat instead of growing the frame inside the loop.
df_smote = pd.concat([df_smote, *synthetic_parts], axis = 0)
df_norm = df_smote.reset_index(drop = True)

# Per-class counts after balancing, indexed by integer class code.
# target_cols is deliberately left untouched so it keeps holding the fault names.
anomaly_count = df_norm['anomaly'].value_counts().sort_index()

anomaly_count


Processing anomaly category: 0
Processing anomaly category: 1
Processing anomaly category: 2
Processing anomaly category: 3
Processing anomaly category: 4
Processing anomaly category: 5


0    673
1    673
2    673
3    673
4    673
5    673
6    673
Name: count, dtype: int64

In [6]:
# Columns that bypass scaling and are re-attached unchanged afterwards.
passthrough_cols = ['typeofsteel_a300', 'outside_global_index', 'anomaly']

# Everything else is standardised.
df_nonbin = df_norm.drop(columns = passthrough_cols)

sc = StandardScaler()
nonbin_norm = sc.fit(df_nonbin).transform(df_nonbin)

# Re-wrap the scaled array with the original column names, then put the
# passthrough columns back on the right. The scaled frame gets a fresh
# 0..n-1 index, so this relies on df_norm having been reset to the same index.
df_nonbin_norm = pd.DataFrame(nonbin_norm, columns = df_nonbin.columns)
df_norm = pd.concat([df_nonbin_norm, df_norm[passthrough_cols]], axis = 'columns')

In [7]:
# Display the standardised, class-balanced frame (middle rows are elided in the rendered output).
df_norm

Unnamed: 0,minimum_of_luminosity,maximum_of_luminosity,length_of_conveyer,steel_plate_thickness,edges_index,empty_index,square_index,outside_x_index,edges_x_index,edges_y_index,logofareas,log_x_index,log_y_index,orientation_index,luminosity_index,sigmoidofareas,aggregate1,aggregate2,aggregate3,aggregate4,aggregate5,aggregate6,typeofsteel_a300,outside_global_index,anomaly
0,-0.364206,-1.155487,1.663123,0.133522,-1.058897,-1.309620,-1.250907,-0.403180,-0.449663,0.657351,0.146638,-0.660617,0.593217,1.243133,-1.216155,0.186337,-0.803023,-1.213017,-0.304993,-0.327025,-0.315474,-0.310120,1,1.0,0
1,-0.105470,-0.282544,1.663123,0.133522,1.146349,-0.315668,-1.163390,-0.425064,0.008256,0.500384,-0.338894,-0.928963,0.238204,1.196409,-0.400841,-0.630566,0.392335,-0.007033,-0.354543,-0.406994,-0.349188,-0.343554,1,1.0,0
2,0.379660,-0.166151,1.210148,0.579275,1.782721,-0.580385,-0.722670,-0.423075,0.539074,0.409409,-0.563812,-0.928963,-0.167777,0.961112,-0.028770,-0.870628,-0.126566,0.361574,-0.366184,-0.456498,-0.358179,-0.352485,1,1.0,0
3,0.379660,-0.107955,-0.700840,4.813925,1.035301,0.131538,-1.342258,-0.393233,-0.209379,0.657351,-0.076922,-0.785230,0.612232,1.291905,-0.268361,0.010751,-0.751105,0.410654,-0.333350,-0.338449,-0.329277,-0.323744,0,1.0,0
4,-1.625544,-0.107955,-0.700840,2.473723,-0.998129,0.184193,-1.653970,-0.246014,-1.112477,0.603143,1.326564,0.042796,2.095601,1.458325,-0.567145,1.388951,-0.683188,1.294108,0.347211,0.659258,0.268460,0.270365,0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4706,0.023898,0.241222,1.542801,-0.579682,-0.645395,-1.492802,1.706176,-0.330965,0.573109,0.655202,-0.879735,-0.143426,-1.190382,-2.398510,-2.244148,-1.252915,-0.155252,-0.014469,-0.342220,-0.299952,-0.389177,-0.366467,1,1.0,5
4707,0.250292,-0.166151,1.436635,0.913589,0.286860,0.439023,1.068760,-0.286887,1.209406,0.821362,-0.591904,-1.166370,0.178645,-1.181119,0.012280,-0.830042,1.825624,-1.044993,-0.399643,-0.339120,-0.366135,-0.314884,0,0.0,5
4708,-0.234838,0.124830,-0.559285,-0.445956,0.088793,-0.940341,1.231848,-0.229241,0.166880,0.667884,0.020945,0.794818,-0.927673,-0.690301,-0.945063,-0.797716,0.231845,0.581771,-0.404964,-0.350370,-0.374061,-0.331798,0,1.0,5
4709,1.058842,-0.398936,-0.446041,0.400974,0.500270,-0.446249,1.596392,-0.491107,0.140594,0.949260,0.408638,-0.915772,-1.192535,1.139767,-1.979535,-1.109959,0.328092,-1.343699,-0.297929,-0.422464,-0.134137,-0.293847,1,1.0,5


In [None]:
# Persist the balanced, standardised dataset. No `index=` argument is passed, so
# the row index is written as an unnamed first column of the CSV.
df_norm.to_csv('csv/balanced_normalized_steel_plates.csv')