In [33]:
import numpy as np
import pandas as pd
import csv
import os
import ipynb
import math
import sklearn
import pickle
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.linear_model import HuberRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score
from scipy.stats import rankdata

os.environ['KMP_DUPLICATE_LIB_OK']='True'


In [34]:
#import the data from the training csv
#then, firsst encode character attributes into integer attributes for an easier modeling
#3 additional columns are added, indicating the appereace of missing value and the product of attribute_2 and attribute_3

data = pd.read_csv(r'C:\Users\user\Desktop\tabular-playground-series-aug-2022\train.csv', delimiter=',', usecols = ['id','product_code','loading','attribute_0','attribute_1','attribute_2','attribute_3','measurement_0','measurement_1','measurement_2','measurement_3','measurement_4','measurement_5','measurement_0','measurement_6','measurement_7','measurement_0','measurement_8','measurement_9','measurement_10','measurement_11','measurement_12','measurement_13','measurement_14','measurement_15','measurement_16','measurement_17','failure'])

cleanup_nums1 = {"attribute_0": {"material_5": 0, "material_7": 1}}
cleanup_nums2 = {"product_code": {"A": 0, "B": 1,"C": 2,"D": 3,"E": 4}}
data = data.replace(cleanup_nums1)
data = data.replace(cleanup_nums2)

data['m3_missing'] = data['measurement_3'].isnull().astype(np.int8)
data['m5_missing'] = data['measurement_5'].isnull().astype(np.int8)
data['attribute_2*3'] = data['attribute_2'] * data['attribute_3']

data.head()

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure,m3_missing,m5_missing,attribute_2*3
0,0,0,80.1,1,material_8,9,5,7,8,4,...,15.193,15.029,,13.034,14.684,764.1,0,0,0,45
1,1,0,84.89,1,material_8,9,5,14,3,3,...,11.755,14.732,15.425,14.395,15.631,682.057,0,0,0,45
2,2,0,82.43,1,material_8,9,5,12,1,5,...,13.798,16.711,18.631,14.094,17.946,663.376,0,0,0,45
3,3,0,101.07,1,material_8,9,5,13,2,6,...,10.02,15.25,15.562,16.154,17.172,826.282,0,0,0,45
4,4,0,188.06,1,material_8,9,5,9,2,8,...,12.428,16.182,12.76,13.153,16.412,579.885,0,0,0,45


In [35]:
#we then group the data by product_code
#here we drop some redundent attributes and measurements to perform the first feature selection

producta = data[data['product_code']==0].drop(['attribute_1', 'attribute_2', 'attribute_3'],axis=1)
productb = data[data['product_code']==1].drop(['attribute_1', 'attribute_2', 'attribute_3'],axis=1)
productc = data[data['product_code']==2].drop(['attribute_1', 'attribute_2', 'attribute_3'],axis=1)
productd = data[data['product_code']==3].drop(['attribute_1', 'attribute_2', 'attribute_3'],axis=1)
producte = data[data['product_code']==4].drop(['attribute_1', 'attribute_2', 'attribute_3'],axis=1)

impute_a = producta.drop(['id', 'measurement_9', 'measurement_10', 'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14', 'measurement_15', 'measurement_16'],axis=1)
impute_b = productb.drop(['id', 'measurement_9', 'measurement_10', 'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14', 'measurement_15', 'measurement_16'],axis=1)
impute_c = productc.drop(['id', 'measurement_9', 'measurement_10', 'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14', 'measurement_15', 'measurement_16'],axis=1)
impute_d = productd.drop(['id', 'measurement_9', 'measurement_10', 'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14', 'measurement_15', 'measurement_16'],axis=1)
impute_e = producte.drop(['id', 'measurement_9', 'measurement_10', 'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14', 'measurement_15', 'measurement_16'],axis=1)

impute_a.head()

Unnamed: 0,product_code,loading,attribute_0,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_17,failure,m3_missing,m5_missing,attribute_2*3
0,0,80.1,1,7,8,4,18.04,12.518,15.748,19.292,11.739,20.155,764.1,0,0,0,45
1,0,84.89,1,14,3,3,18.213,11.54,17.717,17.893,12.748,17.889,682.057,0,0,0,45
2,0,82.43,1,12,1,5,18.057,11.652,16.738,18.24,12.718,18.288,663.376,0,0,0,45
3,0,101.07,1,13,2,6,17.295,11.188,18.576,18.339,12.583,19.06,826.282,0,0,0,45
4,0,188.06,1,9,2,8,19.346,12.95,16.99,15.746,11.306,18.093,579.885,0,0,0,45


In [36]:
#the next step is to perform imputation for the missing value
#for measurement17, it has significant correlation with other measurements,
#so I use HuberRegressor to compute the coefficient and fill the missing value in measurement17

train_a = impute_a.dropna(how='any')
hubera_1 = HuberRegressor().fit(train_a.iloc[:,11].values.reshape(-1, 1), train_a.iloc[:,12].values.reshape(-1, 1))
hubera_2 = HuberRegressor().fit(train_a.iloc[:,8].values.reshape(-1, 1), train_a.iloc[:,12].values.reshape(-1, 1))

train_b = impute_b.dropna(how='any')
huberb_1 = HuberRegressor().fit(train_b.iloc[:,10].values.reshape(-1, 1), train_b.iloc[:,12].values.reshape(-1, 1))
huberb_2 = HuberRegressor().fit(train_b.iloc[:,7].values.reshape(-1, 1), train_b.iloc[:,12].values.reshape(-1, 1))

train_c = impute_c.dropna(how='any')
huberc_1 = HuberRegressor().fit(train_c.iloc[:,11].values.reshape(-1, 1), train_c.iloc[:,12].values.reshape(-1, 1))
huberc_2 = HuberRegressor().fit(train_c.iloc[:,8].values.reshape(-1, 1), train_c.iloc[:,12].values.reshape(-1, 1))
huberc_3 = HuberRegressor().fit(train_c.iloc[:,10].values.reshape(-1, 1), train_c.iloc[:,12].values.reshape(-1, 1))

train_d = impute_d.dropna(how='any')
huberd_1 = HuberRegressor().fit(train_d.iloc[:,9].values.reshape(-1, 1), train_d.iloc[:,12].values.reshape(-1, 1))
huberd_2 = HuberRegressor().fit(train_d.iloc[:,8].values.reshape(-1, 1), train_d.iloc[:,12].values.reshape(-1, 1))

train_e = impute_e.dropna(how='any')
hubere_1 = HuberRegressor().fit(train_e.iloc[:,9].values.reshape(-1, 1), train_e.iloc[:,12].values.reshape(-1, 1))
hubere_2 = HuberRegressor().fit(train_e.iloc[:,11].values.reshape(-1, 1), train_e.iloc[:,12].values.reshape(-1, 1))



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [37]:
for i in range(len(impute_a)):
    if(math.isnan(impute_a.iloc[i,12])==True and math.isnan(impute_a.iloc[i,11])==True):
        impute_a.iloc[i,12] = hubera_2.predict(impute_a.iloc[i,8].reshape(-1, 1))
        
    elif(math.isnan(impute_a.iloc[i,12])==True and math.isnan(impute_a.iloc[i,11])==False):
        impute_a.iloc[i,12] = hubera_1.predict(impute_a.iloc[i,11].reshape(-1, 1))


for i in range(len(impute_b)):
    if(math.isnan(impute_b.iloc[i,12])==True and math.isnan(impute_b.iloc[i,10])==True):
        impute_b.iloc[i,12] = huberb_2.predict(impute_b.iloc[i,7].reshape(-1, 1))
        
    elif(math.isnan(impute_b.iloc[i,12])==True and math.isnan(impute_b.iloc[i,10])==False):
        impute_b.iloc[i,12] = huberb_1.predict(impute_b.iloc[i,10].reshape(-1, 1))

        
for i in range(len(impute_c)):
    if(math.isnan(impute_c.iloc[i,12])==True and math.isnan(impute_c.iloc[i,11])==True and math.isnan(impute_c.iloc[i,8])==True):
        impute_c.iloc[i,12] = huberc_3.predict(impute_c.iloc[i,10].reshape(-1, 1))
        
    elif(math.isnan(impute_c.iloc[i,12])==True and math.isnan(impute_c.iloc[i,11])==True):
        impute_c.iloc[i,12] = huberc_2.predict(impute_c.iloc[i,8].reshape(-1, 1))
        
    elif(math.isnan(impute_c.iloc[i,12])==True and math.isnan(impute_c.iloc[i,11])==False):
        impute_c.iloc[i,12] = huberc_1.predict(impute_c.iloc[i,11].reshape(-1, 1))


for i in range(len(impute_d)):
    if(math.isnan(impute_d.iloc[i,12])==True and math.isnan(impute_d.iloc[i,9])==True):
        impute_d.iloc[i,12] = huberd_2.predict(impute_d.iloc[i,8].reshape(-1, 1))
        
    elif(math.isnan(impute_d.iloc[i,12])==True and math.isnan(impute_d.iloc[i,9])==False):
        impute_d.iloc[i,12] = huberd_1.predict(impute_d.iloc[i,9].reshape(-1, 1))


for i in range(len(impute_e)):
    if(math.isnan(impute_e.iloc[i,12])==True and math.isnan(impute_e.iloc[i,9])==True):
        impute_e.iloc[i,12] = hubere_2.predict(impute_e.iloc[i,11].reshape(-1, 1))
        
    elif(math.isnan(impute_e.iloc[i,12])==True and math.isnan(impute_e.iloc[i,9])==False):
        impute_e.iloc[i,12] = hubere_1.predict(impute_e.iloc[i,9].reshape(-1, 1))
 



In [38]:
#for missing value in loading, I use KNNimputer to impute the missing value

imputer = KNNImputer(n_neighbors=10, weights="uniform")
impute_a['loading'] = imputer.fit_transform(np.array(impute_a['loading']).reshape(-1,1))

imputer = KNNImputer(n_neighbors=10, weights="uniform")
impute_b['loading'] = imputer.fit_transform(np.array(impute_b['loading']).reshape(-1,1))

imputer = KNNImputer(n_neighbors=10, weights="uniform")
impute_c['loading'] = imputer.fit_transform(np.array(impute_c['loading']).reshape(-1,1))

imputer = KNNImputer(n_neighbors=10, weights="uniform")
impute_d['loading'] = imputer.fit_transform(np.array(impute_d['loading']).reshape(-1,1))

imputer = KNNImputer(n_neighbors=10, weights="uniform")
impute_e['loading'] = imputer.fit_transform(np.array(impute_e['loading']).reshape(-1,1))

In [39]:
#after imputing, drop unnecessary measurements again to create the final feature selected

impute_a = impute_a.drop(['measurement_3', 'measurement_4', 'measurement_5', 'measurement_6', 'measurement_7', 'measurement_8'],axis=1)
impute_b = impute_b.drop(['measurement_3', 'measurement_4', 'measurement_5', 'measurement_6', 'measurement_7', 'measurement_8'],axis=1)
impute_c = impute_c.drop(['measurement_3', 'measurement_4', 'measurement_5', 'measurement_6', 'measurement_7', 'measurement_8'],axis=1)
impute_d = impute_d.drop(['measurement_3', 'measurement_4', 'measurement_5', 'measurement_6', 'measurement_7', 'measurement_8'],axis=1)
impute_e = impute_e.drop(['measurement_3', 'measurement_4', 'measurement_5', 'measurement_6', 'measurement_7', 'measurement_8'],axis=1)

impute_a.isna().sum()

product_code      0
loading           0
attribute_0       0
measurement_0     0
measurement_1     0
measurement_2     0
measurement_17    0
failure           0
m3_missing        0
m5_missing        0
attribute_2*3     0
dtype: int64

In [40]:
#a fuction to perform scaling before feeding it to the model, here i just use StandardScaler

def scaling(data):
    scaler = StandardScaler()
    select_feature = ['measurement_0', 'measurement_1', 'measurement_2', 'loading', 'measurement_17', 'attribute_2*3']
    scaled = scaler.fit_transform(data[select_feature])
    new = data.copy()
    new[select_feature] = scaled
    assert len(data) == len(new)
    return new

In [41]:
#combine the data after preprocessing to produce the final training data

frames = [impute_a, impute_b, impute_c, impute_d, impute_e]
train = pd.concat(frames)

X = train.drop(['product_code','failure'], axis=1)
Y = train['failure'].astype(int)

In [42]:
#the model basicaly builds on a cross-validation logistic regression model 
#after each fold fit into the model, save the model on pickle for inferencing
#the validation score are based on the average score of every fold

accuracy = 0
auc_score = 0
i = 0

fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
for  id, (tid, vid) in enumerate(fold.split(X, Y)):
    
    x_train, x_val = X.iloc[tid], X.iloc[vid]
    y_train, y_val = Y.iloc[tid], Y.iloc[vid]

    x_train = scaling(x_train)
    x_val = scaling(x_val)
    

    model = LogisticRegression(penalty='l2', solver='newton-cg', max_iter=1000, C=0.012)
    model.fit(x_train, y_train)  
    filename = 'finalized_model' + str(i) + '.sav'
    pickle.dump(model, open(filename, 'wb'))
    i = i+1
 
    auc_score += roc_auc_score(y_val, model.predict_proba(x_val)[:,1]) /10
    accuracy += accuracy_score(y_val, model.predict(x_val)) /10

auc_score, accuracy

(0.5918208636867467, 0.7872036130974782)