<a href="https://colab.research.google.com/github/forminju/DACON_anomaly_detection_/blob/main/0227_1_2___smartfactory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import random
import time
import pickle
import psutil
import gc


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from umap import UMAP

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import KFold, StratifiedKFold

# setting some globl config

plt.style.use('ggplot')
orange_black = [
    '#fdc029', '#df861d', '#FF6347', '#aa3d01', '#a30e15', '#800000', '#171820'
]
plt.rcParams['figure.figsize'] = (16,9)
plt.rcParams["figure.facecolor"] = '#FFFACD'
plt.rcParams["axes.facecolor"] = '#FFFFE0'
plt.rcParams["axes.grid"] = True
plt.rcParams["grid.color"] = orange_black[3]
plt.rcParams["grid.alpha"] = 0.5
plt.rcParams["grid.linestyle"] = '--'


import warnings
warnings.filterwarnings("ignore")

INFERENCE = True

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [None]:
def seed_everything(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    
    if torch.cuda.is_available(): 
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        
seed_everything(42)

In [None]:
%cd /content/drive/MyDrive/스마트공장

# Data Load

In [None]:
train_df = pd.read_csv('/content/drive/MyDrive/스마트공장/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/스마트공장/test.csv')

In [None]:
train_x = train_df.drop(columns=['TIMESTAMP', 'Y_Class','Y_Quality'])
train_y = train_df['Y_Class']

test_x = test_df.drop(columns=['TIMESTAMP'])

# 파생변수 생성

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
train_x

In [None]:
train_x['LINE_PRODUCT'] = train_x['LINE'].str.cat(train_x['PRODUCT_CODE'])
test_x['LINE_PRODUCT'] = test_x['LINE'].str.cat(test_x['PRODUCT_CODE'])

In [None]:
# qualitative to quantitative
qual_col = ['LINE', 'PRODUCT_CODE', 'LINE_PRODUCT']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    
    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_x[i] = le.transform(test_x[i]) 
print('Done.')
     

In [None]:
train_x.drop(['LINE', 'PRODUCT_CODE'],axis=1, inplace=True)
test_x.drop(['LINE', 'PRODUCT_CODE'],axis=1, inplace=True)

# Imputation 기법 적용

In [None]:
train_x = train_x.dropna(how='all')
test_x = test_x[train_x.columns]

In [None]:
test_x = test_x[train_x.columns]

In [None]:
train_x = train_x.dropna(how='all',axis=1)
test_x = test_x[train_x.columns]

In [None]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=50)

imputer_output = imputer.fit_transform(train_x.iloc[:,1:])

In [None]:
train_x.iloc[:,1:] = pd.DataFrame(imputer_output, columns=train_x.iloc[:,1:].columns, index=list(train_x.iloc[:,1:].index.values));train_x

In [None]:
test_x = test_x[train_x.columns]

In [None]:
imputer_output_test = imputer.transform(test_x.iloc[:,1:])

In [None]:
test_x.iloc[:,1:] = pd.DataFrame(imputer_output_test, columns=test_x.iloc[:,1:].columns, index=list(test_x.iloc[:,1:].index.values));test_x

In [None]:
train_x

# vif 기반 변수 줄이기

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(v):

    # Calculating VIF
    vif = pd.DataFrame()
    vif["variables"] = v.columns
    vif["VIF"] = [variance_inflation_factor(v.values,i) for i in range(v.shape[1])]

    high_vif = vif[vif["VIF"] > 5].sort_values("VIF",ascending=False)

    
    return(high_vif)

In [None]:
corr_matrix = train_x.corr()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find features with correlation greater than 0.95
to_drop1 = [column for column in upper.columns if any(upper[column] < -0.9)]
to_drop2 = [column for column in upper.columns if any(upper[column] > 0.9)]
print(to_drop1)
print(to_drop2)
drop=to_drop1+to_drop2

In [None]:
len(drop)

In [None]:
train_x = train_x.drop(drop,1)
test_x = test_x.drop(drop,1)

In [None]:
train_x.shape

In [None]:
features_train = train_x.copy()
features_test = test_x.copy()

In [None]:
train_x.iloc[:,1:].columns

In [None]:
train_x.shape

In [None]:
test_x.shape

# RobustScaler

In [101]:
from sklearn.preprocessing import RobustScaler

In [102]:
scaler = RobustScaler()

In [103]:
#train data 변환
X_group_train = scaler.fit_transform(features_train.iloc[:,1:])

#test data 변환
X_group_test = scaler.transform(features_test.iloc[:,1:])

In [104]:
train_x.iloc[:,1:] = pd.DataFrame(X_group_train, columns=train_x.iloc[:,1:].columns, index=list(train_x.iloc[:,1:].index.values));train_x

Unnamed: 0,PRODUCT_ID,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,...,X_2798,X_2799,X_2800,X_2801,X_2837,X_2839,X_2840,X_2842,X_2871,LINE_PRODUCT
0,TRAIN_000,-0.137931,-0.398010,0.0,0.0,0.000000,0.0,0.652,0.02,0.5,...,-54.0,-1.46,-91.874918,-2.32,15.238095,0.0606,0.0484,-0.000406,0.0,-0.6
1,TRAIN_001,-0.137931,-0.398010,0.0,0.0,0.000000,0.0,0.652,0.02,0.5,...,-4.0,-0.46,28.612477,1.68,-3.809524,0.0406,0.0984,0.000413,0.0,-0.4
2,TRAIN_002,1.000000,0.467662,0.0,0.0,0.190476,0.0,0.000,0.00,0.0,...,-4.0,-0.46,-33.049338,-0.32,15.238095,0.0606,-0.4316,-0.000405,0.0,-0.6
3,TRAIN_003,1.000000,0.467662,0.0,0.0,0.190476,0.0,0.000,0.00,0.0,...,-4.0,-0.46,27.877158,0.68,-13.150183,0.0506,0.0784,0.000473,0.0,-0.4
4,TRAIN_004,1.000000,0.467662,0.0,0.0,0.190476,0.0,0.000,0.00,0.0,...,-4.0,-0.46,-30.948424,-0.32,15.238095,0.0706,-0.4316,-0.000403,0.0,-0.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,TRAIN_593,0.000000,-0.248756,0.0,0.0,-0.523810,0.0,1.000,0.00,1.0,...,0.0,0.00,0.000000,0.00,0.000000,0.0000,0.0000,0.000000,0.0,0.4
594,TRAIN_594,-0.137931,-0.398010,0.0,0.0,0.000000,0.0,0.652,0.02,0.5,...,-4.0,0.54,-4.611971,-0.32,5.531136,0.0306,0.0484,-0.000613,0.0,-0.6
595,TRAIN_595,-0.137931,-0.398010,0.0,0.0,0.000000,0.0,0.652,0.02,0.5,...,-4.0,0.54,-8.663733,-0.32,14.322344,0.0406,0.0484,-0.000542,0.0,-0.6
596,TRAIN_596,65.517241,-0.497512,0.0,0.0,0.666667,0.0,0.000,0.00,0.0,...,0.0,0.00,0.000000,0.00,0.000000,0.0000,0.0000,0.000000,0.0,-0.2


In [105]:
test_x.iloc[:,1:] = pd.DataFrame(X_group_test, columns=test_x.iloc[:,1:].columns, index=list(test_x.iloc[:,1:].index.values))

# 분류기

In [106]:
import warnings
warnings.filterwarnings("ignore")
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from collections import Counter

In [107]:
classifiers = [['Naive Bayes :', GaussianNB()],
               ['KNeighbours :', KNeighborsClassifier()],
               ['SVM :', SVC()],
               ['LogisticRegression :', LogisticRegression()],
               ['DecisionTree :',DecisionTreeClassifier()],
               ['RandomForest :',RandomForestClassifier()],
               ['LGBMClassifier:', LGBMClassifier()],
               ['XGBClassifier: ', XGBClassifier()]]

In [108]:
from imblearn.over_sampling import SMOTE

osmote=SMOTE()
Xs_train,ys_train=osmote.fit_resample(train_x.iloc[:,1:],train_y)

print(Counter(ys_train))

Counter({1: 407, 2: 407, 0: 407})


In [109]:
from sklearn.model_selection import train_test_split

In [110]:
Xs_train, Xs_valid, ys_train, ys_valid = train_test_split(Xs_train, ys_train, test_size=0.2, random_state=42)

In [111]:
for name,classifier in classifiers:
    clf=classifier.fit(Xs_train,ys_train)
    y_pred=classifier.predict(Xs_valid)
    print(f'\n {name} \n')
    print(f'Training Score for {name}  {clf.score(Xs_train,ys_train) * 100:.2f}' )
    print(f'Testing Score for {name} {clf.score(Xs_valid,ys_valid) * 100:.2f}' )
    print(f'Classification report  \n {classification_report(ys_valid,y_pred)}' )
    print(f'Confusion matrix  \n {confusion_matrix(ys_valid,y_pred)}' )
    #print(f'ROC AUC  : {roc_auc_score(valid_y,y_pred)}' )


 Naive Bayes : 

Training Score for Naive Bayes :  44.47
Testing Score for Naive Bayes : 45.31
Classification report  
               precision    recall  f1-score   support

           0       0.65      0.40      0.49        83
           1       0.41      0.85      0.55        82
           2       0.35      0.10      0.16        80

    accuracy                           0.45       245
   macro avg       0.47      0.45      0.40       245
weighted avg       0.47      0.45      0.40       245

Confusion matrix  
 [[33 41  9]
 [ 6 70  6]
 [12 60  8]]

 KNeighbours : 

Training Score for KNeighbours :  74.49
Testing Score for KNeighbours : 66.94
Classification report  
               precision    recall  f1-score   support

           0       0.63      0.92      0.75        83
           1       0.76      0.27      0.40        82
           2       0.69      0.82      0.75        80

    accuracy                           0.67       245
   macro avg       0.69      0.67      0.63     

In [112]:
from imblearn.combine import SMOTETomek
rus=SMOTETomek()
Xrus_train,yrus_train =rus.fit_resample (train_x.iloc[:,1:],train_y)

print(Counter(yrus_train))

Counter({1: 385, 2: 384, 0: 382})


In [113]:
Xrus_train, Xrus_valid, yrus_train, yrus_valid = train_test_split(Xrus_train, yrus_train, test_size=0.2, random_state=42)

In [114]:
for name,classifier in classifiers:
    clf=classifier.fit(Xrus_train,yrus_train)
    y_pred=classifier.predict(Xrus_valid)
    print(f'\n {name} \n')
    print(f'Training Score for {name}  {clf.score(Xrus_train,yrus_train) * 100:.2f}' )
    print(f'Testing Score for {name} {clf.score(Xrus_valid,yrus_valid) * 100:.2f}' )
    print(f'Classification report  \n {classification_report(yrus_valid,y_pred)}' )
    print(f'Confusion matrix  \n {confusion_matrix(yrus_valid,y_pred)}' )
    #print(f'ROC AUC  : {roc_auc_score(valid_y,y_pred)}' )


 Naive Bayes : 

Training Score for Naive Bayes :  45.33
Testing Score for Naive Bayes : 46.75
Classification report  
               precision    recall  f1-score   support

           0       0.54      0.43      0.48        69
           1       0.45      0.96      0.61        81
           2       0.00      0.00      0.00        81

    accuracy                           0.47       231
   macro avg       0.33      0.47      0.36       231
weighted avg       0.32      0.47      0.36       231

Confusion matrix  
 [[30 39  0]
 [ 3 78  0]
 [23 58  0]]

 KNeighbours : 

Training Score for KNeighbours :  74.24
Testing Score for KNeighbours : 66.23
Classification report  
               precision    recall  f1-score   support

           0       0.63      0.88      0.73        69
           1       0.96      0.31      0.47        81
           2       0.62      0.83      0.71        81

    accuracy                           0.66       231
   macro avg       0.74      0.67      0.64     

In [45]:
from imblearn.over_sampling import BorderlineSMOTE

cc=BorderlineSMOTE()
Xcc_train,ycc_train =cc.fit_resample (train_x.iloc[:,1:],train_y)

print(Counter(ycc_train))

Counter({1: 407, 2: 407, 0: 407})


In [46]:
Xcc_train, Xcc_valid, ycc_train, ycc_valid = train_test_split(Xcc_train, ycc_train, test_size=0.2, random_state=42)

In [47]:
for name,classifier in classifiers:
    clf=classifier.fit(Xcc_train,ycc_train)
    y_pred=classifier.predict(Xcc_valid)
    print(f'\n {name} \n')
    print(f'Training Score for {name}  {clf.score(Xcc_train,ycc_train) * 100:.2f}' )
    print(f'Testing Score for {name} {clf.score(Xcc_valid,ycc_valid) * 100:.2f}' )
    print(f'Classification report  \n {classification_report(ycc_valid,y_pred)}' )
    print(f'Confusion matrix  \n {confusion_matrix(ycc_valid,y_pred)}' )


 Naive Bayes : 

Training Score for Naive Bayes :  64.34
Testing Score for Naive Bayes : 57.55
Classification report  
               precision    recall  f1-score   support

           0       0.84      0.46      0.59        83
           1       0.48      0.73      0.58        82
           2       0.58      0.54      0.56        80

    accuracy                           0.58       245
   macro avg       0.63      0.58      0.58       245
weighted avg       0.64      0.58      0.58       245

Confusion matrix  
 [[38 30 15]
 [ 6 60 16]
 [ 1 36 43]]

 KNeighbours : 

Training Score for KNeighbours :  75.31
Testing Score for KNeighbours : 69.39
Classification report  
               precision    recall  f1-score   support

           0       0.68      0.99      0.80        83
           1       0.86      0.15      0.25        82
           2       0.69      0.95      0.80        80

    accuracy                           0.69       245
   macro avg       0.74      0.69      0.62     

# sMOTETOMEK + votingclassifier

In [115]:
from imblearn.combine import SMOTETomek
rus=SMOTETomek()
Xrus_train,yrus_train =rus.fit_resample (train_x.iloc[:,1:],train_y)

print(Counter(yrus_train))

Counter({1: 387, 2: 383, 0: 381})


# 베이지안 최적화

In [49]:
!pip install bayesian-optimization

from bayes_opt import BayesianOptimization

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [116]:
from sklearn.metrics import f1_score
rf_params={
    'max_depth':(1,150),
    'n_estimators':(10,200),
    'max_samples':(0.5,1),
    'max_features':(0.5,1)
}

def rf_bo(max_depth, n_estimators, max_samples, max_features):
  params={
      'max_depth':int(round(max_depth)),
      'n_estimators':int(round(n_estimators)),
      'max_samples':max_samples,
      'max_features':max_features
  }

  model=RandomForestClassifier( **params,  n_jobs=-1, random_state=42)
  
  X_train,X_valid,y_train,y_valid=train_test_split(Xcc_train.iloc[:,1:],ycc_train,test_size=0.8)
  model.fit(X_train, y_train)

  score=f1_score(y_valid, model.predict(X_valid), average = 'micro')
  return score

In [119]:
BO_rf = BayesianOptimization(f=rf_bo,pbounds=rf_params,random_state=3,verbose=2)

BO_rf.maximize(init_points=7, n_iter=100)

rf_max_params=BO_rf.max['params']
rf_max_params['max_depth']=int(rf_max_params['max_depth'])
rf_max_params['n_estimators']=int(rf_max_params['n_estimators'])
rf_max_params

|   iter    |  target   | max_depth | max_fe... | max_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m1        [0m | [0m0.7183   [0m | [0m83.07    [0m | [0m0.8541   [0m | [0m0.6455   [0m | [0m107.1    [0m |
| [95m2        [0m | [95m0.7516   [0m | [95m134.0    [0m | [95m0.9481   [0m | [95m0.5628   [0m | [95m49.38    [0m |
| [0m3        [0m | [0m0.7004   [0m | [0m8.669    [0m | [0m0.7204   [0m | [0m0.5149   [0m | [0m96.8     [0m |
| [95m4        [0m | [95m0.7939   [0m | [95m97.72    [0m | [95m0.6392   [0m | [95m0.8381   [0m | [95m122.3    [0m |
| [0m5        [0m | [0m0.7093   [0m | [0m4.573    [0m | [0m0.7794   [0m | [0m0.6296   [0m | [0m88.87    [0m |
| [0m6        [0m | [0m0.7465   [0m | [0m43.25    [0m | [0m0.8466   [0m | [0m0.7202   [0m | [0m39.8     [0m |
| [0m7        [0m | [0m0.749    [0m | [0m82.15    [0m | [0m0.8902   [0m | [0m0.6532   [0m | [0m52

{'max_depth': 149,
 'max_features': 0.8058109389693928,
 'max_samples': 0.9619012083054632,
 'n_estimators': 52}

In [120]:
xgb_params={
    'gamma':(0,20),
    'max_depth':(1,2000),
    'subsample':(0.5,1),
    'eta' : (0.001, 0.4)
}

In [121]:
def xgb_bo(eta, gamma,max_depth, subsample ):
  params={
      'learning_rate' :max(eta, 0),
      'gamma':int(round(gamma)),
      'max_depth':int(round(max_depth)),
      'subsample':int(round(subsample)),
  }
  model=XGBClassifier(**params, n_jobs=-1, random_state=42)
  X_train,X_valid,y_train,y_valid=train_test_split(Xcc_train.iloc[:,1:],ycc_train,test_size=0.2)

  model.fit(X_train,y_train)

  score=f1_score(y_valid,model.predict(X_valid), average='micro')
  return score

In [None]:
BO_xgb = BayesianOptimization(f=xgb_bo,pbounds=xgb_params,random_state=3,verbose=2)


# Bayesian Optimization을 실행해보세요
BO_xgb.maximize(init_points=7,n_iter=100)
xgb_max_params=BO_xgb.max['params']
xgb_max_params['max_depth']=int(xgb_max_params['max_depth'])
xgb_max_params

|   iter    |  target   |    eta    |   gamma   | max_depth | subsample |
-------------------------------------------------------------------------
| [0m1        [0m | [0m0.852    [0m | [0m0.2208   [0m | [0m14.16    [0m | [0m582.5    [0m | [0m0.7554   [0m |
| [0m2        [0m | [0m0.7959   [0m | [0m0.3573   [0m | [0m17.93    [0m | [0m252.0    [0m | [0m0.6036   [0m |
| [95m3        [0m | [95m0.8673   [0m | [95m0.02154  [0m | [95m8.816    [0m | [95m60.72    [0m | [95m0.7284   [0m |
| [95m4        [0m | [95m0.8929   [0m | [95m0.26     [0m | [95m5.57     [0m | [95m1.353e+03[0m | [95m0.7954   [0m |


In [None]:
lgbm_params={
    'n_estimators':(30,100),
    'max_depth':(1,2000),
    'subsample':(0.5,1)
}

In [None]:
def lgbm_bo(n_estimators,max_depth, subsample):
  params={
      'n_estimaotrs':int(round(n_estimators)),
      'max_depth':int(round(max_depth)),
      'subsample':int(round(subsample)),
  }
  model=LGBMClassifier(**params, n_jobs=-1, random_state=42)
  X_train,X_valid,y_train,y_valid=train_test_split(Xcc_train.iloc[:,1:],ycc_train,test_size=0.2)
  model.fit(X_train,y_train)

  score=f1_score(y_valid,model.predict(X_valid),average='micro')
  return score

In [None]:
BO_lgbm = BayesianOptimization(f=lgbm_bo,pbounds=lgbm_params,random_state=3,verbose=2)


# Bayesian Optimization을 실행해보세요
BO_lgbm.maximize(init_points=7,n_iter=100)
lgbm_max_params=BO_lgbm.max['params']
lgbm_max_params['n_estimators']=int(lgbm_max_params['n_estimators'])
lgbm_max_params['max_depth']=int(lgbm_max_params['max_depth'])
lgbm_max_params

In [None]:
LGBM = LGBMClassifier(**lgbm_max_params)
XGB = XGBClassifier(**xgb_max_params)
RF = RandomForestClassifier(**rf_max_params)
from sklearn.ensemble import VotingClassifier
# VotingClassifier 정의
VC = VotingClassifier(estimators=[('rf',RF),('xgb',XGB),('lgbm',LGBM)],voting='soft')

In [None]:
VC.fit(Xcc_train,ycc_train)

In [None]:
pred=VC.predict(test_x.iloc[:,1:])

In [None]:
submit = pd.read_csv('/content/drive/MyDrive/스마트공장/sample_submission (3).csv')

In [None]:
submit['Y_Class']=pred

In [None]:
submit['Y_Class'].value_counts()

In [None]:
submit.to_csv("0227_pred3.csv",index=False)