In [3]:
!pip install imbalanced-learn

Collecting imbalanced-learnNote: you may need to restart the kernel to use updated packages.
  Downloading imbalanced_learn-0.7.0-py3-none-any.whl (167 kB)





Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.7.0


In [50]:
import warnings
warnings.filterwarnings(action='ignore')
import imblearn
import pandas as pd
import numpy as np
import sklearn
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# functions

In [68]:
#모델링 함수
def modeling(model,x_train,x_test,y_train,y_test):
    model.fit(x_train,y_train)
    pred = model.predict(x_test)
    metrics(y_test,pred)

#평가 지표
def metrics(y_test,pred):
    accuracy = accuracy_score(y_test,pred)
    precision = precision_score(y_test,pred)
    recall = recall_score(y_test,pred)    
    #print('정확도 : {0:.4f}, 정밀도 : {1:.4f}, 재현율 : {2:.4f}'.format(accuracy,precision,recall))
    print("최종 : {}".format((recall*0.5 + accuracy*0.5)*100))

# Dataload/Preprocessing

In [69]:
# uci-secom.csv를 이용해 Pass/Fail를 예측하는 모델을 만든후
data = pd.read_csv("data/uci-secom.csv")
# 결측치 0으로
data = data.replace(np.NaN, 0)
# Time 열 제거
data = data.drop(columns = ['Time'], axis = 1)

#----preprocessing----
x = data.drop(columns = ['Pass/Fail'], axis = 1)
y = data['Pass/Fail'].to_numpy().ravel()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)
# StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# Sampling
smote = SMOTE(random_state=0)
ros = RandomOverSampler()
rus = RandomUnderSampler()
#fit_resample
x_train_smote,y_train_smote = smote.fit_sample(x_train,y_train)
x_train_ros,y_train_ros = ros.fit_resample(x_train,y_train)
x_train_rus,y_train_rus = rus.fit_resample(x_train,y_train)

# oversampled_data = pd.DataFrame(oversampled_data)

In [70]:
print(x_train.shape[0],x_train_smote.shape[0],x_train_ros.shape[0],x_train_rus.shape[0])

1253 2324 2324 182


# Model

In [71]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

models = []
models.append(('LR', LogisticRegression(max_iter =5000))) #로지스틱 회귀모델 
models.append(('LDA', LinearDiscriminantAnalysis()))  # LDA 모델
models.append(('KNN', KNeighborsClassifier()))  # KNN 모델
models.append(('CART', DecisionTreeClassifier()))  # 의사결정트리 모델
models.append(('NB', GaussianNB()))  # 가우시안 나이브 베이즈 모델
models.append(('RF', RandomForestClassifier()))  # 랜덤포레스트 모델
models.append(('SVM', SVC(gamma='auto')))  # SVM 모델
models.append(('XGB', XGBClassifier()))  # XGB 모델

In [72]:
models

[('LR', LogisticRegression(max_iter=5000)),
 ('LDA', LinearDiscriminantAnalysis()),
 ('KNN', KNeighborsClassifier()),
 ('CART', DecisionTreeClassifier()),
 ('NB', GaussianNB()),
 ('RF', RandomForestClassifier()),
 ('SVM', SVC(gamma='auto')),
 ('XGB',
  XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
                colsample_bynode=None, colsample_bytree=None, gamma=None,
                gpu_id=None, importance_type='gain', interaction_constraints=None,
                learning_rate=None, max_delta_step=None, max_depth=None,
                min_child_weight=None, missing=nan, monotone_constraints=None,
                n_estimators=100, n_jobs=None, num_parallel_tree=None,
                random_state=None, reg_alpha=None, reg_lambda=None,
                scale_pos_weight=None, subsample=None, tree_method=None,
                validate_parameters=None, verbosity=None))]

In [73]:
# original
for i in range(len(models)):
    model = models[i][-1]
    print(models[i][0])
    modeling(model, x_train ,x_test, y_train ,y_test)

LR
최종 : 47.795198432141106
LDA
최종 : 56.76139147476727
KNN
최종 : 47.92993630573248
CART
최종 : 48.59137677609016
NB
최종 : 46.10485056344929
RF
최종 : 47.92993630573248
SVM
최종 : 47.92993630573248
XGB
최종 : 47.77070063694268


In [74]:
# smote
for i in range(len(models)):
    model = models[i][-1]
    print(models[i][0])
    modeling(model, x_train_smote, x_test, y_train_smote, y_test)

LR
최종 : 44.928956393924544
LDA
최종 : 51.028907398334155
KNN
최종 : 53.74816266536012
CART
최종 : 45.565899069083784
NB
최종 : 50.40421362077413
RF
최종 : 47.452229299363054
SVM
최종 : 49.54679078882901
XGB
최종 : 50.979911807937285


In [75]:
# ros
for i in range(len(models)):
    model = models[i][-1]
    print(models[i][0])
    modeling(model, x_train_ros, x_test, y_train_ros, y_test)

LR
최종 : 48.615874571288586
LDA
최종 : 50.55120039196473
KNN
최종 : 44.13277804997551
CART
최종 : 43.94904458598726
NB
최종 : 46.58255756981872
RF
최종 : 47.92993630573248
SVM
최종 : 45.85987261146497
XGB
최종 : 51.45761881430671


In [76]:
# rus
for i in range(len(models)):
    model = models[i][-1]
    print(models[i][0])
    modeling(model, x_train_rus, x_test, y_train_rus, y_test)

LR
최종 : 54.44634982851544
LDA
최종 : 53.650171484566386
KNN
최종 : 49.644781969622734
CART
최종 : 58.45173934345909
NB
최종 : 54.05438510534052
RF
최종 : 61.954924056834884
SVM
최종 : 63.54728074473297
XGB
최종 : 62.77560019598236


# Gridsearch

clf1 = DecisionTreeClassifier(max_depth=4)
clf2 = KNeighborsClassifier(n_neighbors=7)
clf3 = SVC(kernel='rbf', probability=True)

models.append(('RF', RandomForestClassifier()))  # 랜덤포레스트 모델
models.append(('SVM', SVC(gamma='auto')))  # SVM 모델
models.append(('XGB', XGBClassifier()))  # XGB 모델

In [192]:
model1 = RandomForestClassifier()
print(model1.get_params().keys())
print()
model2 = XGBClassifier()
print(model2.get_params().keys())
print()
model3 = SVC()
print(model3.get_params().keys())
#parameters = {'C': [0.001, 0.01, 0.1], 'penalty': ['l1', 'l2']}

dict_keys(['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

dict_keys(['objective', 'base_score', 'booster', 'colsample_bylevel', 'colsample_bynode', 'colsample_bytree', 'gamma', 'gpu_id', 'importance_type', 'interaction_constraints', 'learning_rate', 'max_delta_step', 'max_depth', 'min_child_weight', 'missing', 'monotone_constraints', 'n_estimators', 'n_jobs', 'num_parallel_tree', 'random_state', 'reg_alpha', 'reg_lambda', 'scale_pos_weight', 'subsample', 'tree_method', 'validate_parameters', 'verbosity'])

dict_keys(['C', 'break_ties', 'cache_size', 'class_weight', 'coef0', 'decision_function_shape', 'degree', 'gamma', 'kernel', 'max_iter', 'probability', 'random_state', 'shrinking', 'tol', 'verbose'])


In [193]:
params1 = {'max_depth':[1, 10, 100], 'max_features':[1, 10, 100], 'random_state':[0, 1]}
# params2 = {'gamma':[0.5,1,3,5], 'n_neighbors':[13,14, 15]}
# params3 = {'gamma':["auto", "scale"], 'degree':[1, 2, 3], 'random_state':[0, 1]}

In [194]:
GS11 = GridSearchCV(model1, params1, scoring = 'accuracy', cv = 5)
GS11.fit(x_train_rus, y_train_rus)
model1_acc = GS11.score(x_test, y_test)
print(model1_acc)
print(GS11.best_params_)

GS12 = GridSearchCV(model1, params1, scoring = 'recall', cv = 5)
GS12.fit(x_train_rus, y_train_rus)
model1_rec = GS12.score(x_test, y_test)
print(model1_rec)
print(GS12.best_params_)

0.6273885350318471
{'max_depth': 100, 'max_features': 10, 'random_state': 1}
0.5384615384615384
{'max_depth': 1, 'max_features': 1, 'random_state': 0}


In [179]:
GS21 = GridSearchCV(model2, params2, scoring = 'accuracy', cv = 5)
GS21.fit(x_train_rus, y_train_rus)
model2_acc = GS21.score(x_test, y_test)
print(model2_acc)
print(GS21.best_params_)

GS22 = GridSearchCV(model2, params2, scoring = 'recall', cv = 5)
GS22.fit(x_train_rus, y_train_rus)
model2_rec = GS22.score(x_test, y_test)
print(model2_rec)
print(GS22.best_params_)

0.767515923566879
{'leaf_size': 1, 'n_neighbors': 14}
0.46153846153846156
{'leaf_size': 1, 'n_neighbors': 13}


In [180]:
GS31 = GridSearchCV(model3, params3, scoring = 'accuracy', cv = 5)
GS31.fit(x_train_rus, y_train_rus)
model3_acc = GS31.score(x_test, y_test)
print(model3_acc)
print(GS31.best_params_)

GS32 = GridSearchCV(model3, params3, scoring = 'recall', cv = 5)
GS32.fit(x_train_rus, y_train_rus)
model3_rec = GS32.score(x_test, y_test)
print(model3_rec)
print(GS32.best_params_)

0.732484076433121
{'degree': 1, 'gamma': 'auto', 'random_state': 0}
0.6153846153846154
{'degree': 1, 'gamma': 'scale', 'random_state': 0}


# Ensemble

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import fbeta_score, make_scorer
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],voting='soft')

params = {'lr__C': [1.0, 100.0], 'rf__n_estimators': [20, 200]}

grid = GridSearchCV(estimator=eclf, param_grid=params, scoring=['accuracy', 'recall'], cv=5)

grid.fit(x_train_rus,y_train_rus)
pred = grid.predict(x_test)
metrics(y_test,pred)

In [251]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from itertools import product
from sklearn.ensemble import VotingClassifier

#clf1 = DecisionTreeClassifier(max_depth=18, max_features=95, random_state=0)
clf1 = DecisionTreeClassifier()
#clf2 = KNeighborsClassifier(leaf_size=1, n_neighbors=14)
clf2 = KNeighborsClassifier()
#clf3 = SVC(degree=1, gamma='auto', random_state=0, probability=True)
clf3 = SVC(probability=True)
clf4 = XGBClassifier()
#clf5 = RandomForestClassifier(max_depth=100, max_features=10, random_state=1)
clf5 = RandomForestClassifier()

eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2), ('svc', clf3), ('xgb', clf4), ('rfc', clf5)],
                        voting='soft',
                        weights=[1, 50, 100, 100,50])

#clf1 = clf1.fit(x_train_ros,y_train_ros)
#clf2 = clf2.fit(x_train_ros,y_train_ros)
#clf3 = clf3.fit(x_train_ros,y_train_ros)
#clf4 = clf4.fit(x_train_ros,y_train_ros)
#clf5 = clf5.fit(x_train_ros,y_train_ros)
#eclf = eclf.fit(x_train_ros,y_train_ros)

clf1 = clf1.fit(x_train_smote,y_train_smote)
clf2 = clf2.fit(x_train_smote,y_train_smote)
clf3 = clf3.fit(x_train_smote,y_train_smote)
clf4 = clf4.fit(x_train_rus,y_train_rus)
clf5 = clf5.fit(x_train_smote,y_train_smote)
eclf = eclf.fit(x_train_smote,y_train_smote)

#clf1 = clf1.fit(x_train,y_train)
#clf2 = clf2.fit(x_train,y_train)
#clf3 = clf3.fit(x_train,y_train)
#clf4 = clf4.fit(x_train,y_train)
#clf5 = clf5.fit(x_train,y_train)
#eclf = eclf.fit(x_train,y_train)

In [252]:
pred = eclf.predict(x_test)
metrics(y_test,pred)

최종 : 47.29299363057325


# Submit

In [262]:
# manufacture_test_feature.csv의 Pass/Fail를 예측해보세요.
dataframe = pd.read_csv("data/manufacture_test_feature.csv")
# manufacture_test_feature.csv의 순서대로 [index, Pass/Fail]를 가지는 dataframe을
dataframe.head(3)

Unnamed: 0,Time,0,1,2,3,4,5,6,7,8,...,580,581,582,583,584,585,586,587,588,589
0,2008-11-09 15:05:00,2958.83,2488.5,2197.5222,1373.0077,1.1369,100.0,106.0733,0.124,1.3886,...,,,0.4975,0.0142,0.0031,2.8592,0.0246,0.0064,0.0022,25.99
1,2008-10-14 03:21:00,3073.57,2528.59,2217.4111,1032.2836,1.4802,100.0,101.3511,0.1195,1.4234,...,,,0.495,0.0115,0.0028,2.3235,0.0138,0.0162,0.0047,117.7603
2,2008-09-24 10:10:00,2995.73,2515.83,2231.6111,2005.8966,1.2969,100.0,93.7522,0.1234,1.4136,...,0.0047,183.3928,0.5011,0.0103,0.0027,2.0617,0.009,0.0166,0.0047,183.3928


In [263]:
dataframe = dataframe.replace(np.NaN, 0)
# Time 열 제거
dataframe = dataframe.drop(columns = ['Time'], axis = 1)

In [264]:
dataframe.shape

(314, 590)

In [265]:
dataframe.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,580,581,582,583,584,585,586,587,588,589
0,2958.83,2488.5,2197.5222,1373.0077,1.1369,100.0,106.0733,0.124,1.3886,0.0107,...,0.0,0.0,0.4975,0.0142,0.0031,2.8592,0.0246,0.0064,0.0022,25.99
1,3073.57,2528.59,2217.4111,1032.2836,1.4802,100.0,101.3511,0.1195,1.4234,-0.0045,...,0.0,0.0,0.495,0.0115,0.0028,2.3235,0.0138,0.0162,0.0047,117.7603
2,2995.73,2515.83,2231.6111,2005.8966,1.2969,100.0,93.7522,0.1234,1.4136,0.0129,...,0.0047,183.3928,0.5011,0.0103,0.0027,2.0617,0.009,0.0166,0.0047,183.3928


In [269]:
dataframe = sc.transform(dataframe)

In [270]:
model_preds = eclf.predict(dataframe)

In [271]:
model_preds

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1,
       -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1,  1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1,  1, -1

In [244]:
pd.DataFrame(model_preds)

Unnamed: 0,0
0,1
1,-1
2,-1
3,1
4,1
...,...
309,-1
310,-1
311,1
312,1


https://www.kaggle.com/rafjaa/resampling-strategies-for-imbalanced-datasets

https://joonable.tistory.com/27

* test 데이터는 모른다는 게 가정이기 때문입니다. fit 단계에서 데이터 전체의 평균과 분산을 이용하는데 test 데이터는 이걸 알 수 없기 때문에 기존 train 데이터의 평균과 분산을 이용해 정규화 합니다. 그래서 test에서 transform만 수행합니다.
* 추가적인 설명으로는, 학습데이터 세트에서 변환을 위한 기반 설정(예를 들어 학습 데이터 세트의 최대값/최소값등)을 먼저 fit()을 통해서 설정한 뒤에 이를 기반으로 학습 데이터의 transform()을 수행하되 학습 데이터에서 설정된 변환을 위한 기반 설정을 그대로 테스트 데이터에도 적용하기 위해서입니다.

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

https://m.blog.naver.com/PostView.nhn?blogId=gustn3964&logNo=221431933811&proxyReferer=https:%2F%2Fwww.google.com%2F
