In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns


# export_graphviz: 나무 구조 생성 및 저장 
from sklearn.tree import export_graphviz
# graphviz : 나무 구조 시각화  (.dot 확장자 파일 불러오기 등)
import graphviz
# 데이터 분할:train, test
from sklearn.model_selection import train_test_split
# 예측/회귀 Random Forest
from sklearn.ensemble import RandomForestRegressor
# 최적 모델, 파라미터 탐색
from sklearn.model_selection import GridSearchCV
# 예측/회귀 Gradient Boosting
from sklearn.ensemble import GradientBoostingRegressor


import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler,MinMaxScaler,RobustScaler

# 선형 회귀분석모델 formula(y ~ X1 + X2 + ...)
import statsmodels.formula.api as smf
# 회귀분석의 정규성 확인
from statsmodels.api import qqplot
# 샘플링 : Over-sampling 등
from imblearn.over_sampling import SMOTE

In [7]:
import warnings

# Suppress every warning for the remainder of the kernel session.
warnings.filterwarnings(action='ignore')

# Plot font; presumably chosen so Korean labels render correctly — confirm the
# font is installed on the machine running the notebook.
plt.rcParams['font.family'] = 'NanumGothic ECO'
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

In [12]:
# Load the preprocessed scale dataset from CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# re-runs on a host where this exact file exists.
df_raw = pd.read_csv(filepath_or_buffer="/home/piai/preprocessed_scale.csv")
# Bare last expression so the notebook renders the frame.
df_raw

Unnamed: 0,scale,steel_kind,pt_thick,pt_width,pt_length,hsb,fur_heat_temp,fur_heat_time,fur_soak_temp,fur_soak_time,rolling_method,rolling_temp,descaling_count,fur_preheat_time,furnace
0,0,T,32,3700,15100,적용,1144,116,1133,59,TMCP(온도제어),934,8,84,1호기1열
1,0,T,32,3700,15100,적용,1144,122,1135,53,TMCP(온도제어),937,8,63,1호기2열
2,0,T,33,3600,19200,적용,1129,116,1121,55,TMCP(온도제어),889,8,87,2호기1열
3,0,T,33,3600,19200,적용,1152,125,1127,68,TMCP(온도제어),885,8,73,2호기2열
4,0,T,38,3100,13300,적용,1140,134,1128,48,TMCP(온도제어),873,8,64,3호기1열
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
989,0,C,19,3400,41500,적용,1142,55,1151,86,CR(제어압연),948,10,62,1호기2열
990,0,C,19,3400,41500,적용,1142,55,1151,86,CR(제어압연),948,10,68,1호기2열
991,0,C,17,3400,43700,적용,1169,65,1163,77,CR(제어압연),948,10,85,2호기2열
992,0,C,17,3400,43700,적용,1169,65,1163,77,CR(제어압연),948,10,62,2호기2열


In [14]:
# One-hot encode the string-typed columns of df_raw; numeric columns pass
# through unchanged and each category becomes its own indicator column.
df_raw_dummy = pd.get_dummies(data=df_raw)
# Preview the first five encoded rows.
df_raw_dummy.head(n=5)

Unnamed: 0,scale,pt_thick,pt_width,pt_length,fur_heat_temp,fur_heat_time,fur_soak_temp,fur_soak_time,rolling_temp,descaling_count,...,hsb_미적용,hsb_적용,rolling_method_CR(제어압연),rolling_method_TMCP(온도제어),furnace_1호기1열,furnace_1호기2열,furnace_2호기1열,furnace_2호기2열,furnace_3호기1열,furnace_3호기2열
0,0,32,3700,15100,1144,116,1133,59,934,8,...,0,1,0,1,1,0,0,0,0,0
1,0,32,3700,15100,1144,122,1135,53,937,8,...,0,1,0,1,0,1,0,0,0,0
2,0,33,3600,19200,1129,116,1121,55,889,8,...,0,1,0,1,0,0,1,0,0,0
3,0,33,3600,19200,1152,125,1127,68,885,8,...,0,1,0,1,0,0,0,1,0,0
4,0,38,3100,13300,1140,134,1128,48,873,8,...,0,1,0,1,0,0,0,0,1,0


In [15]:
# Separate the target ("scale") from the explanatory variables.
df_raw_y = df_raw_dummy["scale"]
df_raw_x = df_raw_dummy.drop(columns="scale")

# Hold out 30% of the rows as the test set; the fixed seed keeps the split
# reproducible across runs.
df_train_x, df_test_x, df_train_y, df_test_y = train_test_split(
    df_raw_x,
    df_raw_y,
    test_size=0.3,
    random_state=1234,
)

# Report the feature-matrix shape before and after the split.
print("분할 전 설명변수 데이터 :", df_raw_x.shape)
print("분할 후 설명변수 데이터 :Train", df_train_x.shape, "  Test",df_test_x.shape)

분할 전 설명변수 데이터 : (994, 22)
분할 후 설명변수 데이터 :Train (695, 22)   Test (299, 22)


In [16]:
# Check the class balance of the target variable.
# Absolute frequency of each value of "scale".
print(df_raw.value_counts(subset=["scale"]),"\n")
# Relative frequency of "scale == 1" (True) versus everything else (False).
# Series.value_counts(normalize=True) is used rather than passing a boolean
# Series as DataFrame.value_counts' `subset` (documented as column labels)
# and dividing by the row count by hand.
print("scale = 1 비율  ", df_raw["scale"].eq(1).value_counts(normalize=True))

scale
0        684
1        310
dtype: int64 

scale = 1 비율   scale
False    0.688129
True     0.311871
dtype: float64


In [18]:
# Configure SMOTE; 'auto' resamples every class except the majority class, and
# the seed fixes which synthetic samples are generated.
sm = SMOTE(random_state=1234, sampling_strategy='auto')

# Over-sample the training split only, so the test split stays untouched.
# NOTE(review): plain SMOTE interpolates every column, including the one-hot
# indicator columns produced by get_dummies — consider whether SMOTENC is the
# better fit here; confirm with the modelling owner.
x_resampled, y_resampled = sm.fit_resample(df_train_x, df_train_y)

# Class counts before resampling, then the shapes after resampling.
print('Over-Sampling 전:\n',df_train_y.value_counts(),"\n")
print('Over-Sampling 후 Train X: {}'.format(x_resampled.shape))
print('Over-Sampling 후 Train Y: {} \n'.format(y_resampled.shape))

# Per-class counts after resampling (vectorised count of matching labels).
print("Over-Sampling 후 '1':{}".format((y_resampled == 1).sum()))
print("Over-Sampling 후 '0':{}".format((y_resampled == 0).sum()))

Over-Sampling 전:
 0    473
1    222
Name: scale, dtype: int64 

Over-Sampling 후 Train X: (946, 22)
Over-Sampling 후 Train Y: (946,) 

Over-Sampling 후 '1':473
Over-Sampling 후 '0':473
