<a href="https://colab.research.google.com/github/h-wi/lg-aimers-hackathon/blob/main/%5BBaseline%5D_RandomForest_with_PCA_(0_19).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import

In [None]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [None]:
def seed_everything(seed):
    """Seed the RNGs this notebook relies on.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    stores the seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(37)  # fix the seed for the whole notebook

In [None]:
# Colab-only: mount Google Drive at /content/drive; the data directory used below lives under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Data Load

A. Y_Quality 무시하고 그냥 Y_Class 예측 모형 만들기

**B. Y_Quality를 예측하고 Spec 범위를 활용해서 Y_Class를 생성하기**

C. A.와 B.를 모두 활용하기



In [None]:
cd /content/drive/MyDrive/aimers

/content/drive/MyDrive/aimers


In [None]:
# Read the train and test tables from the current working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv'))

In [None]:
# Distinct production lines present in the training data.
train_df['LINE'].unique()

array(['T050304', 'T050307', 'T100304', 'T100306', 'T010306', 'T010305'],
      dtype=object)

In [None]:
# Distinct production lines present in the test data.
test_df['LINE'].unique()

array(['T100306', 'T100304', 'T010305', 'T010306', 'T050304', 'T050307'],
      dtype=object)

In [None]:
# Each production LINE uses a different set of variables, so build one DataFrame per LINE.

train_grouped = train_df.groupby('LINE')
test_grouped = test_df.groupby('LINE')

# Independent copies of every group, keyed by the LINE value.
train_by_line = {line: rows.copy() for line, rows in train_grouped}
test_by_line = {line: rows.copy() for line, rows in test_grouped}

# Short names used throughout the notebook: tXX_df (train) / testXX_df (test).
t04_df, test04_df = train_by_line['T050304'], test_by_line['T050304']
t07_df, test07_df = train_by_line['T050307'], test_by_line['T050307']
t14_df, test14_df = train_by_line['T100304'], test_by_line['T100304']
t16_df, test16_df = train_by_line['T100306'], test_by_line['T100306']
t06_df, test06_df = train_by_line['T010306'], test_by_line['T010306']
t05_df, test05_df = train_by_line['T010305'], test_by_line['T010305']

In [None]:
# Remove, per LINE, every column that holds nothing but missing values.
# Each LINE gets its own model, and a LINE's test frame is only ever scored by that model.

t04_df, t07_df, t14_df, t16_df, t06_df, t05_df = (
    frame.dropna(axis=1, how='all')
    for frame in (t04_df, t07_df, t14_df, t16_df, t06_df, t05_df)
)

test04_df, test07_df, test14_df, test16_df, test06_df, test05_df = (
    frame.dropna(axis=1, how='all')
    for frame in (test04_df, test07_df, test14_df, test16_df, test06_df, test05_df)
)

In [None]:
# Verify that each frame contains a single LINE and show which PRODUCT_CODEs that LINE produces.

for frame in (t04_df, t07_df, t14_df, t16_df, t06_df, t05_df):
    print(frame['LINE'].unique(), frame['PRODUCT_CODE'].unique())

print('########################################')

for frame in (test04_df, test07_df, test14_df, test16_df, test06_df, test05_df):
    print(frame['LINE'].unique(), frame['PRODUCT_CODE'].unique())

['T050304'] ['A_31']
['T050307'] ['A_31']
['T100304'] ['T_31' 'O_31']
['T100306'] ['T_31' 'O_31']
['T010306'] ['A_31']
['T010305'] ['A_31']
########################################
['T050304'] ['A_31']
['T050307'] ['A_31']
['T100304'] ['T_31' 'O_31']
['T100306'] ['T_31' 'O_31']
['T010306'] ['A_31']
['T010305'] ['A_31']


## Data Pre-processing

In [None]:
# Let pandas print up to 5000 rows so the per-column missing-value counts are shown in full.
pd.set_option('display.max_rows', 5000)

A. 각 LINE, PRODUCT_CODE별로 데이터를 분할하고 따로 모형 적합하기

**필요한 변수만 활용하기 때문에 메모리/연산 효율적
각 LINE, PRODUCT_CODE별 모형 최적화 가능**

### 1. Line **T050304(t04)** data preprocessing


#### Divide dataframe and Fill in missing values

Reference: https://m.blog.naver.com/tjdrud1323/221720259834

In [None]:
# Target is Y_Class; identifier, timestamp and both target columns are removed from the train features.
t04_train_y = t04_df['Y_Class']
t04_train_x = t04_df.drop(['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'], axis=1)

# Test has no target columns, so only identifier and timestamp are removed.
t04_test_x = test04_df.drop(['PRODUCT_ID', 'TIMESTAMP'], axis=1)

In [None]:
# Qualitative -> quantitative:
# integer-encode the production line and the product code.

qual_col = ['LINE', 'PRODUCT_CODE']

for col in qual_col:
    encoder = LabelEncoder().fit(t04_train_x[col])
    t04_train_x[col] = encoder.transform(t04_train_x[col])

    # Labels seen only in test are appended to the fitted classes so they can still be encoded.
    unseen = [label for label in np.unique(t04_test_x[col]) if label not in encoder.classes_]
    if unseen:
        encoder.classes_ = np.append(encoder.classes_, unseen)
    t04_test_x[col] = encoder.transform(t04_test_x[col])
print('Done.')

Done.


In [None]:
# Count missing values per train column.

# train_x: missing counts out of 78 rows in total

# X_2063 ~ X_2092: 39  # X_2100 ~ X_2407: 39  # X_2543 ~ X_2627: 39 => require at least 40 present values

t04_train_x.isnull().sum()

In [None]:
# Keep only train columns with at most 38 missing values (per the note above: 78 rows, so >= 40 present).
# Columns with 39 missing values are dropped for now; reinstate them if the score is poor.
t04_train_x = t04_train_x.dropna(axis=1, thresh=len(t04_train_x) - 38)

# Give test exactly the train feature columns, in the same order:
# test-only columns are discarded, and a train column absent from test is added as NaN
# (the median imputation that follows fills it) so both matrices always have the same width.
t04_test_x = t04_test_x.reindex(columns=t04_train_x.columns)

In [None]:
# Fill the gaps left in the retained columns.
# Median imputation first; switch to KNN imputation if this underperforms.
# The medians come from train only and are reused for test (using test medians would be data leakage).

t04_train_median = t04_train_x.median()

t04_train_x, t04_test_x = (
    frame.fillna(t04_train_median) for frame in (t04_train_x, t04_test_x)
)

#### Debugging

In [None]:
# ---- DEBUG: print a reminder if imputation left NaNs or train/test widths differ ----

if any(frame.isna().to_numpy().any() for frame in (t04_train_x, t04_test_x)):
    print('missing value check again')

if t04_train_x.shape[1] != t04_test_x.shape[1]:
    print('column # check again')

#### Feature selection
Standardize the features with StandardScaler, then reduce dimensionality with PCA.

In [None]:
from sklearn.preprocessing import StandardScaler  # standardization utility

# Learn mean/scale on train only and apply the same transform to test:
# the PCA that follows is fitted on standardized train data, so test must be on that scale too.
t04_scaler = StandardScaler().fit(t04_train_x)

# Test stays a DataFrame with its original row index because the submission step uses that index.
t04_test_x = pd.DataFrame(
    t04_scaler.transform(t04_test_x),
    index=t04_test_x.index,
    columns=t04_test_x.columns,
)
t04_train_x = t04_scaler.transform(t04_train_x)

In [None]:
from sklearn.decomposition import PCA

# Fit PCA on the train matrix and project train and test onto the same components.
# NOTE(review): PCA is fitted on standardized train data, so t04_test_x must be on that same scale here — verify.
t04_pca = PCA(n_components=60) # number of principal components to keep
t04_train_new = t04_pca.fit_transform(t04_train_x)
t04_test_new = t04_pca.transform(t04_test_x)



In [None]:
# Fraction of the total variance retained by the kept components.
sum(t04_pca.explained_variance_ratio_)

0.9553499773797567

### 2. **Line T050307(t07)** data preprocessing


#### Divide dataframe and Fill in missing values

In [None]:
# Target is Y_Class; identifier, timestamp and both target columns are removed from the train features.
t07_train_y = t07_df['Y_Class']
t07_train_x = t07_df.drop(['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'], axis=1)

# Test has no target columns, so only identifier and timestamp are removed.
t07_test_x = test07_df.drop(['PRODUCT_ID', 'TIMESTAMP'], axis=1)

In [None]:
# Qualitative -> quantitative:
# integer-encode the production line and the product code.

qual_col = ['LINE', 'PRODUCT_CODE']

for col in qual_col:
    encoder = LabelEncoder().fit(t07_train_x[col])
    t07_train_x[col] = encoder.transform(t07_train_x[col])

    # Labels seen only in test are appended to the fitted classes so they can still be encoded.
    unseen = [label for label in np.unique(t07_test_x[col]) if label not in encoder.classes_]
    if unseen:
        encoder.classes_ = np.append(encoder.classes_, unseen)
    t07_test_x[col] = encoder.transform(t07_test_x[col])
print('Done.')

Done.


In [None]:
# Count missing values per train column.

# train_x: X_1167 ~ X_1171: 28, X_1274 ~ X_1278: 28 (presumably missing-value counts — confirm)
t07_train_x.isnull().sum()

In [None]:
# thresh=24 keeps the train columns that have at least 24 non-missing values.
# NOTE(review): the original note said 28 of the 42 rows must be present, which does not
# match thresh=24 — confirm the intended cut-off.
t07_train_x = t07_train_x.dropna(axis=1, thresh=24)

# Give test exactly the train feature columns, in the same order:
# test-only columns are discarded, and a train column absent from test is added as NaN
# (the median imputation that follows fills it) so both matrices always have the same width.
t07_test_x = t07_test_x.reindex(columns=t07_train_x.columns)

In [None]:
# Fill the gaps left in the retained columns.
# Median imputation first; switch to KNN imputation if this underperforms.
# The medians come from train only and are reused for test.

t07_train_median = t07_train_x.median()

t07_train_x, t07_test_x = (
    frame.fillna(t07_train_median) for frame in (t07_train_x, t07_test_x)
)

#### Debugging

In [None]:
# ---- DEBUG: print a reminder if imputation left NaNs or train/test widths differ ----

if any(frame.isna().to_numpy().any() for frame in (t07_train_x, t07_test_x)):
    print('missing value check again')

if t07_train_x.shape[1] != t07_test_x.shape[1]:
    print('column # check again')

#### Feature selection
Standardize the features with StandardScaler, then reduce dimensionality with PCA.

In [None]:
from sklearn.preprocessing import StandardScaler  # standardization utility

# Learn mean/scale on train only and apply the same transform to test:
# the PCA that follows is fitted on standardized train data, so test must be on that scale too.
t07_scaler = StandardScaler().fit(t07_train_x)

# Test stays a DataFrame with its original row index because the submission step uses that index.
t07_test_x = pd.DataFrame(
    t07_scaler.transform(t07_test_x),
    index=t07_test_x.index,
    columns=t07_test_x.columns,
)
t07_train_x = t07_scaler.transform(t07_train_x)

In [None]:
from sklearn.decomposition import PCA

# Fit PCA on the train matrix and project train and test onto the same components.
# NOTE(review): PCA is fitted on standardized train data, so t07_test_x must be on that same scale here — verify.
t07_pca = PCA(n_components=33) # number of principal components to keep
t07_train_new = t07_pca.fit_transform(t07_train_x)
t07_test_new = t07_pca.transform(t07_test_x)



In [None]:
# Fraction of the total variance retained by the kept components.
sum(t07_pca.explained_variance_ratio_)

0.9514601099470498

### 3. **Line T100304(t14)** data preprocessing


#### Divide dataframe and Fill in missing values

In [None]:
# Target is Y_Class; identifier, timestamp and both target columns are removed from the train features.
t14_train_y = t14_df['Y_Class']
t14_train_x = t14_df.drop(['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'], axis=1)

# Test has no target columns, so only identifier and timestamp are removed.
t14_test_x = test14_df.drop(['PRODUCT_ID', 'TIMESTAMP'], axis=1)

In [None]:
# Qualitative -> quantitative:
# integer-encode the production line and the product code.

qual_col = ['LINE', 'PRODUCT_CODE']

for col in qual_col:
    encoder = LabelEncoder().fit(t14_train_x[col])
    t14_train_x[col] = encoder.transform(t14_train_x[col])

    # Labels seen only in test are appended to the fitted classes so they can still be encoded.
    unseen = [label for label in np.unique(t14_test_x[col]) if label not in encoder.classes_]
    if unseen:
        encoder.classes_ = np.append(encoder.classes_, unseen)
    t14_test_x[col] = encoder.transform(t14_test_x[col])
print('Done.')

Done.


In [None]:
# Count missing values per train column.

t14_train_x.isnull().sum()

In [None]:
# Keep only the train columns that have at least 100 non-missing values.
t14_train_x = t14_train_x.dropna(axis=1, thresh=100)

# Give test exactly the train feature columns, in the same order:
# test-only columns are discarded, and a train column absent from test is added as NaN
# (the median imputation that follows fills it) so both matrices always have the same width.
t14_test_x = t14_test_x.reindex(columns=t14_train_x.columns)

In [None]:
# Fill the gaps left in the retained columns.
# Median imputation first; switch to KNN imputation if this underperforms.
# The medians come from train only and are reused for test.

t14_train_median = t14_train_x.median()

t14_train_x, t14_test_x = (
    frame.fillna(t14_train_median) for frame in (t14_train_x, t14_test_x)
)

#### Debugging

In [None]:
# ---- DEBUG: print a reminder if imputation left NaNs or train/test widths differ ----

if any(frame.isna().to_numpy().any() for frame in (t14_train_x, t14_test_x)):
    print('missing value check again')

if t14_train_x.shape[1] != t14_test_x.shape[1]:
    print('column # check again')

#### Feature selection
Standardize the features with StandardScaler, then reduce dimensionality with PCA.

In [None]:
from sklearn.preprocessing import StandardScaler  # standardization utility

# Learn mean/scale on train only and apply the same transform to test:
# the PCA that follows is fitted on standardized train data, so test must be on that scale too.
t14_scaler = StandardScaler().fit(t14_train_x)

# Test stays a DataFrame with its original row index because the submission step uses that index.
t14_test_x = pd.DataFrame(
    t14_scaler.transform(t14_test_x),
    index=t14_test_x.index,
    columns=t14_test_x.columns,
)
t14_train_x = t14_scaler.transform(t14_train_x)

In [None]:
from sklearn.decomposition import PCA

# Fit PCA on the train matrix and project train and test onto the same components.
# NOTE(review): PCA is fitted on standardized train data, so t14_test_x must be on that same scale here — verify.
t14_pca = PCA(n_components=100) # number of principal components to keep
t14_train_new = t14_pca.fit_transform(t14_train_x)
t14_test_new = t14_pca.transform(t14_test_x)



In [None]:
# Fraction of the total variance retained by the kept components.
sum(t14_pca.explained_variance_ratio_)

0.9528195171806393

### 4. Line **T100306(t16)** data preprocessing


#### Divide dataframe and Fill in missing values

In [None]:
# Target is Y_Class; identifier, timestamp and both target columns are removed from the train features.
t16_train_y = t16_df['Y_Class']
t16_train_x = t16_df.drop(['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'], axis=1)

# Test has no target columns, so only identifier and timestamp are removed.
t16_test_x = test16_df.drop(['PRODUCT_ID', 'TIMESTAMP'], axis=1)

In [None]:
# Qualitative -> quantitative:
# integer-encode the production line and the product code.

qual_col = ['LINE', 'PRODUCT_CODE']

for col in qual_col:
    encoder = LabelEncoder().fit(t16_train_x[col])
    t16_train_x[col] = encoder.transform(t16_train_x[col])

    # Labels seen only in test are appended to the fitted classes so they can still be encoded.
    unseen = [label for label in np.unique(t16_test_x[col]) if label not in encoder.classes_]
    if unseen:
        encoder.classes_ = np.append(encoder.classes_, unseen)
    t16_test_x[col] = encoder.transform(t16_test_x[col])
print('Done.')

Done.


In [None]:
# Count missing values per train column.
t16_train_x.isnull().sum()

In [None]:
# Keep only the train columns that have at least 100 non-missing values.
t16_train_x = t16_train_x.dropna(axis=1, thresh=100)

# Give test exactly the train feature columns, in the same order:
# test-only columns are discarded, and a train column absent from test is added as NaN
# (the median imputation that follows fills it) so both matrices always have the same width.
t16_test_x = t16_test_x.reindex(columns=t16_train_x.columns)

In [None]:
# Fill the gaps left in the retained columns.
# Median imputation first; switch to KNN imputation if this underperforms.
# The medians come from train only and are reused for test.

t16_train_median = t16_train_x.median()

# Bind the imputed frame back to t16_train_x so the NaN check, scaler and PCA
# that follow operate on the filled data.
t16_train_x = t16_train_x.fillna(t16_train_median)
t16_test_x = t16_test_x.fillna(t16_train_median)

#### Debugging

In [None]:
# ---- DEBUG: print a reminder if imputation left NaNs or train/test widths differ ----

if any(frame.isna().to_numpy().any() for frame in (t16_train_x, t16_test_x)):
    print('missing value check again')

if t16_train_x.shape[1] != t16_test_x.shape[1]:
    print('column # check again')

#### Feature selection
Standardize the features with StandardScaler, then reduce dimensionality with PCA.

In [None]:
from sklearn.preprocessing import StandardScaler  # standardization utility

# Learn mean/scale on train only and apply the same transform to test:
# the PCA that follows is fitted on standardized train data, so test must be on that scale too.
t16_scaler = StandardScaler().fit(t16_train_x)

# Test stays a DataFrame with its original row index because the submission step uses that index.
t16_test_x = pd.DataFrame(
    t16_scaler.transform(t16_test_x),
    index=t16_test_x.index,
    columns=t16_test_x.columns,
)
t16_train_x = t16_scaler.transform(t16_train_x)

In [None]:
from sklearn.decomposition import PCA

# Fit PCA on the train matrix and project train and test onto the same components.
# NOTE(review): PCA is fitted on standardized train data, so t16_test_x must be on that same scale here — verify.
t16_pca = PCA(n_components=100) # number of principal components to keep
t16_train_new = t16_pca.fit_transform(t16_train_x)
t16_test_new = t16_pca.transform(t16_test_x)

# Fraction of the total variance retained by the kept components.
print(sum(t16_pca.explained_variance_ratio_))

0.9536603371200982




### 5. Line **T010306(t06)** data preprocessing


#### Divide dataframe and Fill in missing values

In [None]:
# Target is Y_Class; identifier, timestamp and both target columns are removed from the train features.
t06_train_y = t06_df['Y_Class']
t06_train_x = t06_df.drop(['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'], axis=1)

# Test has no target columns, so only identifier and timestamp are removed.
t06_test_x = test06_df.drop(['PRODUCT_ID', 'TIMESTAMP'], axis=1)

In [None]:
# Qualitative -> quantitative:
# integer-encode the production line and the product code.

qual_col = ['LINE', 'PRODUCT_CODE']

for col in qual_col:
    encoder = LabelEncoder().fit(t06_train_x[col])
    t06_train_x[col] = encoder.transform(t06_train_x[col])

    # Labels seen only in test are appended to the fitted classes so they can still be encoded.
    unseen = [label for label in np.unique(t06_test_x[col]) if label not in encoder.classes_]
    if unseen:
        encoder.classes_ = np.append(encoder.classes_, unseen)
    t06_test_x[col] = encoder.transform(t06_test_x[col])
print('Done.')

Done.


In [None]:
# Count missing values per train column.

t06_train_x.isnull().sum()

In [None]:
# thresh=35 keeps the train columns that have at least 35 non-missing values;
# sparser columns are dropped for now and can be reinstated if the score is poor.
# NOTE(review): the original note mentioned columns with 39 missing values, which looks
# copied from the T050304 section — confirm the intended cut-off for this line.
t06_train_x = t06_train_x.dropna(axis=1, thresh=35)

# Give test exactly the train feature columns, in the same order:
# test-only columns are discarded, and a train column absent from test is added as NaN
# (the median imputation that follows fills it) so both matrices always have the same width.
t06_test_x = t06_test_x.reindex(columns=t06_train_x.columns)

In [None]:
# Fill the gaps left in the retained columns.
# Median imputation first; switch to KNN imputation if this underperforms.
# The medians come from train only and are reused for test.

t06_train_median = t06_train_x.median()

t06_train_x, t06_test_x = (
    frame.fillna(t06_train_median) for frame in (t06_train_x, t06_test_x)
)

#### Debugging

In [None]:
# ---- DEBUG: print a reminder if imputation left NaNs or train/test widths differ ----

if any(frame.isna().to_numpy().any() for frame in (t06_train_x, t06_test_x)):
    print('missing value check again')

if t06_train_x.shape[1] != t06_test_x.shape[1]:
    print('column # check again')

#### Feature selection
Standardize the features with StandardScaler, then reduce dimensionality with PCA.

In [None]:
from sklearn.preprocessing import StandardScaler  # standardization utility

# Learn mean/scale on train only and apply the same transform to test:
# the PCA that follows is fitted on standardized train data, so test must be on that scale too.
t06_scaler = StandardScaler().fit(t06_train_x)

# Test stays a DataFrame with its original row index because the submission step uses that index.
t06_test_x = pd.DataFrame(
    t06_scaler.transform(t06_test_x),
    index=t06_test_x.index,
    columns=t06_test_x.columns,
)
t06_train_x = t06_scaler.transform(t06_train_x)

In [None]:
from sklearn.decomposition import PCA

# Fit PCA on the train matrix and project train and test onto the same components.
t06_pca = PCA(n_components=50) # number of principal components to keep
t06_train_new = t06_pca.fit_transform(t06_train_x)
t06_test_new = t06_pca.transform(t06_test_x)

# Report the variance retained by this line's own PCA (t06_pca, not another line's model).
print(sum(t06_pca.explained_variance_ratio_))

0.9536603371200982




### 6. Line **T010305(t05)** data preprocessing


#### Divide dataframe and Fill in missing values

In [None]:
# Target is Y_Class; identifier, timestamp and both target columns are removed from the train features.
t05_train_y = t05_df['Y_Class']
t05_train_x = t05_df.drop(['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'], axis=1)

# Test has no target columns, so only identifier and timestamp are removed.
t05_test_x = test05_df.drop(['PRODUCT_ID', 'TIMESTAMP'], axis=1)

In [None]:
# Qualitative -> quantitative:
# integer-encode the production line and the product code.

qual_col = ['LINE', 'PRODUCT_CODE']

for col in qual_col:
    encoder = LabelEncoder().fit(t05_train_x[col])
    t05_train_x[col] = encoder.transform(t05_train_x[col])

    # Labels seen only in test are appended to the fitted classes so they can still be encoded.
    unseen = [label for label in np.unique(t05_test_x[col]) if label not in encoder.classes_]
    if unseen:
        encoder.classes_ = np.append(encoder.classes_, unseen)
    t05_test_x[col] = encoder.transform(t05_test_x[col])
print('Done.')

Done.


In [None]:
# Count missing values per train column.

t05_train_x.isnull().sum()

In [None]:
# Keep only the train columns that have at least 30 non-missing values.
t05_train_x = t05_train_x.dropna(axis=1, thresh=30)

# Give test exactly the train feature columns, in the same order:
# test-only columns are discarded, and a train column absent from test is added as NaN
# (the median imputation that follows fills it) so both matrices always have the same width.
t05_test_x = t05_test_x.reindex(columns=t05_train_x.columns)

In [None]:
# Fill the gaps left in the retained columns.
# Median imputation first; switch to KNN imputation if this underperforms.
# The medians come from train only and are reused for test.

t05_train_median = t05_train_x.median()

t05_train_x, t05_test_x = (
    frame.fillna(t05_train_median) for frame in (t05_train_x, t05_test_x)
)

#### Debugging

In [None]:
# ---- DEBUG: print a reminder if imputation left NaNs or train/test widths differ ----

if any(frame.isna().to_numpy().any() for frame in (t05_train_x, t05_test_x)):
    print('missing value check again')

if t05_train_x.shape[1] != t05_test_x.shape[1]:
    print('column # check again')

#### Feature selection
Standardize the features with StandardScaler, then reduce dimensionality with PCA.

In [None]:
from sklearn.preprocessing import StandardScaler  # standardization utility

# Learn mean/scale on train only and apply the same transform to test:
# the PCA that follows is fitted on standardized train data, so test must be on that scale too.
t05_scaler = StandardScaler().fit(t05_train_x)

# Test stays a DataFrame with its original row index because the submission step uses that index.
t05_test_x = pd.DataFrame(
    t05_scaler.transform(t05_test_x),
    index=t05_test_x.index,
    columns=t05_test_x.columns,
)
t05_train_x = t05_scaler.transform(t05_train_x)

In [None]:
from sklearn.decomposition import PCA

# Fit PCA on the train matrix and project train and test onto the same components.
# NOTE(review): PCA is fitted on standardized train data, so t05_test_x must be on that same scale here — verify.
t05_pca = PCA(n_components=45) # number of principal components to keep
t05_train_new = t05_pca.fit_transform(t05_train_x)
t05_test_new = t05_pca.transform(t05_test_x)

# Fraction of the total variance retained by the kept components.
print(sum(t05_pca.explained_variance_ratio_))

0.9516700632402392




## Classification Model Fit

먼저 fit하고 XAI로 접근해서 X 변수 해석하기

In [None]:
# One random forest per production line, each trained on that line's PCA features
# and seeded with the same random_state.
RF04, RF07, RF14, RF16, RF06, RF05 = (
    RandomForestClassifier(random_state=37).fit(features, labels)
    for features, labels in (
        (t04_train_new, t04_train_y),
        (t07_train_new, t07_train_y),
        (t14_train_new, t14_train_y),
        (t16_train_new, t16_train_y),
        (t06_train_new, t06_train_y),
        (t05_train_new, t05_train_y),
    )
)
print('Done.')

Done.


## Inference

Y_Quality 값으로 오차 보정하기.

In [None]:
# Score each line's projected test features with the forest trained for that line.
t04_preds, t07_preds, t14_preds, t16_preds, t06_preds, t05_preds = (
    forest.predict(features)
    for forest, features in (
        (RF04, t04_test_new),
        (RF07, t07_test_new),
        (RF14, t14_test_new),
        (RF16, t16_test_new),
        (RF06, t06_test_new),
        (RF05, t05_test_new),
    )
)
print('Done.')

Done.


## Submit

In [None]:
# Load the sample submission; its Y_Class column receives the predictions in the cells below.
submit = pd.read_csv('./sample_submission.csv')

In [None]:
# Look up the current Y_Class at the position given by the first T010305 test row's index label.
submit['Y_Class'].iloc[t05_test_x.index[0]]

0

In [None]:
# Write each line's predictions into the submission rows that belong to that line.
# The per-line test frames keep the row labels of test_df, and those labels are used as
# row positions in `submit` (assumes sample_submission.csv lists the rows of test.csv in
# the same order — TODO confirm).
y_class_pos = submit.columns.get_loc('Y_Class')

for line_test_x, line_preds in (
    (t04_test_x, t04_preds),
    (t07_test_x, t07_preds),
    (t14_test_x, t14_preds),
    (t16_test_x, t16_preds),
    (t06_test_x, t06_preds),
    (t05_test_x, t05_preds),
):
    # A single .iloc assignment on the frame itself: no chained indexing, so the values are
    # written to `submit` rather than to a possible temporary copy of the column.
    submit.iloc[line_test_x.index.to_numpy(), y_class_pos] = line_preds

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [None]:
# Write the submission file to the working directory, without the DataFrame index column.
submit.to_csv('./baseline_submission.csv', index=False)