# 1.Data Load

In [None]:
import warnings
warnings.filterwarnings('ignore')

# 라이브러리

import pandas as pd
import random
import os
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import VotingClassifier
import lightgbm as lgbm
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier

In [None]:
# Fix the random seeds

def seed_everything(seed):
    """Seed the random number generators used in this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both the standard-library ``random`` module and NumPy's global
    generator with ``seed``.

    NOTE: changing ``PYTHONHASHSEED`` at runtime does not alter hashing in
    the already-running interpreter; it only matters for processes started
    afterwards.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(37)

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper)

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Make /content/sample_data the working directory; the CSV files are read from this folder later on.
%cd /content/sample_data/

/content/sample_data


In [None]:
# Quietly (-qq) extract the data archive stored on Google Drive into the current working directory.
!unzip -qq '/content/drive/MyDrive/project_study_KKUL/008_LGaimers/LG_DATA.zip'

In [None]:
# Load the train / test / sample-submission tables from /content/sample_data.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/sample_data/train.csv',
        '/content/sample_data/test.csv',
        '/content/sample_data/sample_submission.csv',
    )
)

# 2.데이터 확인

In [None]:
# Preview the first rows of the raw training table.
train.head()

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,TRAIN_000,1,0.533433,2022-06-13 5:14,T050304,A_31,,,,,...,39.34,40.89,32.56,34.09,77.77,,,,,
1,TRAIN_001,2,0.541819,2022-06-13 5:22,T050307,A_31,,,,,...,38.89,42.82,43.92,35.34,72.55,,,,,
2,TRAIN_002,1,0.531267,2022-06-13 5:30,T050304,A_31,,,,,...,39.19,36.65,42.47,36.53,78.35,,,,,
3,TRAIN_003,2,0.537325,2022-06-13 5:39,T050307,A_31,,,,,...,37.74,39.17,52.17,30.58,71.78,,,,,
4,TRAIN_004,1,0.53159,2022-06-13 5:47,T050304,A_31,,,,,...,38.7,41.89,46.93,33.09,76.97,,,,,


In [None]:
# Preview the first rows of the raw test table.
test.head()

Unnamed: 0,PRODUCT_ID,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,TEST_000,2022-09-09 2:01,T100306,T_31,2.0,94.0,0.0,45.0,10.0,0.0,...,,,,,,,,,,
1,TEST_001,2022-09-09 2:09,T100304,T_31,2.0,93.0,0.0,45.0,11.0,0.0,...,,,,,,,,,,
2,TEST_002,2022-09-09 8:42,T100304,T_31,2.0,95.0,0.0,45.0,11.0,0.0,...,,,,,,,,,,
3,TEST_003,2022-09-09 10:56,T010305,A_31,,,,,,,...,,,,,,,,,,
4,TEST_004,2022-09-09 11:04,T010306,A_31,,,,,,,...,,,,,,,,,,


# 3.데이터 전처리

## 1) 불필요한 열 삭제

In [None]:
# Drop the columns that are not used as model features.
# Y_Quality is intentionally kept inside train_x at this point: it has to
# travel with its rows through the per-PRODUCT_CODE split and is separated
# into the target frames only afterwards.
train_y = train['Y_Quality']
train_x = train.drop(['PRODUCT_ID', 'Y_Class', 'TIMESTAMP'], axis='columns')
test_x = test.drop(['PRODUCT_ID', 'TIMESTAMP'], axis='columns')

## 2) Label Encoding

In [None]:
# Turn the categorical columns into integer codes.
cols = ['PRODUCT_CODE', 'LINE']

for col in cols:
    # Learn the label -> integer mapping from the training data only.
    le = LabelEncoder()
    train_x[col] = le.fit_transform(train_x[col])

    # Labels that occur only in the test data are appended to the encoder's
    # known classes so that transform() does not fail on them.
    unseen = [
        label for label in np.unique(test_x[col])
        if label not in le.classes_
    ]
    for label in unseen:
        le.classes_ = np.append(le.classes_, label)
    test_x[col] = le.transform(test_x[col])

print('Done')

Done


## 3) PRODUCT_CODE 기준으로 구분

In [None]:
# Split the training rows by encoded PRODUCT_CODE.
# Code 0 corresponds to the A_31 rows (compare the head() previews before and
# after encoding); codes 1 and 2 are presumably O_31 and T_31, since
# LabelEncoder sorts its classes - TODO confirm against le.classes_.
train_a = train_x.loc[train_x['PRODUCT_CODE'] == 0]
train_o = train_x.loc[train_x['PRODUCT_CODE'] == 1]
train_t = train_x.loc[train_x['PRODUCT_CODE'] == 2]

In [None]:
# Split the test rows with the same PRODUCT_CODE codes used for the training
# split (0 -> *_a, 1 -> *_o, 2 -> *_t).
test_a = test_x.loc[test_x['PRODUCT_CODE'] == 0]
test_o = test_x.loc[test_x['PRODUCT_CODE'] == 1]
test_t = test_x.loc[test_x['PRODUCT_CODE'] == 2]

In [None]:
# Preview the training rows whose encoded PRODUCT_CODE is 0.
train_a.head()

Unnamed: 0,Y_Quality,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,0.533433,2,0,,,,,,,,...,39.34,40.89,32.56,34.09,77.77,,,,,
1,0.541819,3,0,,,,,,,,...,38.89,42.82,43.92,35.34,72.55,,,,,
2,0.531267,2,0,,,,,,,,...,39.19,36.65,42.47,36.53,78.35,,,,,
3,0.537325,3,0,,,,,,,,...,37.74,39.17,52.17,30.58,71.78,,,,,
4,0.53159,2,0,,,,,,,,...,38.7,41.89,46.93,33.09,76.97,,,,,


In [None]:
# Preview the test rows whose encoded PRODUCT_CODE is 0.
test_a.head()

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
3,0,0,,,,,,,,,...,,,,,,,,,,
4,1,0,,,,,,,,,...,,,,,,,,,,
5,1,0,,,,,,,,,...,,,,,,,,,,
6,0,0,,,,,,,,,...,,,,,,,,,,
7,2,0,,,,,,,,,...,57.74,52.51,54.45,57.99,63.16,1.0,,,,


## 4) 각각 전체 평균으로 채우기

In [None]:
# Replace missing values in each training split with that split's own
# column means. Columns that are entirely NaN within a split stay NaN here.
train_a_mean, train_t_mean, train_o_mean = (
    split.fillna(split.mean())
    for split in (train_a, train_t, train_o)
)

In [None]:
# Fill the test splits with the column means of the matching *training*
# split, so no statistics are computed from the test data itself.
test_a_mean, test_t_mean, test_o_mean = (
    test_split.fillna(train_split.mean())
    for test_split, train_split in (
        (test_a, train_a),
        (test_t, train_t),
        (test_o, train_o),
    )
)

## 5) 남은 결측치 -> 0으로 채우기

In [None]:
# Whatever is still missing after the mean fill is set to 0.
train_a_0, train_t_0, train_o_0 = (
    filled.fillna(0)
    for filled in (train_a_mean, train_t_mean, train_o_mean)
)

In [None]:
# Same zero fill for the values still missing in the test splits.
test_a_0, test_t_0, test_o_0 = (
    filled.fillna(0)
    for filled in (test_a_mean, test_t_mean, test_o_mean)
)

In [None]:
# Preview the fully imputed training split for PRODUCT_CODE 0.
train_a_0.head()

Unnamed: 0,Y_Quality,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,0.533433,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,39.34,40.89,32.56,34.09,77.77,1.0,0.0,0.0,0.0,0.0
1,0.541819,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,38.89,42.82,43.92,35.34,72.55,1.0,0.0,0.0,0.0,0.0
2,0.531267,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,39.19,36.65,42.47,36.53,78.35,1.0,0.0,0.0,0.0,0.0
3,0.537325,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,37.74,39.17,52.17,30.58,71.78,1.0,0.0,0.0,0.0,0.0
4,0.53159,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,38.7,41.89,46.93,33.09,76.97,1.0,0.0,0.0,0.0,0.0


In [None]:
# Preview the fully imputed test split for PRODUCT_CODE 0.
test_a_0.head()

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
3,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,50.8073,53.6077,49.6062,51.6598,66.6497,1.0,0.0,0.0,0.0,0.0
4,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,50.8073,53.6077,49.6062,51.6598,66.6497,1.0,0.0,0.0,0.0,0.0
5,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,50.8073,53.6077,49.6062,51.6598,66.6497,1.0,0.0,0.0,0.0,0.0
6,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,50.8073,53.6077,49.6062,51.6598,66.6497,1.0,0.0,0.0,0.0,0.0
7,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,57.74,52.51,54.45,57.99,63.16,1.0,0.0,0.0,0.0,0.0


## 6) X 변수, Y 변수 설정

* 모든 전처리가 끝난 후 각각 X 변수, Y 변수 설정

In [None]:
# Build the per-product feature matrices and targets from the fully imputed
# frames (mean fill followed by zero fill). The raw splits train_a / train_t /
# train_o still contain NaN, which LinearRegression.fit() does not accept, and
# using them would silently bypass the preprocessing done in the steps above.
train_y_a = train_a_0[['Y_Quality']]
train_x_a = train_a_0.drop(columns='Y_Quality')

train_y_t = train_t_0[['Y_Quality']]
train_x_t = train_t_0.drop(columns='Y_Quality')

train_y_o = train_o_0[['Y_Quality']]
train_x_o = train_o_0.drop(columns='Y_Quality')

# 4.모델링

* PRODUCT_CODE별로 모델을 구분하여 Y_Quality를 예측하는 모델을 시도했음.  
Y_Quality에 따라 Y_Class가 구분되는 것을 확인했기 때문에 Y_Quality 예측값을 구한 후 구간 값에 따라 Y_Class를 분류하려고 계획함.  
하지만 기존 train_data의 Y_Quality 값을 한참 벗어나는 Y_Quality 값이 예측됨.

## 1) A

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares regressor for the A-product split; the target is
# the single-column Y_Quality frame.
RA = LinearRegression()
RA.fit(X=train_x_a, y=train_y_a)

In [None]:
# Placeholder target frame for the A-product test rows: a single Y_Quality
# column of zeros sharing test_a's index. It is built directly instead of
# adding the column to test_a (a filtered slice of test_x) and dropping it
# again, which mutated the slice for no lasting effect.
test_y_a = pd.DataFrame({'Y_Quality': 0}, index=test_a.index)

In [None]:
# Predict on the imputed A-product test features. The raw test_a split still
# contains NaN (see its head() preview), which LinearRegression.predict()
# does not accept; test_a_0 is the mean/zero-filled version of the same rows.
test_y_a[['Y_Quality']] = RA.predict(test_a_0)
test_y_a

Unnamed: 0,Y_Quality
3,0.500971
4,0.531200
5,0.523364
6,0.494720
7,0.540144
...,...
284,0.504554
285,0.505404
286,0.502253
292,0.569860


## 2) T

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares regressor for the T-product split. The name RA is
# reused, so the previously fitted model is replaced from here on.
RA = LinearRegression()
RA.fit(X=train_x_t, y=train_y_t)

In [None]:
# Placeholder target frame for the T-product test rows: a single Y_Quality
# column of zeros sharing test_t's index. It is built directly instead of
# adding the column to test_t (a filtered slice of test_x) and dropping it
# again, which mutated the slice for no lasting effect.
test_y_t = pd.DataFrame({'Y_Quality': 0}, index=test_t.index)

In [None]:
# Predict on the imputed T-product test features. test_t_0 is the
# mean/zero-filled version of test_t; the raw split may still contain NaN,
# which LinearRegression.predict() does not accept.
test_y_t[['Y_Quality']] = RA.predict(test_t_0)
test_y_t

Unnamed: 0,Y_Quality
0,0.558586
1,0.581402
2,0.550915
15,0.625816
16,0.582193
...,...
305,4.648375
306,4.614585
307,4.611894
308,4.668567


## 3) O

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares regressor for the O-product split. The name RA is
# reused, so the previously fitted model is replaced from here on.
RA = LinearRegression()
RA.fit(X=train_x_o, y=train_y_o)

In [None]:
# Placeholder target frame for the O-product test rows: a single Y_Quality
# column of zeros sharing test_o's index. It is built directly instead of
# adding the column to test_o (a filtered slice of test_x) and dropping it
# again, which mutated the slice for no lasting effect.
test_y_o = pd.DataFrame({'Y_Quality': 0}, index=test_o.index)

In [None]:
# Predict on the imputed O-product test features. test_o_0 is the
# mean/zero-filled version of test_o; the raw split may still contain NaN,
# which LinearRegression.predict() does not accept.
test_y_o[['Y_Quality']] = RA.predict(test_o_0)
test_y_o

Unnamed: 0,Y_Quality
138,0.678847
256,0.065568
257,0.528146
287,-0.069644
