In [None]:
import pandas as pd
import numpy as np
import os
import glob

# 회귀 모델 관련 패키지
import statsmodels
from statsmodels.api import OLS, add_constant
from statsmodels.api import add_constant
from statsmodels.regression.linear_model import OLS

# 평가 지표
from sklearn.metrics import mean_squared_error

In [None]:
class make_data:
    """Split a list of CSV paths into train/test groups and load their usable rows.

    For each file, only columns 6..14 (positional) are read, rows containing
    NaN are dropped, and everything before the first row in which *all*
    remaining columns are at or above the threshold is discarded.
    """

    def __init__(self, dataset_list, rate, target_temp, std):
        """
        dataset_list : list of dataset (CSV) paths
        rate : train/test split ratio (train = rate, test = 1 - rate)
        target_temp : target temperature
        std : tolerated deviation below the target temperature
        """
        split_at = int(len(dataset_list) * rate)
        self.train_list = dataset_list[:split_at]
        self.test_list = dataset_list[split_at:]

        # Lowest reading that still counts as "target reached".
        self.target_temp = target_temp - std

    def _load_after_target(self, path):
        """Read one CSV and keep the rows from the first threshold crossing onward.

        Returns an empty frame (same columns) when no row reaches the threshold.
        """
        frame = pd.read_csv(path, low_memory=False).iloc[:, 6:15].dropna()

        # A row qualifies once its smallest value is at or above the threshold,
        # i.e. every column has reached it.
        reached = (frame.min(axis=1) >= self.target_temp).to_numpy()
        if not reached.any():
            return frame.iloc[0:0]

        # dropna() leaves gaps in the index labels, so the cut must be made by
        # position; using the label with iloc would skip valid rows.
        first_position = int(reached.argmax())
        return frame.iloc[first_position:, :]

    def data_merge(self, data_list):
        """
        Load every path in data_list and stack the usable rows row-wise.

        Returns an empty DataFrame when data_list is empty or when no file
        contributes any row.
        """
        frames = [self._load_after_target(path) for path in data_list]
        frames = [frame for frame in frames if not frame.empty]
        if not frames:
            return pd.DataFrame()

        # Row-wise concatenation, done once instead of growing a frame per file.
        return pd.concat(frames, axis=0)

    def return_data(self):
        """Return the tuple (train_frame, test_frame) built from the two path groups."""
        return self.data_merge(self.train_list), self.data_merge(self.test_list)

# Glob patterns for the three furnace-temperature groups.
t850 = "../../data/Temperature/850c/*.csv"
t900 = "../../data/Temperature/900c/*.csv"
t1000 = "../../data/Temperature/1000c/*.csv"

# glob.glob returns matches in a filesystem-dependent order; sorting makes the
# train/test split (which is positional) reproducible across machines and runs.
t850_list = sorted(glob.glob(t850))
t900_list = sorted(glob.glob(t900))
t1000_list = sorted(glob.glob(t1000))
data_list = [t850_list, t900_list, t1000_list]
data_split_rate = 0.7
target_temp_list = [850,900,1000]

# Collect the per-group frames and concatenate once at the end.
train_frames = []
test_frames = []
for temp_data_list, group_target_temp in zip(data_list, target_temp_list):
    if len(temp_data_list) > 0:
        # NOTE(review): make_data uses (target_temp - std) as its threshold, so
        # std = target - 300 makes the effective threshold 300 for every group.
        # Confirm this is intended.
        temp_train_data, temp_test_data = make_data(temp_data_list,
                                                    rate = data_split_rate,
                                                    target_temp = group_target_temp,
                                                    std = group_target_temp - 300).return_data()
        train_frames.append(temp_train_data)
        test_frames.append(temp_test_data)

# pd.concat raises on an empty list, so fall back to an empty frame.
train_data = pd.concat(train_frames) if train_frames else pd.DataFrame()
test_data = pd.concat(test_frames) if test_frames else pd.DataFrame()

In [None]:
# Turn every column of the training frame into a plain Python list,
# preserving the column order of train_data.
train_data_lists = []
for column_name in train_data.columns:
    train_data_lists.append(train_data[column_name].tolist())

# Predictor: the last column, passed through add_constant.
train_X = add_constant(train_data_lists[-1])

# Responses: every column except the last one, in column order.
train_ys = train_data_lists[:-1]

In [None]:
# Turn every column of the held-out frame into a plain Python list,
# preserving the column order of test_data.
test_data_lists = []
for column_name in test_data.columns:
    test_data_lists.append(test_data[column_name].tolist())

# Predictor: the last column, passed through add_constant.
test_X = add_constant(test_data_lists[-1])

# Responses: every column except the last one, in column order.
test_ys = test_data_lists[:-1]

In [None]:
# Fit one OLS regression per response column against the shared design matrix.
for idx, train_y in enumerate(train_ys):
    # Each fitted result is published as a notebook-level variable named
    # ch9_lm_<idx>; writing through locals() works here only because this
    # code runs at module (cell) scope. Later cells look the models up by
    # the same names.
    locals()['ch9_lm_' + str(idx)] = OLS(train_y, train_X).fit()

In [None]:
# Pick the fitted model stored under the name "ch9_lm_1" (training-loop
# index 1, i.e. the second response column) for closer inspection below.
test = locals()['ch9_lm_'+str(1)]

In [None]:
# Display the confidence intervals of the selected model's coefficients
# (conf_int is called with its default arguments).
test.conf_int()

In [None]:
# Export the selected model's summary table as plain text next to the notebook.
with open('./model_summary.txt', 'w') as summary_file:
    report_text = test.summary().as_text()
    summary_file.write(report_text)

In [None]:
# Display the selected model's `params` attribute (estimated coefficients).
test.params

In [None]:
# Display the selected model's `pvalues` attribute (one p-value per coefficient).
test.pvalues

In [None]:
# Display the selected model's `rsquared` attribute (goodness of fit on the training data).
test.rsquared

In [None]:
# Score every fitted model on the held-out data.
# pred_list collects the predictions, eval_list the [mse, rmse] pair per model.
pred_list, eval_list = [], []
for idx, test_y in enumerate(test_ys):
    # Models were stored as notebook-level variables named ch9_lm_<idx>.
    fitted_model = locals()['ch9_lm_' + str(idx)]
    pred = fitted_model.predict(test_X)
    pred_list.append(pred)

    mse = mean_squared_error(test_y, pred)
    rmse = np.sqrt(mse)
    eval_list.append([mse, rmse])

In [None]:
# Show the raw [mse, rmse] pairs, one entry per evaluated model.
eval_list

In [None]:
# Tabulate the evaluation metrics: one row per model, columns mse / rmse.
eval_df = pd.DataFrame(eval_list, columns=["mse", "rmse"])
eval_df.head()

In [None]:
# Each entry of pred_list is one model's prediction vector; transposing puts
# one model per column and one observation per row.
pred_df = pd.DataFrame(data=pred_list).T

# Label the columns ch1, ch2, ... from the actual number of models instead of
# a fixed list of eight names, so a different channel count does not raise.
pred_df.columns = [f"ch{channel_no}" for channel_no in range(1, pred_df.shape[1] + 1)]

In [None]:
# Preview the first five rows of the per-channel predictions.
pred_df.head(5)

In [None]:
import pickle

# Make sure the output directory exists; open(..., 'wb') does not create it.
os.makedirs('./models', exist_ok=True)

# Persist every fitted model as ./models/model_<n>.pkl, with n starting at 1.
for idx in range(len(test_ys)):
    # Models were stored as notebook-level variables named ch9_lm_<idx>.
    model = locals()['ch9_lm_'+str(idx)]
    # Save the model
    with open(f'./models/model_{idx+1}.pkl', 'wb') as f:
        pickle.dump(model, f)