In [65]:
import pickle
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import json
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
import torch.backends.cudnn as cudnn
import random
from torch.utils.data import Dataset, DataLoader, TensorDataset
from scipy import stats
from sklearn.preprocessing import StandardScaler
from hynix.model_class import LSTM, LSTM_model
from pycaret.regression import *
from sklearn.utils import resample
import warnings
warnings.filterwarnings(action='ignore')

In [66]:
class PreprocessAndPredict:
    """Preprocessing helpers for training frames and for test rows.

    ``isfull`` selects the test-time branch: ``True`` runs the "full data"
    path (timestamp columns are re-ordered with ``sort_time`` and ``x197`` is
    dropped), ``False`` runs the "middle data" path (columns are restricted to
    a pickled column list and the last object-dtype column is blanked).
    """

    # Timestamp layouts accepted by ``Qtime`` (without and with seconds).
    _TS_FORMATS = ("%Y-%m-%d %H:%M", "%Y-%m-%d %H:%M:%S")

    class RealTestDataset(Dataset):
        """Torch ``Dataset`` serving the rows of a DataFrame one at a time."""

        def __init__(self, df):
            # Positional row access in __getitem__ wants a clean 0..n-1 index.
            self.df = df.reset_index(drop=True)

        def __len__(self):
            return len(self.df)

        def __getitem__(self, index):
            # Each sample is a dict holding the row's values as an array.
            return {"x": self.df.iloc[index, :].values}

    def __init__(self, isfull: bool):
        self.isfull = isfull
        # ``np.NaN`` was removed in NumPy 2.0; ``np.nan`` works everywhere.
        self.nan = np.nan

    # Generate the simulated copies of the test rows (50 by default).
    def MakeSimulationData(self, test, n_simulations: int = 50,
                           train_path: str = 'test_files/prepro_cgw.csv'):
        """Replicate ``test`` and fill each row's unobserved tail randomly.

        Every row of ``test`` is repeated ``n_simulations`` times. In each
        copy, every column positioned after the row's last non-null column is
        set to ``random.randint(int(min), int(max))`` of the same column in
        the training CSV. Columns that are missing from the training CSV, are
        entirely null there, or whose bounds cannot be converted to ``int``
        are left untouched.

        Parameters
        ----------
        test : pandas.DataFrame
            Rows to simulate; not modified.
        n_simulations : int, default 50
            Number of times ``test`` is replicated.
        train_path : str, default 'test_files/prepro_cgw.csv'
            Training CSV (first column is the index; must contain ``Y``).

        Returns
        -------
        pandas.DataFrame
            The replicated frame with a fresh ``0..n-1`` index.
        """
        # Read the training file straight into a DataFrame.
        org_traindata_df = pd.read_csv(train_path, index_col=0).drop(columns="Y")

        # The sampling bounds depend only on the column, so compute them once
        # instead of once per simulated row.
        bounds = {}
        for col in test.columns:
            if col not in org_traindata_df.columns:
                continue
            observed = org_traindata_df[col].dropna()
            if observed.empty:
                continue
            try:
                bounds[col] = (int(observed.min()), int(observed.max()))
            except (TypeError, ValueError, OverflowError):
                # Non-numeric or non-finite training column: leave it alone.
                continue

        # Replicate the test rows n_simulations times.
        simulated = pd.concat([test] * n_simulations).reset_index(drop=True)
        columns = simulated.columns

        # Process each row.
        for row_pos in range(len(simulated)):
            # Find the last column that holds a value in this row.
            last_valid_col = simulated.iloc[row_pos].last_valid_index()
            # An all-null row has no last valid column: fill from the start.
            start = 0 if last_valid_col is None else columns.get_loc(last_valid_col) + 1

            # Fill every column after last_valid_col.
            for col_pos in range(start, len(columns)):
                col_bounds = bounds.get(columns[col_pos])
                if col_bounds is not None:
                    # Positional write on the frame itself; a chained
                    # ``frame[col].iloc[i] = v`` may only touch a temporary.
                    simulated.iloc[row_pos, col_pos] = random.randint(*col_bounds)
        return simulated

    def sort_time(self, data):
        """Re-order the timestamp columns (and the columns preceding each).

        Object-dtype columns of ``data`` are converted to datetimes in place.
        The datetime columns are then ordered by their values and, for each
        one, the non-datetime columns that sit directly before it in the
        ``[188:1454]`` column window are carried along with it. The result is
        ``data[:, :"x193"]`` + re-ordered block + ``data[:, "x1461":]``.

        NOTE(review): the positional window 188:1454 and the labels ``x193``
        / ``x1461`` are tied to one specific column layout -- confirm they
        match the frames passed in.
        """
        # Presumably every object column holds timestamp strings -- TODO confirm.
        for col in data.columns:
            if data[col].dtype == "object":
                data[col] = pd.to_datetime(data[col])

        ts_data = data.select_dtypes("datetime").reset_index(drop=True)

        # Columns are re-sorted by each row in turn, so the order produced by
        # the last row is the one that remains.
        for idx in ts_data.index:
            ts_data = ts_data.sort_values(by=idx, axis=1)

        # Map every datetime column in the window to the (name-sorted) run of
        # non-datetime columns that immediately precedes it.
        window = data.columns.to_list()[188:1454]
        preceding = {}
        for idx, col in enumerate(window):
            if data[col].dtype == "<M8[ns]":
                run = []
                i = idx
                while i > 0:
                    i -= 1
                    prev_col = window[i]
                    if data[prev_col].dtype == "<M8[ns]":
                        break
                    run.append(prev_col)
                run.sort()  # plain string sort, e.g. "x10" < "x9"
                preceding[col] = run

        # Emit the groups in the sorted timestamp order.
        ts_final = []
        for ts_col in ts_data.columns:
            if ts_col in preceding:
                ts_final.extend(preceding[ts_col])
                ts_final.append(ts_col)

        front = data.loc[:, :"x193"]
        back = data.loc[:, "x1461":]
        return pd.concat([front, data[ts_final], back], axis=1)

    @staticmethod
    def _parse_timestamp(value):
        """Parse ``value`` with the first matching layout in ``_TS_FORMATS``."""
        for fmt in PreprocessAndPredict._TS_FORMATS:
            try:
                return datetime.strptime(value, fmt)
            except ValueError:
                continue
        raise ValueError(f"unsupported timestamp format: {value!r}")

    def Qtime(self, data, ts_data):
        """Compute the gap, in hours, between consecutive timestamp columns.

        For every adjacent pair of columns in ``ts_data`` the difference
        ``later - earlier`` is stored under the *earlier* column's name,
        rounded to two decimals. With fewer than two columns the result has
        an index but no columns.

        Parameters
        ----------
        data : pandas.DataFrame
            Only its index is used, as the index of the result.
        ts_data : pandas.DataFrame
            Timestamp strings, ``YYYY-MM-DD HH:MM`` or ``YYYY-MM-DD HH:MM:SS``.

        Raises
        ------
        ValueError
            If a value matches neither timestamp layout.
        """
        df = pd.DataFrame(index=data.index)
        ts_columns = ts_data.columns
        for idx in range(1, len(ts_columns)):
            gaps = []
            for jdx in range(len(ts_data.index)):
                later = self._parse_timestamp(ts_data.iloc[jdx, idx])
                earlier = self._parse_timestamp(ts_data.iloc[jdx, idx - 1])
                diff = later - earlier
                # NOTE(review): ``.seconds`` drops whole days (and wraps
                # negative gaps); ``total_seconds()`` would not. Kept as-is
                # because any already-fitted scaler/model saw these values.
                gaps.append(round(diff.seconds / (60 * 60), 2))
            df[ts_columns[idx - 1]] = gaps
        return df

    def insert_Qtime(self, data, data_q):
        """Overwrite timestamp columns in ``data`` with their Q-time values.

        ``data`` is modified in place and also returned. In full mode the
        column ``x197`` is dropped afterwards; otherwise the last remaining
        object-dtype column (if any) is set to NaN.
        """
        for col in data_q.columns:
            # Replace the whole column; ``data.loc[:, col] = ...`` would try
            # to keep the old (datetime/object) dtype on pandas >= 2.
            data[col] = data_q[col]

        if self.isfull:
            data.drop(columns="x197", inplace=True)
        else:
            object_cols = data.select_dtypes("object").columns
            if len(object_cols) > 0:
                data[object_cols[-1]] = self.nan

        return data

    def train_preprocess(self, train):
        """Fit the preprocessing on ``train`` and return the model-ready frame.

        Steps: index by ``ID``; drop ``x204``; split off ``Y``; re-order and
        convert timestamps to Q-times; keep ``:x193`` columns that have at
        least 5 values and pass Shapiro-Wilk (p >= 0.05); impute; standard
        scale; keep columns whose rounded correlation with ``Y`` satisfies
        ``0.05 <= |corr|`` and ``corr < 1``; re-attach ``Y / 100``.

        Side effects: writes ``std_scaler.pkl`` and ``train_options.pkl`` to
        the working directory. The caller's frame is not modified.

        NOTE(review): ``test_preprocess`` loads these artefacts from
        ``models/gw/`` -- confirm they are moved there after training.
        NOTE(review): the ``"x205":"x196"`` slice and the timestamp handling
        look written for ``isfull=True`` -- confirm.
        """
        print("train preprocess start")
        # Work on a new frame; do not mutate the caller's DataFrame.
        train = train.set_index(keys="ID").drop(columns="x204")
        y_train = train["Y"]
        train = train.drop(columns="Y")

        train_options = {}

        train = self.sort_time(train)
        ts_train = train.select_dtypes("datetime").astype("str")
        train_q = self.Qtime(train, ts_train)
        train = self.insert_Qtime(train, train_q)

        head = train.loc[:, :"x193"]
        mid = train.loc[:, "x205":"x196"].copy()
        tail = train["x1548"]

        # Columns with fewer than 5 observations are discarded.
        head = head.loc[:, head.notnull().sum() >= 5]

        # Keep only columns for which Shapiro-Wilk does not reject normality.
        y_cols = []
        for col in head.columns:
            _, p_val = stats.shapiro(head[col].dropna())
            if p_val >= 0.05:
                y_cols.append(col)
        head = head[y_cols].copy()

        # Impute missing values with the observed column mean plus N(0, 1)
        # noise, one draw per missing cell.
        for col in head.columns:
            missing = head[col].isnull()
            n_missing = int(missing.sum())
            if n_missing:
                head.loc[missing, col] = head[col].mean() + np.random.randn(n_missing)

        # Plain mean imputation for the middle block; assigning the result
        # back is required for the fill to stick under Copy-on-Write.
        for col in mid.columns:
            mid[col] = mid[col].fillna(mid[col].mean())

        train = pd.concat([head, mid, tail], axis=1)
        train_options["before_scale_columns"] = train.columns.to_list()

        std = StandardScaler()
        std.fit(train)

        train_sc = std.transform(train)
        train = pd.DataFrame(data=train_sc, index=train.index, columns=train.columns)

        with open('std_scaler.pkl', 'wb') as scaler_file:
            pickle.dump(std, scaler_file)

        corr_df = train.apply(lambda x: x.corr(y_train))
        corr_df = corr_df.apply(lambda x: round(x, 2))
        # ``corr < 1`` also filters out NaN correlations.
        selected = corr_df[(corr_df < 1) & (corr_df.abs() >= 0.05)]
        train = train[selected.index.to_list()]

        train_options["column_names"] = train.columns.to_list()
        train_options["column_means"] = list(train.mean().values)

        with open('train_options.pkl', 'wb') as options_file:
            pickle.dump(train_options, options_file)

        # New Series rather than ``/=`` so the caller's ``Y`` is not rescaled.
        y_train = y_train / 100
        train = pd.merge(train, y_train, how="left", on="ID")

        return train

    def test_preprocess(self, test):
        """Transform ``test`` with the artefacts saved at training time.

        Indexes by ``ID``, drops ``x204``, converts timestamps to Q-times
        (full or middle branch, per ``self.isfull``), generates the simulated
        copies, applies the saved scaler, keeps the saved feature columns and
        fills remaining nulls with the saved column means. The caller's frame
        is not modified.

        Reads ``models/gw/train_cols.pkl`` (middle branch only),
        ``models/gw/train_options.pkl`` and ``models/gw/std_scaler.pkl``.
        Unpickling executes arbitrary code: only load trusted files.
        """
        print("test preprocess start")
        test = test.set_index(keys="ID").drop(columns="x204")

        if self.isfull:
            print("full data")
            test = self.sort_time(test)
            ts_test = test.select_dtypes("datetime").astype("str")
        else:
            print("middle data")
            with open('models/gw/train_cols.pkl', 'rb') as cols_file:
                cols = pickle.load(cols_file)
            test = test[cols]
            ts_test = test.select_dtypes("object").astype("str")
        test_q = self.Qtime(test, ts_test)
        test = self.insert_Qtime(test, test_q)

        test = self.MakeSimulationData(test)

        with open('models/gw/train_options.pkl', 'rb') as options_file:
            train_options = pickle.load(options_file)
        with open('models/gw/std_scaler.pkl', 'rb') as scaler_file:
            scaler = pickle.load(scaler_file)

        test = test[train_options["before_scale_columns"]]

        test_sc = scaler.transform(test)
        test = pd.DataFrame(data=test_sc, index=test.index, columns=test.columns)

        test = test[train_options["column_names"]]

        # Per-column fill on the frame itself; a chained
        # ``test[col].fillna(..., inplace=True)`` may only fill a temporary.
        fill_values = dict(zip(test.columns, train_options["column_means"]))
        test = test.fillna(fill_values)

        return test

In [67]:
# Load the raw test rows; index_col=0 makes the first CSV column the index.
test = pd.read_csv('test_files/real2.csv',index_col=0)
# Display the loaded frame.
test

Unnamed: 0,ID,x1,x3,x4,x5,x6,x7,x8,x9,x10,...,x1478,x1479,x1480,x1481,x1482,x1483,x1484,x1485,x1486,x1548
2497,4244,0.043333,23.2375,,,,-0.08,,,,...,,,,,,,,,,


In [68]:
# Index by ID and discard x204. Rebinding (rather than inplace=True) leaves
# the frame displayed by the loading cell untouched.
test = test.set_index(keys="ID").drop(columns="x204")

In [69]:
# isfull=False selects the "middle data" handling in the helper class.
pp = PreprocessAndPredict(isfull=False)

In [70]:
# Load the pickled column list and close the file afterwards.
# Unpickling executes arbitrary code: only load files from a trusted source.
with open('models/gw/train_cols.pkl', 'rb') as cols_file:
    cols = pickle.load(cols_file)
# Restrict the test frame to those columns, in that order.
test = test[cols]
test

Unnamed: 0_level_0,x1,x3,x4,x5,x6,x7,x8,x9,x10,x11,...,x1478,x1479,x1480,x1481,x1482,x1483,x1484,x1485,x1486,x1548
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4244,0.043333,23.2375,,,,-0.08,,,,,...,,,,,,,,,,


In [71]:
# Pick the object-dtype columns and cast them to str; this frame is what
# pp.Qtime parses as timestamps in the next cell.
ts_test = test.select_dtypes("object").astype("str")
ts_test

Unnamed: 0_level_0,x206
ID,Unnamed: 1_level_1
4244,2008-08-30 4:12


In [72]:
# Hour gaps between consecutive timestamp columns of ts_test.
test_q = pp.Qtime(data=test, ts_data=ts_test)

In [73]:
# ts_test has a single column, so Qtime has no consecutive pair to
# difference: the result carries the index but no columns.
test_q

4244


In [74]:
# Write the Q-times into the frame; in non-full mode the last object-dtype
# column is set to NaN as well.
test = pp.insert_Qtime(data=test, data_q=test_q)
# Inspect the timestamp column after the insert.
test["x206"]

ID
4244   NaN
Name: x206, dtype: float64

In [75]:
# Replicate the row 50 times and randomly fill every column after its last
# observed value (results vary between runs; no seed is set).
test = pp.MakeSimulationData(test=test)
test

Unnamed: 0,x1,x3,x4,x5,x6,x7,x8,x9,x10,x11,...,x1478,x1479,x1480,x1481,x1482,x1483,x1484,x1485,x1486,x1548
0,0.043333,23.2375,,,,-0.08,,,,,...,,,,,,,,,,39.0
1,0.043333,23.2375,,,,-0.08,,,,,...,,,,,,,,,,120.0
2,0.043333,23.2375,,,,-0.08,,,,,...,,,,,,,,,,103.0
3,0.043333,23.2375,,,,-0.08,,,,,...,,,,,,,,,,66.0
4,0.043333,23.2375,,,,-0.08,,,,,...,,,,,,,,,,113.0
5,0.043333,23.2375,,,,-0.08,,,,,...,,,,,,,,,,152.0
6,0.043333,23.2375,,,,-0.08,,,,,...,,,,,,,,,,155.0
7,0.043333,23.2375,,,,-0.08,,,,,...,,,,,,,,,,93.0
8,0.043333,23.2375,,,,-0.08,,,,,...,,,,,,,,,,52.0
9,0.043333,23.2375,,,,-0.08,,,,,...,,,,,,,,,,104.0


In [76]:
# x206 was NaN before the simulation step; it now holds the random integers
# MakeSimulationData drew for each of the 50 simulated rows.
test['x206']

0     16.0
1      8.0
2     23.0
3     20.0
4     12.0
5      0.0
6      5.0
7      8.0
8      9.0
9      9.0
10    11.0
11    16.0
12    10.0
13     0.0
14     7.0
15     0.0
16    13.0
17    10.0
18     1.0
19     2.0
20     0.0
21    19.0
22     8.0
23     0.0
24     6.0
25    22.0
26    13.0
27    16.0
28     8.0
29    22.0
30     4.0
31    15.0
32     1.0
33     1.0
34    18.0
35    23.0
36     7.0
37    23.0
38    14.0
39     1.0
40     2.0
41     8.0
42     0.0
43    17.0
44    23.0
45    21.0
46    13.0
47     2.0
48     9.0
49    14.0
Name: x206, dtype: float64