In [None]:
from os import defpath
import pandas as pd
import numpy as np
from datetime import datetime
from pandas.core.frame import DataFrame
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import pickle
import os
import warnings
warnings.filterwarnings(action='ignore')

In [None]:

def strToScaleTime(str):
    """Convert a timestamp into the fraction of the day that has elapsed.

    Parameters
    ----------
    str : str or datetime.datetime
        Either text formatted as ``"%Y-%m-%d %H:%M"`` or an already-parsed
        ``datetime`` instance (subclasses such as ``pandas.Timestamp`` are
        accepted too). The parameter name shadows the builtin ``str``; it is
        kept unchanged so existing keyword callers keep working.

    Returns
    -------
    float
        ``hour/24 + minute/(24*60)``, a value in ``[0, 1)``. The date part
        of the timestamp is ignored.

    Raises
    ------
    ValueError
        If a text input does not match ``"%Y-%m-%d %H:%M"``.
    """
    if isinstance(str, datetime):
        # Already parsed (e.g. a date-typed spreadsheet cell): use it as is.
        timestamp = str
    else:
        timestamp = datetime.strptime(str, "%Y-%m-%d %H:%M")
    return timestamp.hour/24 + timestamp.minute/(24*60)


In [None]:
def samplingData(X_train,y_train,random_state=None):
    """Rebalance one training fold: random under-sampling followed by SMOTE.

    The resampling pipeline first applies ``RandomUnderSampler`` with its
    default strategy and then ``SMOTE`` with an explicit per-class target of
    ``int(len(y_train) / number_of_classes)`` samples.

    Parameters
    ----------
    X_train : array-like
        Feature rows of the training fold.
    y_train : array-like
        Class labels aligned with ``X_train``.
    random_state : int, RandomState instance or None, optional
        Seed forwarded to both samplers. The default ``None`` keeps the
        previous, non-deterministic behaviour.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as returned by the pipeline's
        ``fit_resample``.
    """
    y_classes = np.unique(y_train)

    # Every class is grown to the same size: an equal share of the fold.
    target_per_class = int(len(y_train) / len(y_classes))
    smote_targets = {y_class: target_per_class for y_class in y_classes}

    under = RandomUnderSampler(random_state=random_state)
    # sampling_strategy must be given by keyword: recent imbalanced-learn
    # releases declare the constructor arguments keyword-only, so the former
    # positional call SMOTE(SMOTE_dict) raises TypeError there.
    # NOTE(review): SMOTE interpolates between nearest neighbours, so very
    # small classes may be rejected with the default k_neighbors -- confirm
    # every class in a fold is large enough.
    over = SMOTE(sampling_strategy=smote_targets, random_state=random_state)

    pipeline = Pipeline(steps=[('u', under), ('o', over)])
    X_train, y_train = pipeline.fit_resample(X_train, y_train)
    return X_train, y_train

In [None]:
def scalingData(X,userName,save=True):
    """Standardize the feature matrix and optionally persist the fitted scaler.

    Parameters
    ----------
    X : array-like
        Feature rows; the scaler is fitted on all of them.
    userName : str
        Used to build the pickle file name
        ``./EvaluateModel/scaler_<userName>.pickle``.
    save : bool, optional
        When true, pickle the fitted scaler unless that file already exists.

    Returns
    -------
    The transformed ``X`` as returned by ``StandardScaler``.
    """
    scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
    X = scaler.fit_transform(X)

    if save:
        scaler_path = f'./EvaluateModel/scaler_{userName}.pickle'
        # NOTE(review): an existing file is never overwritten, so the scaler
        # on disk can differ from the one fitted here if the data changed --
        # confirm this is intended.
        if not os.path.isfile(scaler_path):
            # Create the target directory on first use; previously a missing
            # ./EvaluateModel folder made open() fail with FileNotFoundError.
            os.makedirs(os.path.dirname(scaler_path), exist_ok=True)
            with open(scaler_path, 'wb') as f:
                pickle.dump(scaler, f, pickle.HIGHEST_PROTOCOL)
    return X


In [None]:

def importXYdataset(userName, crossValidate = 5, heartrate= True, all=False, sampling=True,scaling=True, random_state=None):
    '''
    Load the survey records from ../integrateResult.xlsx and build stratified
    cross-validation folds for thermal-sensation classification.

    userName: participant whose rows are selected, e.g.
        Choi heeju / Choi Jinwoo / Kong minjin / Seungkeun / Hyuna Kang / Juwon / Hakpyeong Kim / Jongbaek
        It is also used to name the persisted scaler file.
    crossValidate: number of stratified folds.
    heartrate: when true, 'Heartrate' is appended to the feature columns
        ('Time', 'OutCloth', 'Temperature', 'Humidity').
    all: when true, use the rows of every name listed in ../CONFIG.xlsx
        instead of only userName. (The name shadows the builtin; it is kept
        for backward compatibility.)
    sampling: when true, each training fold is rebalanced with samplingData.
    scaling: when true, the features are standardized with scalingData
        (which also saves the scaler).
    random_state: optional seed for the fold shuffling; None (default) keeps
        the previous non-deterministic splits.

    return: X, Y, X_trainSet, X_testSet, y_trainSet, y_testSet
        X and Y hold all selected rows and labels; each *Set is a list with
        one entry per fold.
    raises: ValueError when no rows remain for the requested selection.

    you can print number of labels data by
    print(dataset['ThermalSensation'].value_counts())
    '''
    print("="*30)
    print(f"User name: {userName}")
    print(f"Heartrate: {heartrate}")

    dataset = pd.read_excel('../integrateResult.xlsx',index_col=0)
    dataset = dataset.drop(['Cloth','ThermalComfort'],axis=1)

    # Exception 1: OutCloth contained '-' entries; they were corrected
    # directly in the spreadsheet rows (840 - 869).
    if all:
        # The participant list is only needed for the "all users" selection.
        englishNameList = pd.read_excel("../CONFIG.xlsx",index_col=0)['names'].to_list()
        selected_rows = dataset['Name'].isin(englishNameList)
    else:
        selected_rows = dataset['Name'] == userName
    dataset = dataset[selected_rows].dropna().reset_index(drop=True)
    dataset['Time'] = dataset['Time'].apply(strToScaleTime)

    feature_columns = ['Time','OutCloth','Temperature','Humidity']
    if heartrate:
        feature_columns.append('Heartrate')

    # Extract all feature rows in one vectorized step instead of one .loc
    # lookup per row.
    X = dataset[feature_columns].values.tolist()

    # Exception 2: for Choi heeju some thermal sensation values were recorded
    # as a single blank character; those are treated as 0.
    Y = [0 if label == ' ' else label for label in dataset['ThermalSensation']]

    print('Num data: {}'.format(len(Y)))
    if len(Y) == 0:
        raise ValueError(f"No usable rows found for user name: {userName}")

    ####### scaler
    # NOTE(review): the scaler is fitted on every row before the folds are
    # split, so test rows influence the scaling statistics -- confirm this
    # is acceptable for the evaluation.
    if scaling:
        X = scalingData(X,userName,save=True)

    X_trainSet, X_testSet, y_trainSet, y_testSet = [], [], [], []
    skf = StratifiedKFold(n_splits=crossValidate, shuffle=True, random_state=random_state)
    for train_index, test_index in skf.split(X, Y):
        X_train   = [X[i] for i in train_index]
        X_test    = [X[i] for i in test_index]
        y_train   = [Y[i] for i in train_index]
        y_test    = [Y[i] for i in test_index]

        ####### sampling (training fold only; the test fold is left untouched)
        if sampling:
            X_train, y_train = samplingData(X_train,y_train)

        X_trainSet.append(X_train)
        X_testSet.append(X_test)
        y_trainSet.append(y_train)
        y_testSet.append(y_test)

    return X,Y, X_trainSet, X_testSet, y_trainSet, y_testSet


In [None]:

if __name__ == "__main__":
    # Participant names from the configuration workbook; only the optional
    # batch run described below needs them.
    englishNameList = pd.read_excel("../CONFIG.xlsx",index_col=0)['names'].to_list()

    # Single-participant run: 5 folds, no heart-rate feature, with
    # resampling and scaling enabled.
    importXYdataset(
        "Juwon",
        crossValidate=5,
        heartrate=False,
        all=False,
        sampling=True,
        scaling=True,
    )
    # To process every participant instead, call importXYdataset with the
    # same keyword arguments once per userName in englishNameList.