# 데이콘 Basic 칼로리 소모량 예측 AI 경진대회
AutoML 버전

In [42]:
#기본
import os
import pandas as pd
import numpy as np
import random

#전처리
from scipy.stats import skew
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

#학습
import autosklearn.regression
from sklearn.metrics import mean_squared_error

from sklearn.metrics import mean_squared_error, make_scorer

#기타



In [43]:
# Switch the working directory to the competition data folder so the
# relative CSV paths used in later cells resolve, then echo it as a check.
os.chdir('/root/data/calory')
print(os.getcwd())

/root/data/calory


In [44]:
# Missing-value check
def shownull(df):
    """Print the shape of ``df`` and the per-column count of missing values.

    Only columns that contain at least one null are reported.

    Side effect: the labels of those columns are stored in the module-level
    global ``missval_name`` (a pandas Index). Nothing is returned.
    """
    global missval_name
    print(f"데이터셋 차원 확인 \n {df.shape}")
    # Count nulls once, then keep only the columns where the count is non-zero.
    null_counts = df.isnull().sum()
    missval = null_counts[null_counts != 0]
    print(f"결측값 있는 변수 확인 \n {missval}")
    # Names of the columns that contain missing values.
    missval_name = missval.index

# Outlier check
def zscore_out(df, threshold=3):
    """Print and return the positions of values whose |z-score| exceeds ``threshold``.

    The z-score is computed with the object's own ``mean()`` and ``std()``
    (pandas' ``std`` defaults to the sample standard deviation, ddof=1).

    Args:
        df: pandas Series or DataFrame of numeric values.
        threshold: absolute z-score above which a value counts as an outlier.

    Returns:
        Array of integer row positions (not index labels) of the outliers.
        For a 2-D input a row position appears once per column that exceeds
        the threshold.
    """
    z_score = np.abs((df - df.mean()) / df.std())
    # np.where returns one index array per axis; axis 0 holds the row positions.
    outlier_rows = np.where(z_score > threshold)[0]
    print(outlier_rows)
    return outlier_rows

# Fix random seeds
def seed_everything(seed):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``.

    Also exports ``PYTHONHASHSEED`` into the process environment.
    NOTE(review): hash randomisation is normally fixed at interpreter start-up,
    so this variable presumably only affects child processes - confirm.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(84)

## 데이터 전처리

In [45]:
# Load the competition files from the current working directory, using the
# "ID" column as the index of each frame.
train = pd.read_csv('train.csv', index_col="ID")
test = pd.read_csv('test.csv', index_col="ID")
# NOTE(review): despite its name, y_test is read from the sample-submission
# file, so it is presumably a template rather than true labels - confirm.
y_test = pd.read_csv('sample_submission.csv', index_col="ID")

In [47]:
# Report the shape of the training set and any columns with missing values.
shownull(train)

데이터셋 차원 확인 
 (7500, 10)
결측값 있는 변수 확인 
 Series([], dtype: int64)


In [48]:
# Separate the features from the target column.
X_train = train.drop('Calories_Burned', axis=1)
# Single-column selection, so y_train is a Series rather than a DataFrame.
y_train = train['Calories_Burned']

In [46]:
# List the independent (X) and dependent (Y) variable names.
print(f'--독립변수(X) : {list(X_train.columns)}')
# y_train is a Series and has no .columns attribute; wrapping it in a
# DataFrame yields the column name(s) whether y_train is a Series or a frame.
print(f'--종속변수(Y) : {list(pd.DataFrame(y_train).columns)}')

--독립변수(Y) : ['Exercise_Duration', 'Body_Temperature(F)', 'BPM', 'Height(Feet)', 'Height(Remainder_Inches)', 'Weight(lb)', 'Weight_Status', 'Gender', 'Age']
--종속변수(Y) : ['Calories_Burned']


### Scaling

In [52]:
# Set the categorical columns aside: keep only the numeric (continuous)
# columns of the train and test features.
cont_train = X_train.select_dtypes(include=[np.number])
cont_test = test.select_dtypes(include=[np.number])

In [53]:
# Scaling
scaler = StandardScaler()

# Work on a copy, then overwrite the numeric columns with their standardised
# values; the scaler statistics are fitted on the training data only.
X_train = X_train.copy()
X_train[cont_train.columns] = scaler.fit_transform(cont_train)

# Apply the already-fitted scaler to the test set (transform only, no fit).
# NOTE(review): assumes test has the same numeric columns, in the same order,
# as train - confirm.
X_test = test.copy()
X_test[cont_test.columns]  = scaler.transform(cont_test)
# The variables in the test data are not known in advance, so columns are
# picked by continuous dtype instead of by name and normalised without fitting.

### 변수 전처리

In [54]:
# Skewness is mostly close to 0, so no variable transformation is considered.
# Prints the skewness of each (already scaled) numeric training column.
print(skew(X_train[cont_train.columns]))

[ 0.01229924 -0.9714503   0.02402306  0.03560469  0.07529544  0.22415293
  0.49597973]


In [55]:
# Label-encode every categorical (object / category dtype) feature column.
cat_vals = X_train.select_dtypes(include=['object', 'category']).columns

for col in cat_vals:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    # Labels that occur only in the test set are appended to the encoder's
    # classes so that transform() does not raise on unseen values.
    unseen = [label for label in np.unique(X_test[col]) if label not in le.classes_]
    if unseen:
        le.classes_ = np.append(le.classes_, unseen)
    X_test[col] = le.transform(X_test[col])

test 데이터에 어떤 변수가 있는지 알 수 없으므로 변수명을 사용하지 않고 범주형변수 타입만 선택해서 변환함

In [56]:
# The final dataframe (set of variables) that is fed into training.
X_train

Unnamed: 0_level_0,Exercise_Duration,Body_Temperature(F),BPM,Height(Feet),Height(Remainder_Inches),Weight(lb),Weight_Status,Gender,Age
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
TRAIN_0000,1.256628,1.108778,1.199774,-0.446979,0.938611,-0.332109,0,0,0.140030
TRAIN_0001,-1.017530,-0.519252,-0.782140,1.349561,0.080753,1.787635,2,1,0.436203
TRAIN_0002,-1.017530,-0.519252,-0.990762,1.349561,-0.777105,1.589472,2,1,-0.807721
TRAIN_0003,0.179395,-0.023765,0.365284,-0.446979,0.080753,-0.530272,0,0,-0.570783
TRAIN_0004,-0.778145,-0.943955,-0.782140,-0.446979,1.224564,0.133274,0,1,-0.274611
...,...,...,...,...,...,...,...,...,...
TRAIN_7495,0.777858,0.754858,0.886840,-2.243519,1.224564,-1.590144,0,0,1.917064
TRAIN_7496,0.538473,0.896426,0.886840,-0.446979,0.652658,-0.530272,0,0,-1.281597
TRAIN_7497,-0.897837,-0.660820,-0.573517,1.349561,-1.063058,1.124089,2,1,0.850844
TRAIN_7498,-0.419067,0.259371,0.156662,-0.446979,0.938611,0.067220,2,1,-0.452314


## Auto-Sklearn

In [57]:
# Create and train the model.
# Configuration: 2000 s total search budget, 5 parallel jobs, 5-fold
# cross-validation, MSE as the optimisation metric, and the feature
# preprocessor restricted to "polynomial".
automl = autosklearn.regression.AutoSklearnRegressor(
    time_left_for_this_task=2000,
    n_jobs=5,
    resampling_strategy='cv',
    resampling_strategy_arguments={"folds": 5},
    metric=autosklearn.metrics.mean_squared_error,
    include={"feature_preprocessor": ["polynomial"]},
)


# Search for best model
automl.fit(X_train, y_train)

# Predict on the test set (no performance metric is computed in this cell).
# NOTE(review): with resampling_strategy='cv', auto-sklearn's documentation
# describes calling refit() on the full training data before predicting -
# confirm whether that is wanted for the installed version.
y_pred = automl.predict(X_test)

# Warnings emitted during the run mean that some algorithms could not be used.



In [58]:
# Load the sample submission, overwrite its second column with the
# predictions, and save the result without the row index.
# NOTE(review): the output file name mentions "lgbm" although the predictions
# come from auto-sklearn - consider renaming.
submit = pd.read_csv("./sample_submission.csv")
submit.iloc[:,1] = y_pred
submit.to_csv('./submit_lgbm15.csv', index=False)

In [59]:
# Print auto-sklearn's summary of the search (metric, best validation score, run counts).
print(automl.sprint_statistics())

auto-sklearn results:
  Dataset name: ec2c6ee7-de65-11ed-a457-0242ac110002
  Metric: mean_squared_error
  Best validation score: 0.084199
  Number of target algorithm runs: 125
  Number of successful target algorithm runs: 102
  Number of crashed target algorithm runs: 6
  Number of target algorithms that exceeded the time limit: 6
  Number of target algorithms that exceeded the memory limit: 11



## 평가

##### polynomial, 5-fold, 2000초 : RMSE 0.2914599676 점

auto-sklearn results: <p>
  Dataset name: ec2c6ee7-de65-11ed-a457-0242ac110002 <p>
  Metric: mean_squared_error <p>
  Best validation score: 0.084199 <p>
  Number of target algorithm runs: 125 <p>
  Number of successful target algorithm runs: 102 <p>
  Number of crashed target algorithm runs: 6 <p>
  Number of target algorithms that exceeded the time limit: 6 <p>
  Number of target algorithms that exceeded the memory limit: 11 <p>
