## Calories Prediction

### 차별화
##### PolynomialFeatures의 degree=3, interaction_only=True를 사용하여 교호작용 항목만 생성
##### 가장 기본적인 LinearRegression만을 사용하여 학습

In [1]:
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Locations of the training and test CSV files.
# NOTE(review): these are absolute paths on one Windows machine; adjust them
# before running the notebook anywhere else.
train_path = 'C:/Users/Ted/Desktop/research/projects/calorie_prediction/dataset/train.csv'
test_path = 'C:/Users/Ted/Desktop/research/projects/calorie_prediction/dataset/test.csv'

In [3]:
# Load both CSV files into DataFrames.
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [4]:
# Display the (rows, columns) shape of each frame.
train.shape, test.shape

((7500, 11), (7500, 10))

## EDA

##### 학습 데이터 중복 확인

In [5]:
# Count fully duplicated rows in the training data.
train.duplicated().sum()

0

###### 데이터 중복값 없음

##### 데이터 결측치 및 타입 확인

In [6]:
# Print column dtypes and non-null counts for the training data.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        7500 non-null   object 
 1   Exercise_Duration         7500 non-null   float64
 2   Body_Temperature(F)       7500 non-null   float64
 3   BPM                       7500 non-null   float64
 4   Height(Feet)              7500 non-null   float64
 5   Height(Remainder_Inches)  7500 non-null   float64
 6   Weight(lb)                7500 non-null   float64
 7   Weight_Status             7500 non-null   object 
 8   Gender                    7500 non-null   object 
 9   Age                       7500 non-null   int64  
 10  Calories_Burned           7500 non-null   float64
dtypes: float64(7), int64(1), object(3)
memory usage: 644.7+ KB


In [7]:
# Print column dtypes and non-null counts for the test data.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        7500 non-null   object 
 1   Exercise_Duration         7500 non-null   float64
 2   Body_Temperature(F)       7500 non-null   float64
 3   BPM                       7500 non-null   float64
 4   Height(Feet)              7500 non-null   float64
 5   Height(Remainder_Inches)  7500 non-null   float64
 6   Weight(lb)                7500 non-null   float64
 7   Weight_Status             7500 non-null   object 
 8   Gender                    7500 non-null   object 
 9   Age                       7500 non-null   int64  
dtypes: float64(6), int64(1), object(3)
memory usage: 586.1+ KB


###### 데이터 결측치 없음

##### 특성공학
###### Height(Feet) 와 Height(Remainder_Inches) 를 이용하여 Height(cm) 컬럼 생성
###### ℉ 화씨 온도를 ℃ 섭씨 온도로 변환한 Body_Temperature(C) 컬럼 생성
###### lb(파운드) 를 kg 으로 변환한 Weight(kg) 컬럼 생성
###### 학습과 분석에 불필요한 컬럼들 삭제

In [8]:
def feature_engineering(df):
    """Add metric-unit columns to ``df`` in place and drop the raw columns.

    New columns, appended in this order:
      * ``Height(cm)``          - feet * 30.48 + remainder inches * 2.54
      * ``Body_Temperature(C)`` - (Fahrenheit - 32) * 5 / 9
      * ``Weight(kg)``          - pounds * 0.45359237

    Afterwards ``ID`` and the four imperial-unit source columns are removed.
    The frame is mutated directly; nothing is returned.
    """
    feet = df['Height(Feet)']
    inches = df['Height(Remainder_Inches)']
    df['Height(cm)'] = feet * 30.48 + inches * 2.54

    fahrenheit = df['Body_Temperature(F)']
    df['Body_Temperature(C)'] = (fahrenheit - 32) * 5 / 9

    pounds = df['Weight(lb)']
    df['Weight(kg)'] = pounds * 0.45359237

    # The identifier and the original imperial columns are no longer needed.
    obsolete_columns = [
        'ID',
        'Body_Temperature(F)',
        'Height(Feet)',
        'Height(Remainder_Inches)',
        'Weight(lb)',
    ]
    df.drop(columns=obsolete_columns, inplace=True)

##### feature_engineering(df) 함수 실행

In [9]:
# Convert both frames in place.
# NOTE: feature_engineering drops the raw columns it reads, so running this
# cell a second time on the same frames raises KeyError.
feature_engineering(train)
feature_engineering(test)

###### 데이터 확인

In [10]:
# Display the training frame after feature engineering.
train

Unnamed: 0,Exercise_Duration,BPM,Weight_Status,Gender,Age,Calories_Burned,Height(cm),Body_Temperature(C),Weight(kg)
0,26.0,107.0,Normal Weight,F,45,166.0,175.26,40.888889,69.989303
1,7.0,88.0,Overweight,M,50,33.0,198.12,39.611111,102.012924
2,7.0,86.0,Overweight,M,29,23.0,190.50,39.611111,99.019214
3,17.0,99.0,Normal Weight,F,33,91.0,167.64,40.000000,66.995593
4,9.0,88.0,Normal Weight,M,38,32.0,177.80,39.277778,77.019984
...,...,...,...,...,...,...,...,...,...
7495,22.0,104.0,Normal Weight,F,75,151.0,147.32,40.611111,50.983782
7496,20.0,104.0,Normal Weight,F,21,114.0,172.72,40.722222,66.995593
7497,8.0,90.0,Overweight,M,57,41.0,187.96,39.500000,91.988533
7498,12.0,97.0,Overweight,M,35,57.0,175.26,40.222222,76.022081


### 데이터 전처리
###### 카테고리컬 데이터 인코딩
###### PolynomialFeatures 를 사용하여 다항식 특성 추가
###### StandardScaler 를 사용하여 수치형 데이터의 스케일을 통일

In [11]:
# 'Weight_Status', 'Gender'컬럼 인코딩
from sklearn.preprocessing import LabelEncoder

###### categorical / numerical features 선언

In [12]:
# Columns that preprocessing() label-encodes.
categorical_features = ['Weight_Status', 'Gender']
# Numeric feature columns.
# NOTE(review): this list is not referenced anywhere else in the visible notebook.
numerical_features = ['Exercise_Duration', 'BPM', 'Age', 'Height(cm)', 'Body_Temperature(C)','Weight(kg)']

###### LabelEncoder

In [13]:
def preprocessing(train, test):
    """Label-encode the categorical columns of ``train`` and ``test`` in place.

    For every column named in the module-level ``categorical_features`` list a
    fresh ``LabelEncoder`` is fitted on the training column and then applied to
    both frames. Labels that occur only in the test column are appended to the
    encoder's ``classes_`` before the test column is transformed, presumably so
    that ``transform`` does not fail on values it has not seen.

    Both frames are mutated directly; nothing is returned.
    """
    for column in categorical_features:
        encoder = LabelEncoder().fit(train[column])
        train[column] = encoder.transform(train[column])

        # Test-only labels are added after the classes learned from train.
        unseen = [
            value
            for value in np.unique(test[column])
            if value not in encoder.classes_
        ]
        if unseen:
            encoder.classes_ = np.append(encoder.classes_, unseen)

        test[column] = encoder.transform(test[column])

In [14]:
# Label-encode the categorical columns of both frames in place.
preprocessing(train, test)

In [15]:
# Target values: the column the model is trained to predict.
train_target = train['Calories_Burned']

###### 학습 데이터에서 타겟 컬럼 제거

In [34]:
# Feature matrix: the training frame without the target column.
train1 = train.drop(columns='Calories_Burned')
# The test frame has no target column; test1 is another name for the same
# object, not a copy.
test1 = test

###### PolynomialFeatures와 StandardScaler를 사용한 학습 및 테스트 데이터 변환

In [35]:
# Interaction-only expansion up to degree 3 (products of distinct columns,
# no powers of a single column). Fitted on the training features and reused
# unchanged for the test features.
# NOTE(review): include_bias is left at its default, so the expansion also
# emits a constant column; after scaling this is presumably the all-zero
# first column visible in train_X.
poly = PolynomialFeatures(degree=3, interaction_only=True)
train_poly = pd.DataFrame(poly.fit_transform(train1))
test_poly = pd.DataFrame(poly.transform(test1))

# Standardize every expanded column using statistics learned from the
# training data only, then apply the same scaling to the test data.
# NOTE(review): the scaler is fitted on all training rows before the
# validation split, so validation rows influence the scaling statistics.
ss = StandardScaler()
train_poly = ss.fit_transform(train_poly)
test_poly = ss.transform(test_poly)


### 모델 학습 및 검증

In [36]:
# Split the dataset: hold out 30% of the rows for validation, shuffled with a
# fixed seed so the split is reproducible.
from sklearn.model_selection import train_test_split

train_X, val_X, train_Y, val_Y = train_test_split(train_poly, train_target, test_size=0.30, random_state=42, shuffle=True)

In [37]:
# Display the training split of the transformed feature matrix.
train_X

array([[ 0.        , -0.17968237,  0.36528415, ..., -0.78048491,
        -0.63385835, -1.13365203],
       [ 0.        , -0.8978373 , -0.99076233, ...,  1.33288419,
         1.53040175,  0.20498861],
       [ 0.        , -1.25691477, -1.30369614, ...,  0.11252801,
         0.23542591, -0.92751608],
       ...,
       [ 0.        , -0.53875983, -1.61662994, ...,  0.15691341,
         0.0705335 ,  0.908528  ],
       [ 0.        , -0.53875983, -1.19938487, ..., -0.1255449 ,
         0.0890654 , -0.84983805],
       [ 0.        , -1.61599224, -1.19938487, ..., -1.1332517 ,
        -1.20582331, -1.02737745]])

In [38]:
# Declare the model: a LinearRegression with default settings.
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [39]:
# Fit the model on the training split only.
lr.fit(train_X, train_Y)

###### 검증세트를 이용한 예측 및 rmse 확인

In [40]:
# Predict on the held-out validation split and report the RMSE.
pred = lr.predict(val_X)
# The `squared=False` argument was deprecated in scikit-learn 1.4 and removed
# in 1.6; taking the square root of the MSE gives the same RMSE on every
# scikit-learn version.
rmse = np.sqrt(mean_squared_error(val_Y, pred))
print(f'RMSE는 {rmse}입니다.')

RMSE는 0.2893959030869278입니다.


###### 테스트 세트를 이용한 데이터 예측 및 예측 값 저장

In [37]:
# y_pred_test = best_model.predict(test_poly)

In [38]:
# y_pred_test

In [39]:
# sample_submission = pd.read_csv('C:/Users/Ted/Desktop/research/projects/calorie_prediction/dataset/sample_submission.csv')

In [40]:
# sample_submission['Calories_Burned'] = y_pred_test
# sample_submission.to_csv('submission_tf_gpu_gridcv.csv', index=False)