In [1]:
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [2]:
# Directory that holds the competition CSV files. Both input paths are derived
# from it, so relocating the data only requires changing this one constant.
DATA_DIR = 'C:/Users/Ted/Desktop/Research/Projects/calories/open'
train_path = f'{DATA_DIR}/train.csv'
test_path = f'{DATA_DIR}/test.csv'

In [3]:
# Read the training and test CSV files into DataFrames, in that order.
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [4]:
# Show the (rows, columns) shape of each loaded frame as the cell output.
train.shape, test.shape

((7500, 11), (7500, 10))

In [5]:
def feature_engineering(df):
    """Replace the imperial measurement columns of ``df`` with metric ones, in place.

    Adds ``Height(cm)``, ``Body_Temperature(C)`` and ``Weight(kg)`` (appended in
    that order) and then removes ``ID`` together with the imperial source columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``ID``, ``Height(Feet)``, ``Height(Remainder_Inches)``,
        ``Body_Temperature(F)`` and ``Weight(lb)``. It is modified in place.

    Returns
    -------
    None
    """
    # Height: feet and leftover inches combined into centimetres.
    feet, inches = df['Height(Feet)'], df['Height(Remainder_Inches)']
    df['Height(cm)'] = feet * 30.48 + inches * 2.54

    # Temperature: Fahrenheit to Celsius.
    fahrenheit = df['Body_Temperature(F)']
    df['Body_Temperature(C)'] = (fahrenheit - 32) * 5 / 9

    # Weight: pounds to kilograms.
    pounds = df['Weight(lb)']
    df['Weight(kg)'] = pounds * 0.45359237

    # The identifier and the converted source columns are no longer needed.
    obsolete_columns = [
        'ID',
        'Body_Temperature(F)',
        'Height(Feet)',
        'Height(Remainder_Inches)',
        'Weight(lb)',
    ]
    df.drop(columns=obsolete_columns, inplace=True)

In [6]:
# Apply the in-place unit conversion to the training frame, then the test frame.
for frame in (train, test):
    feature_engineering(frame)

In [7]:
# Encode the 'Weight_Status' and 'Gender' columns
from sklearn.preprocessing import LabelEncoder

In [8]:
# Columns that preprocessing() label-encodes.
categorical_features = [
    'Weight_Status',
    'Gender',
]

# Remaining feature columns; the last three are created by feature_engineering().
# This list is not referenced anywhere else in the visible notebook.
numerical_features = [
    'Exercise_Duration',
    'BPM',
    'Age',
    'Height(cm)',
    'Body_Temperature(C)',
    'Weight(kg)',
]

In [9]:
def preprocessing(train, test):
    """Label-encode every column listed in ``categorical_features``, in place.

    For each column an encoder is fitted on ``train`` only. Labels that occur
    in ``test`` but were not seen in ``train`` are appended to the encoder's
    class list before ``test`` is transformed.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames holding the categorical columns; both are modified in place.

    Returns
    -------
    None
    """
    for column in categorical_features:
        encoder = LabelEncoder().fit(train[column])
        train[column] = encoder.transform(train[column])

        # Register test-only labels so transform() has a class entry for them.
        unseen = [
            value
            for value in np.unique(test[column])
            if value not in encoder.classes_
        ]
        if unseen:
            encoder.classes_ = np.append(encoder.classes_, unseen)

        test[column] = encoder.transform(test[column])

In [10]:
# Label-encode the categorical columns of both frames; preprocessing() writes
# the encoded values back into `train` and `test` and returns nothing.
preprocessing(train, test)

In [11]:
# Target data: keep the label column on its own before it is removed from the features.
train_target = train['Calories_Burned']

In [12]:
# Remove the target so `train` holds feature columns only.
# NOTE(review): `train` is rebound here, so re-running this cell (or the one
# that reads the target) without reloading the data should fail on the missing column.
train = train.drop(columns='Calories_Burned')

In [13]:
# Standardise the features first, then expand them with degree-2 polynomial
# terms (no bias column). Both transformers are fitted on the training frame
# and re-applied unchanged to the test frame.
# NOTE(review): the scaler is fitted on every training row before the
# train/validation split made later in the notebook, so the validation rows
# contribute to its statistics.
ss = StandardScaler()
train_scaled = ss.fit_transform(train)
test_scaled = ss.transform(test)

poly = PolynomialFeatures(degree=2, include_bias=False)
train_poly = poly.fit_transform(train_scaled)
test_poly = poly.transform(test_scaled)

In [14]:
# Split the dataset: 30% of the expanded training rows are held out for
# validation, with a fixed random_state so the split is repeatable.
from sklearn.model_selection import train_test_split

train_X, val_X, train_Y, val_Y = train_test_split(
    train_poly,
    train_target,
    test_size=0.30,
    random_state=42,
)

In [15]:
import tensorflow as tf
from sklearn.model_selection import GridSearchCV, KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor


def create_model(hidden_layers=1, neurons=64, dropout_rate=0.0):
    """Build and compile a fully connected regression network.

    Parameters
    ----------
    hidden_layers : int
        Number of ReLU hidden layers to stack.
    neurons : int
        Units in each hidden layer.
    dropout_rate : float
        When greater than zero, a Dropout layer with this rate follows every
        hidden layer.

    Returns
    -------
    A compiled ``Sequential`` model with a single linear output unit, trained
    with mean-squared-error loss and the Adam optimizer.

    Notes
    -----
    The input width is read from the notebook-level ``train_X``, so that
    variable must exist before this function is called.
    """
    network = Sequential()
    use_dropout = dropout_rate > 0.0

    for depth in range(hidden_layers):
        # Only the first hidden layer declares the input width.
        first_layer_args = {'input_dim': train_X.shape[1]} if depth == 0 else {}
        network.add(Dense(neurons, activation='relu', **first_layer_args))
        if use_dropout:
            network.add(Dropout(dropout_rate))

    network.add(Dense(1, activation='linear'))
    network.compile(loss='mean_squared_error', optimizer='adam')
    return network

# Seed NumPy and TensorFlow before any model is built so that a fresh
# Restart & Run All starts the search from the same random state.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Wrap the Keras builder so scikit-learn can clone, fit and score it.
# NOTE(review): the recorded output echoes this line, which looks like a
# deprecation warning for this wrapper — confirm against the installed
# TensorFlow version before upgrading.
model = KerasRegressor(build_fn=create_model, epochs=100, batch_size=32, verbose=0)

# Hyper-parameters to search; the keys must match create_model's argument names.
param_grid = {
    'hidden_layers': [1, 2, 3],
    'neurons': [32, 64, 128],
    'dropout_rate': [0.0, 0.1, 0.2]
}

# 5-fold cross-validated grid search. The metric is stated explicitly so that
# best_score_ is a negated MSE regardless of what the estimator's own score()
# method returns; the print below flips the sign back to a plain MSE.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=KFold(n_splits=5),
)
grid_result = grid.fit(train_X, train_Y)

# Report the best cross-validated MSE and the parameters that produced it.
print(f"Best score: {-grid_result.best_score_:.2f} using {grid_result.best_params_}")

  model = KerasRegressor(build_fn=create_model, epochs=100, batch_size=32, verbose=0)


Best score: 0.34 using {'dropout_rate': 0.0, 'hidden_layers': 1, 'neurons': 128}


In [16]:
# Take the estimator that the grid search configured with the best parameters.
best_model = grid_result.best_estimator_

# Train it on the training split. fit() is the last expression of the cell, so
# its return value (a Keras History object in the recorded output) is displayed.
# NOTE(review): GridSearchCV refits the best estimator by default, so this
# second fit may be redundant — confirm it is intentional.
best_model.fit(train_X, train_Y)

<keras.callbacks.History at 0x1c2c4eb22e0>

In [17]:
# Predict the target for the held-out validation rows.
y_pred = best_model.predict(val_X)

In [18]:
# Validation RMSE: square root of the mean squared error between the
# held-out targets and the predictions.
val_mse = mean_squared_error(val_Y, y_pred)
rmse = np.sqrt(val_mse)

In [19]:
# Display the validation RMSE.
rmse

0.49499855147936256

In [20]:
# Predict the target for the test rows, using the test features that were
# scaled and polynomial-expanded with the transformers fitted on the training data.
y_pred_test = best_model.predict(test_poly)

In [21]:
# Display the test-set predictions.
y_pred_test

array([173.01826 , 189.30405 ,  53.761917, ..., 130.02374 ,  31.908773,
       191.01556 ], dtype=float32)

In [22]:
# Load the sample submission file; it serves as the template for the output CSV.
sample_submission = pd.read_csv('C:/Users/Ted/Desktop/Research/Projects/calories/open/sample_submission.csv')

In [23]:
# Fill the template's target column with the test predictions and write the
# result to the working directory without the index column.
# NOTE(review): the predictions are assigned by position — this assumes the
# sample submission rows are in the same order as test.csv; confirm.
sample_submission['Calories_Burned'] = y_pred_test
sample_submission.to_csv('submission_tf_gridcv.csv', index=False)