In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data load

In [2]:
train = pd.read_csv("./data/Obesity Risk/train.csv")
test = pd.read_csv("./data/Obesity Risk/test.csv")

train.shape, test.shape

((20758, 18), (13840, 17))

In [6]:
train.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [7]:
test.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,20758,Male,26.899886,1.848294,120.644178,yes,yes,2.938616,3.0,Sometimes,no,2.825629,no,0.8554,0.0,Sometimes,Public_Transportation
1,20759,Female,21.0,1.6,66.0,yes,yes,2.0,1.0,Sometimes,no,3.0,no,1.0,0.0,Sometimes,Public_Transportation
2,20760,Female,26.0,1.643355,111.600553,yes,yes,3.0,3.0,Sometimes,no,2.621877,no,0.0,0.250502,Sometimes,Public_Transportation
3,20761,Male,20.979254,1.553127,103.669116,yes,yes,2.0,2.977909,Sometimes,no,2.786417,no,0.094851,0.0,Sometimes,Public_Transportation
4,20762,Female,26.0,1.627396,104.835346,yes,yes,3.0,3.0,Sometimes,no,2.653531,no,0.0,0.741069,Sometimes,Public_Transportation


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20758 non-null  int64  
 1   Gender                          20758 non-null  object 
 2   Age                             20758 non-null  float64
 3   Height                          20758 non-null  float64
 4   Weight                          20758 non-null  float64
 5   family_history_with_overweight  20758 non-null  object 
 6   FAVC                            20758 non-null  object 
 7   FCVC                            20758 non-null  float64
 8   NCP                             20758 non-null  float64
 9   CAEC                            20758 non-null  object 
 10  SMOKE                           20758 non-null  object 
 11  CH2O                            20758 non-null  float64
 12  SCC                             

In [5]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13840 entries, 0 to 13839
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              13840 non-null  int64  
 1   Gender                          13840 non-null  object 
 2   Age                             13840 non-null  float64
 3   Height                          13840 non-null  float64
 4   Weight                          13840 non-null  float64
 5   family_history_with_overweight  13840 non-null  object 
 6   FAVC                            13840 non-null  object 
 7   FCVC                            13840 non-null  float64
 8   NCP                             13840 non-null  float64
 9   CAEC                            13840 non-null  object 
 10  SMOKE                           13840 non-null  object 
 11  CH2O                            13840 non-null  float64
 12  SCC                             

# EDA

In [11]:
# index = id로 설정
train.set_index("id", drop=True, inplace=True)
test.set_index("id", drop=True, inplace=True)

In [21]:
# 수치형, 범주형 변수 리스트 저장
num_list = train.select_dtypes("number").columns.to_list()
cat_list = train.select_dtypes("object").columns.to_list()
print(num_list, "\n", cat_list)

['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE'] 
 ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS', 'NObeyesdad']


# Preprocessing

## data split

In [32]:
X = train[train.columns[:-1]]
y = train["NObeyesdad"]
X.shape, y.shape

((20758, 16), (20758,))

In [36]:
# Make Validation Set
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)

X_train.shape, X_val.shape, y_train.shape, y_val.shape

((16606, 16), (4152, 16), (16606,), (4152,))

## target encoding

In [37]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le_target = LabelEncoder()

In [40]:
y_train_encoded = le.fit_transform(y_train)
y_val_encoded = le.transform(y_val)

## Scaling

In [63]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

In [64]:
X_train_scaled = scaler.fit_transform(X_train.select_dtypes("number"))
X_val_scaled = scaler.transform(X_val.select_dtypes("number"))

In [67]:
X_train_new = pd.DataFrame(X_train_scaled,
                           index=X_train.select_dtypes("number").index, 
                           columns=X_train.select_dtypes("number").columns)

X_val_new = pd.DataFrame(X_val_scaled,
                         index=X_val.select_dtypes("number").index,
                         columns=X_val.select_dtypes("number").columns)

## Encoding

In [68]:
for col in cat_list[:-1]:
    X_train_new[col] = le.fit_transform(X_train[col])
    X_val_new[col] = le.transform(X_val[col])

In [69]:
X_train_new.head()

Unnamed: 0_level_0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
4515,0.190476,0.494613,0.404097,0.5,0.0,0.5,0.333333,1.0,1,1,1,2,0,0,1,3
7949,0.642857,0.361448,0.300886,1.0,0.0,0.5,0.0,0.0,0,1,1,2,0,0,1,0
20677,0.095238,0.665826,0.13416,0.5,1.0,0.5,0.666667,0.5,1,1,1,1,0,0,2,0
18079,0.095238,0.47559,0.3644,0.5,0.666667,0.5,0.666667,0.5,1,1,1,1,0,0,0,3
5129,0.207984,0.75985,0.655439,0.676162,0.566657,0.678989,0.561527,0.369804,1,1,1,2,0,0,1,3


# Random Forest Classifier

In [70]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)

## RandomSearchCV

In [71]:
max_depth = np.random.randint(1,20,10)
max_features = np.random.uniform(0.4, 1.0, 100)
param_distributions = {"max_depth": max_depth,
                      "max_features": max_features,
                      "min_samples_split": list(range(2,7))}

In [78]:
from sklearn.model_selection import RandomizedSearchCV

clf = RandomizedSearchCV(model,
                        param_distributions,
                        n_iter=100,
                        scoring="accuracy",
                        n_jobs=-1,
                        cv=5,
                        verbose=2,
                        random_state=42)

clf.fit(X_train_new, y_train_encoded)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'max_depth': array([13,  3, 17, 10,  6,  9,  4, 10,  4,  3]),
                                        'max_features': array([0.855014  , 0.43150107, 0.5871656 , 0.97339187, 0.85512933,
       0.87374648, 0.46395496, 0.90427884, 0.56822105, 0.93813469,
       0.42973065, 0.56950584, 0.84393697, 0.96254089, 0.91507866,
       0.913...
       0.42592843, 0.8350803 , 0.65414137, 0.65713483, 0.56314498,
       0.7572864 , 0.91729405, 0.53091145, 0.48311989, 0.80337594,
       0.40491084, 0.50898492, 0.96906643, 0.78840008, 0.99120997,
       0.98945169, 0.9678839 , 0.41225888, 0.81234006, 0.8189719 ,
       0.89260964, 0.50334203, 0.9983996 , 0.81368982, 0.61083423]),
                                        'min_samples_split': [2, 3, 4, 5, 6]},
                   random_state=42, scoring='accuracy', verbose=2)

In [79]:
clf.best_params_

{'min_samples_split': 4, 'max_features': 0.6234511467307055, 'max_depth': 13}

In [80]:
clf.best_score_

0.9000965885637182

In [81]:
pd.DataFrame(clf.cv_results_).sort_values(by="rank_test_score").head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_split,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
94,3.109859,0.037589,0.062814,0.003656,4,0.610834,13,"{'min_samples_split': 4, 'max_features': 0.610...",0.896147,0.900933,0.896718,0.901837,0.904848,0.900097,0.003266,1
72,3.039568,0.040955,0.062858,0.00261,4,0.623451,13,"{'min_samples_split': 4, 'max_features': 0.623...",0.896147,0.900933,0.896718,0.901837,0.904848,0.900097,0.003266,1
85,2.933826,0.033933,0.067215,0.002926,5,0.530911,17,"{'min_samples_split': 5, 'max_features': 0.530...",0.897652,0.900632,0.890394,0.903945,0.906052,0.899735,0.005478,3
22,2.719373,0.032146,0.067815,0.002136,4,0.463995,17,"{'min_samples_split': 4, 'max_features': 0.463...",0.899759,0.898223,0.895815,0.901837,0.900632,0.899253,0.002084,4
11,3.10381,0.025092,0.063015,0.001265,5,0.623451,13,"{'min_samples_split': 5, 'max_features': 0.623...",0.895545,0.900632,0.892803,0.902138,0.903945,0.899013,0.004178,5


In [82]:
clf.score(X_val_new, y_val_encoded)

0.9063102119460501

__trial #1__
- RandomForestClassifier
- RandomSearchCV
- MinMaxScaling, LabelEncoding
- accuracy: (best) 0.90

## Submission

In [109]:
test = pd.read_csv("./data/Obesity Risk/test.csv")
test.set_index("id", drop=True, inplace=True)
test.head()

Unnamed: 0_level_0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
20758,Male,26.899886,1.848294,120.644178,yes,yes,2.938616,3.0,Sometimes,no,2.825629,no,0.8554,0.0,Sometimes,Public_Transportation
20759,Female,21.0,1.6,66.0,yes,yes,2.0,1.0,Sometimes,no,3.0,no,1.0,0.0,Sometimes,Public_Transportation
20760,Female,26.0,1.643355,111.600553,yes,yes,3.0,3.0,Sometimes,no,2.621877,no,0.0,0.250502,Sometimes,Public_Transportation
20761,Male,20.979254,1.553127,103.669116,yes,yes,2.0,2.977909,Sometimes,no,2.786417,no,0.094851,0.0,Sometimes,Public_Transportation
20762,Female,26.0,1.627396,104.835346,yes,yes,3.0,3.0,Sometimes,no,2.653531,no,0.0,0.741069,Sometimes,Public_Transportation


In [97]:
test["CALC"].value_counts()

Sometimes     9979
no            3513
Frequently     346
Always           2
Name: CALC, dtype: int64

In [111]:
# CALC 별도 처리
cat_list_new = cat_list.remove("CALC")

In [114]:
# MinMaxScaling
test_new = scaler.transform(test.select_dtypes("number"))
test_new = pd.DataFrame(test_new,
                         index=test.select_dtypes("number").index,
                         columns=test.select_dtypes("number").columns)

# LabelEncoding
for col in cat_list[:-1]:
    le.fit_transform(X_train[col])
    test_new[col] = le.transform(test[col])

# CALC 별도 처리(따로 인코딩)
test_new["CALC"] = test["CALC"]
for i, val in enumerate(test["CALC"].unique()):
    test_new["CALC"] = test_new["CALC"].replace(val,i)

In [115]:
test_new.head()

Unnamed: 0_level_0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,MTRANS,CALC
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
20758,0.30714,0.757698,0.647391,0.969308,0.666667,0.912815,0.285133,0.0,1,1,1,2,0,0,3,0
20759,0.166667,0.285354,0.213553,0.5,0.0,1.0,0.333333,0.0,0,1,1,2,0,0,3,0
20760,0.285714,0.367831,0.57559,1.0,0.666667,0.810939,0.0,0.125251,0,1,1,2,0,0,3,0
20761,0.166173,0.196185,0.51262,0.5,0.659303,0.893209,0.031617,0.0,1,1,1,2,0,0,3,0
20762,0.285714,0.337471,0.521879,1.0,0.666667,0.826766,0.0,0.370534,0,1,1,2,0,0,3,0


target decoding

In [117]:
le_target = LabelEncoder()

In [118]:
y_train_encoded = le_target.fit_transform(y_train)
y_val_encoded = le_target.transform(y_val)

In [119]:
sub = clf.predict(test_new)
sub

array([3, 5, 4, ..., 0, 1, 3])

In [126]:
sub = pd.DataFrame({"NObeyesdad": le_target.inverse_transform(sub)}, index = test.index)
sub

Unnamed: 0_level_0,NObeyesdad
id,Unnamed: 1_level_1
20758,Obesity_Type_II
20759,Overweight_Level_I
20760,Obesity_Type_III
20761,Obesity_Type_I
20762,Obesity_Type_III
...,...
34593,Overweight_Level_II
34594,Overweight_Level_I
34595,Insufficient_Weight
34596,Normal_Weight


In [131]:
sub.to_csv("./data/Submission_Obesity.csv", index="id")

In [132]:
pd.read_csv("./data/Submission_Obesity.csv")

Unnamed: 0,id,NObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III
...,...,...
13835,34593,Overweight_Level_II
13836,34594,Overweight_Level_I
13837,34595,Insufficient_Weight
13838,34596,Normal_Weight
