In [1]:
import numpy as np 
import pandas as pd 

import plotly.express as px

from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

In [2]:
# Read the competition's training and test tables from the Kaggle input directory.
train_data = pd.read_csv(filepath_or_buffer='/kaggle/input/playground-series-s4e2/train.csv')
test_data = pd.read_csv(filepath_or_buffer='/kaggle/input/playground-series-s4e2/test.csv')

# Stack both tables (test rows first) and print one schema summary for the
# combined frame: column names, non-null counts and dtypes, without the
# memory-usage footer.
pd.concat(objs=[test_data, train_data]).info(memory_usage=False)

<class 'pandas.core.frame.DataFrame'>
Index: 34598 entries, 0 to 20757
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              34598 non-null  int64  
 1   Gender                          34598 non-null  object 
 2   Age                             34598 non-null  float64
 3   Height                          34598 non-null  float64
 4   Weight                          34598 non-null  float64
 5   family_history_with_overweight  34598 non-null  object 
 6   FAVC                            34598 non-null  object 
 7   FCVC                            34598 non-null  float64
 8   NCP                             34598 non-null  float64
 9   CAEC                            34598 non-null  object 
 10  SMOKE                           34598 non-null  object 
 11  CH2O                            34598 non-null  float64
 12  SCC                             34598

In [3]:
# Summary statistics of the numeric training columns, one row per column.
train_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,20758.0,10378.5,5992.46278,0.0,5189.25,10378.5,15567.75,20757.0
Age,20758.0,23.841804,5.688072,14.0,20.0,22.815416,26.0,61.0
Height,20758.0,1.700245,0.087312,1.45,1.631856,1.7,1.762887,1.975663
Weight,20758.0,87.887768,26.379443,39.0,66.0,84.064875,111.600553,165.057269
FCVC,20758.0,2.445908,0.533218,1.0,2.0,2.393837,3.0,3.0
NCP,20758.0,2.761332,0.705375,1.0,3.0,3.0,3.0,4.0
CH2O,20758.0,2.029418,0.608467,1.0,1.792022,2.0,2.549617,3.0
FAF,20758.0,0.981747,0.838302,0.0,0.008013,1.0,1.587406,3.0
TUE,20758.0,0.616756,0.602113,0.0,0.0,0.573887,1.0,2.0


In [4]:
# Count / unique / most-frequent summary of the object-typed training columns,
# one row per column.
train_data.describe(include='O').transpose()

Unnamed: 0,count,unique,top,freq
Gender,20758,2,Female,10422
family_history_with_overweight,20758,2,yes,17014
FAVC,20758,2,yes,18982
CAEC,20758,4,Sometimes,17529
SMOKE,20758,2,no,20513
SCC,20758,2,no,20071
CALC,20758,3,Sometimes,15066
MTRANS,20758,5,Public_Transportation,16687
NObeyesdad,20758,7,Obesity_Type_III,4046


In [5]:
# Distribution of the target: a horizontal histogram of the NObeyesdad labels
# found in the training data.
fig = px.histogram(
    train_data['NObeyesdad'],
    orientation='h',
    height=250,
)
fig.show()

In [6]:
# Pairwise correlation of the numeric training columns; the row identifier is
# removed first so it does not appear in the matrix.
features_without_id = train_data.drop(columns=['id'])
corr = features_without_id.corr(numeric_only=True)

# Annotated heatmap of the matrix, with the column labels moved above the plot.
fig = px.imshow(corr, text_auto=True, aspect='auto')
fig.update_xaxes(side="top")
fig.show()

In [7]:
# Separate the target column from the remaining columns of the training table.
y = train_data['NObeyesdad']
X = train_data.drop(columns='NObeyesdad')
# NOTE(review): only the target is dropped here, so `id` stays in X and is
# passed to the model as a feature. Removing it needs a matching change
# wherever the fitted model is applied to test_data - confirm and handle both
# places together.

# Hold out 30% of the rows, stratified on the target so that both parts keep
# the same class proportions; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

X_train.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
7219,7219,Female,26.0,1.622771,111.949972,yes,yes,3.0,3.0,Sometimes,no,2.784303,no,0.0,0.123861,Sometimes,Public_Transportation
10037,10037,Female,25.930376,1.71582,112.95611,yes,yes,3.0,3.0,Sometimes,no,2.738485,no,0.037202,0.153559,Sometimes,Public_Transportation
9426,9426,Male,31.761799,1.755938,120.021161,yes,yes,2.9673,3.0,Sometimes,no,2.425927,no,0.955317,0.0,Sometimes,Automobile
19859,19859,Male,25.027254,1.777971,114.482386,yes,yes,1.108663,3.0,Sometimes,no,2.008361,no,0.336795,0.009254,Sometimes,Public_Transportation
1368,1368,Female,24.0,1.63,84.0,yes,yes,3.0,1.0,Sometimes,no,3.0,no,1.0,0.0,Sometimes,Public_Transportation


In [8]:
# Categorical inputs are the object-typed columns of the frame that is actually
# being fitted. Taking them from X_train (rather than slicing the last object
# column off train_data by position) keeps the target out of the list and keeps
# every real feature in it, regardless of column order.
cat_features = X_train.select_dtypes('O').columns.tolist()

# Class-balanced gradient boosting: at most 1000 trees, stopping early once the
# metric on the evaluation set has not improved for 50 consecutive iterations.
clf = CatBoostClassifier(
    auto_class_weights='Balanced',
    iterations=1000,
    early_stopping_rounds=50,
)

# The held-out split serves as the evaluation set that drives early stopping;
# progress is logged every 50 iterations.
clf.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    cat_features=cat_features,
    verbose=50,
)

Learning rate set to 0.113913
0:	learn: 1.6141261	test: 1.6125423	best: 1.6125423 (0)	total: 272ms	remaining: 4m 31s
50:	learn: 0.4022994	test: 0.4074988	best: 0.4074988 (50)	total: 7.57s	remaining: 2m 20s
100:	learn: 0.3304993	test: 0.3487029	best: 0.3487029 (100)	total: 14.3s	remaining: 2m 7s
150:	learn: 0.2927595	test: 0.3251017	best: 0.3251017 (150)	total: 21.1s	remaining: 1m 58s
200:	learn: 0.2701621	test: 0.3143903	best: 0.3143903 (200)	total: 27.9s	remaining: 1m 50s
250:	learn: 0.2549025	test: 0.3093778	best: 0.3093778 (250)	total: 34.9s	remaining: 1m 44s
300:	learn: 0.2407981	test: 0.3063556	best: 0.3063556 (300)	total: 41.7s	remaining: 1m 36s
350:	learn: 0.2293255	test: 0.3041169	best: 0.3040305 (348)	total: 48.2s	remaining: 1m 29s
400:	learn: 0.2174276	test: 0.3024730	best: 0.3024730 (400)	total: 54.9s	remaining: 1m 22s
450:	learn: 0.2076878	test: 0.3021314	best: 0.3021314 (450)	total: 1m 1s	remaining: 1m 14s
500:	learn: 0.1977045	test: 0.3008828	best: 0.3008190 (498)	total: 

<catboost.core.CatBoostClassifier at 0x7909767dabc0>

In [9]:
# Feed the model exactly the columns it was fitted on, in the fitted order, so
# prediction does not depend on test_data's column layout.
test_features = test_data[clf.feature_names_]

# Flatten the predicted labels to 1-D (the model returns them as a column
# array) and key them by the id of the row they belong to.
test_predictions = pd.Series(
    np.asarray(clf.predict(test_features)).ravel(),
    index=test_data['id'],
    name='NObeyesdad',
)

submission = pd.read_csv('/kaggle/input/playground-series-s4e2/sample_submission.csv', index_col='id')

# Align on id rather than on row position: a positional assignment would
# silently attach labels to the wrong ids if the two files were ordered
# differently.
submission['NObeyesdad'] = test_predictions.reindex(submission.index)

# Fail loudly instead of writing a submission with blank predictions.
n_missing = int(submission['NObeyesdad'].isna().sum())
if n_missing:
    raise ValueError(f'{n_missing} submission ids have no matching row in test_data')

submission.to_csv('submission.csv')
submission.head()

Unnamed: 0_level_0,NObeyesdad
id,Unnamed: 1_level_1
20758,Obesity_Type_II
20759,Overweight_Level_I
20760,Obesity_Type_III
20761,Obesity_Type_I
20762,Obesity_Type_III
