# Set up Notebook

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# All specific imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

# Set up data

In [4]:
# Locations of the training CSV and the test CSV.
TRAIN_DATA_PATH = "sample_data/obesity_train.csv"  # <-- EDIT THIS
TEST_DATA_PATH = "sample_data/obesity_test.csv"  # <-- EDIT THIS

# Column used as the row index when reading both files.
index_col = "id"  # <-- EDIT THIS, Default is None

# Read both files with the same settings: training file first, then the test file.
real_train_df, real_test_df = (
    pd.read_csv(csv_path, index_col=index_col)
    for csv_path in (TRAIN_DATA_PATH, TEST_DATA_PATH)
)

In [5]:
# Work on a copy so the frame loaded from disk (`real_train_df`) stays untouched.
df = real_train_df.copy()

In [6]:
resp = "NObeyesdad"  # <-- EDIT THIS

# Every column other than the response is treated as a predictor.
preds = [name for name in df.columns if name != resp]

# Partition the predictors by dtype: numeric columns vs. everything else.
preds_num = df[preds].select_dtypes(include="number").columns.tolist()
preds_cat = df[preds].select_dtypes(exclude="number").columns.tolist()

# Feel free to modify the preds here

# Quick Data Pre-Analysis

In [7]:
# TODO

In [8]:
# Preview the first ten rows to eyeball column names and value formats.
df.head(10)

Unnamed: 0_level_0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II
5,Male,18.128249,1.748524,51.552595,yes,yes,2.919751,3.0,Sometimes,no,2.13755,no,1.930033,1.0,Sometimes,Public_Transportation,Insufficient_Weight
6,Male,29.883021,1.754711,112.725005,yes,yes,1.99124,3.0,Sometimes,no,2.0,no,0.0,0.696948,Sometimes,Automobile,Obesity_Type_II
7,Male,29.891473,1.75015,118.206565,yes,yes,1.397468,3.0,Sometimes,no,2.0,no,0.598655,0.0,Sometimes,Automobile,Obesity_Type_II
8,Male,17.0,1.7,70.0,no,yes,2.0,3.0,Sometimes,no,3.0,yes,1.0,1.0,no,Public_Transportation,Overweight_Level_I
9,Female,26.0,1.638836,111.275646,yes,yes,3.0,3.0,Sometimes,no,2.632253,no,0.0,0.218645,Sometimes,Public_Transportation,Obesity_Type_III


In [9]:
# List each column's dtype and non-null count to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20758 entries, 0 to 20757
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          20758 non-null  object 
 1   Age                             20758 non-null  float64
 2   Height                          20758 non-null  float64
 3   Weight                          20758 non-null  float64
 4   family_history_with_overweight  20758 non-null  object 
 5   FAVC                            20758 non-null  object 
 6   FCVC                            20758 non-null  float64
 7   NCP                             20758 non-null  float64
 8   CAEC                            20758 non-null  object 
 9   SMOKE                           20758 non-null  object 
 10  CH2O                            20758 non-null  float64
 11  SCC                             20758 non-null  object 
 12  FAF                             

In [10]:
# Summary statistics; with default arguments only the numeric columns are reported.
df.describe()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0
mean,23.841804,1.700245,87.887768,2.445908,2.761332,2.029418,0.981747,0.616756
std,5.688072,0.087312,26.379443,0.533218,0.705375,0.608467,0.838302,0.602113
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,20.0,1.631856,66.0,2.0,3.0,1.792022,0.008013,0.0
50%,22.815416,1.7,84.064875,2.393837,3.0,2.0,1.0,0.573887
75%,26.0,1.762887,111.600553,3.0,3.0,2.549617,1.587406,1.0
max,61.0,1.975663,165.057269,3.0,4.0,3.0,3.0,2.0


In [11]:
# Show the distinct values of every categorical predictor.
# Pad names to the longest one so the value lists line up; a fixed width of 15
# is too short for names such as `family_history_with_overweight`.
name_width = max(map(len, preds_cat), default=0)
for category in preds_cat:
    print(f"{category:<{name_width}}: {df[category].unique()}")

Gender         : ['Male' 'Female']
family_history_with_overweight: ['yes' 'no']
FAVC           : ['yes' 'no']
CAEC           : ['Sometimes' 'Frequently' 'no' 'Always']
SMOKE          : ['no' 'yes']
SCC            : ['no' 'yes']
CALC           : ['Sometimes' 'no' 'Frequently']
MTRANS         : ['Public_Transportation' 'Automobile' 'Walking' 'Motorbike' 'Bike']


# Encode Data

In [12]:
# NOTE: Re-splitting from the untouched `real_train_df` and copying the parts
# keeps this cell idempotent: every re-run starts from the raw values instead
# of re-encoding columns that were already encoded.
train_df, test_df = train_test_split(real_train_df, test_size = 0.3, random_state=42)
df = train_df.copy()
# Copy the hold-out part as well, so the column assignments below write to an
# independent frame; assigning into the split result directly can trigger
# pandas' SettingWithCopyWarning.
test_df = test_df.copy()

# The encoders are fitted on the training part only and then reused on the
# hold-out part. With handle_unknown='error', transform() raises on a category
# that was not seen during fit. Alternative settings to tolerate such values:
#   handle_unknown='use_encoded_value', unknown_value=-1, encoded_missing_value=-2
preds_cat_enc = OrdinalEncoder(dtype=int, handle_unknown='error')
resp_enc = OrdinalEncoder(dtype=int, handle_unknown='error') # <-- EDIT THIS depending on what resp is

df[preds_cat] = preds_cat_enc.fit_transform(df[preds_cat])
df[[resp]] = resp_enc.fit_transform(df[[resp]])

test_df[preds_cat] = preds_cat_enc.transform(test_df[preds_cat])
test_df[[resp]] = resp_enc.transform(test_df[[resp]])

In [13]:
# Categories learned by the encoder, one array per column in `preds_cat` order.
# NOTE(review): a category's position in its array should be its integer code — confirm against the OrdinalEncoder docs.
preds_cat_enc.categories_

[array(['Female', 'Male'], dtype=object),
 array(['no', 'yes'], dtype=object),
 array(['no', 'yes'], dtype=object),
 array(['Always', 'Frequently', 'Sometimes', 'no'], dtype=object),
 array(['no', 'yes'], dtype=object),
 array(['no', 'yes'], dtype=object),
 array(['Frequently', 'Sometimes', 'no'], dtype=object),
 array(['Automobile', 'Bike', 'Motorbike', 'Public_Transportation',
        'Walking'], dtype=object)]

# Mutual information

In [14]:
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

def make_mi_scores(X, y, categorical_features, random_state=42):
    """Rank the columns of ``X`` by mutual information with a class target.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Presumably all columns must already be numeric; this
        notebook ordinal-encodes the categorical columns before calling.
    y : array-like
        Class labels, one per row of ``X``.
    categorical_features : collection of str
        Names of the columns of ``X`` to flag as discrete. Names that are not
        columns of ``X`` have no effect.
    random_state : int, default 42
        Seed forwarded to ``mutual_info_classif``; the default keeps the
        previously hard-coded value.

    Returns
    -------
    pandas.Series
        Series named "MI Scores", indexed by column name and sorted from the
        highest score to the lowest.
    """
    # Boolean mask aligned with X.columns: True where the column is discrete.
    discrete_mask = [col in categorical_features for col in X.columns]
    scores = mutual_info_classif(
        X, y, discrete_features=discrete_mask, random_state=random_state
    )
    return pd.Series(scores, name="MI Scores", index=X.columns).sort_values(ascending=False)


Weight                            1.363314
Age                               0.794324
Height                            0.760617
CH2O                              0.512070
FCVC                              0.497405
TUE                               0.485103
FAF                               0.452150
Gender                            0.254999
NCP                               0.213612
family_history_with_overweight    0.168444
CAEC                              0.142202
CALC                              0.117869
MTRANS                            0.073387
FAVC                              0.044399
SCC                               0.025149
SMOKE                             0.004980
Name: MI Scores, dtype: float64

In [None]:
%%time
# Score every predictor against the response on the encoded training split (`df`).
make_mi_scores(df[preds], df[resp], categorical_features=preds_cat)
