In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tqdm
import seaborn as sns

from sklearn.metrics import roc_curve
from sklearn.metrics import auc

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# игнорировать предупреждения
import warnings
warnings.filterwarnings("ignore")

In [2]:
def calc_auc(y, y_pred, plot_label='', prin=True):
    fpr, tpr, _ = roc_curve(y, y_pred)
    auc_val = auc(fpr, tpr)
    if prin:
        print('ROC AUC: {0:.4f}'.format(auc_val))
    if plot_label:
        plt.plot(fpr, tpr, label=plot_label)
        plt.xlabel('FPR')
        plt.ylabel('TPR')
    return auc_val

In [3]:
data = pd.read_csv('tendency_to_obesity.csv')
data

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.000000,1.620000,64.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,0.000000,1.000000,no,Public_Transportation,Normal_Weight
1,Female,21.000000,1.520000,56.000000,yes,no,3.0,3.0,Sometimes,yes,3.000000,yes,3.000000,0.000000,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.000000,1.800000,77.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,2.000000,1.000000,Frequently,Public_Transportation,Normal_Weight
3,Male,27.000000,1.800000,87.000000,no,no,3.0,3.0,Sometimes,no,2.000000,no,2.000000,0.000000,Frequently,Walking,Overweight_Level_I
4,Male,22.000000,1.780000,89.800000,no,no,2.0,1.0,Sometimes,no,2.000000,no,0.000000,0.000000,Sometimes,Public_Transportation,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Female,20.976842,1.710730,131.408528,yes,yes,3.0,3.0,Sometimes,no,1.728139,no,1.676269,0.906247,Sometimes,Public_Transportation,Obesity_Type_III
2107,Female,21.982942,1.748584,133.742943,yes,yes,3.0,3.0,Sometimes,no,2.005130,no,1.341390,0.599270,Sometimes,Public_Transportation,Obesity_Type_III
2108,Female,22.524036,1.752206,133.689352,yes,yes,3.0,3.0,Sometimes,no,2.054193,no,1.414209,0.646288,Sometimes,Public_Transportation,Obesity_Type_III
2109,Female,24.361936,1.739450,133.346641,yes,yes,3.0,3.0,Sometimes,no,2.852339,no,1.139107,0.586035,Sometimes,Public_Transportation,Obesity_Type_III


In [4]:
for c in data.columns:
    if (len(data[ pd.isnull(data[c]) ])):
      print(c)

In [5]:
data['MTRANS'].unique()

array(['Public_Transportation', 'Walking', 'Automobile', 'Motorbike',
       'Bike'], dtype=object)

In [6]:
data.replace({ 
    'yes': 1, 
    'no': 0, 
    'Male': 1, 
    'Female': 0,
    'Sometimes': 1,
    'Frequently': 2,
    'Always': 3,
    'Automobile': 0,
    'Public_Transportation': 2,
    'Motorbike': 4,
    'Bike': 9,
    'Walking': 12
}, inplace=True)

In [7]:
data['Gender'] = data['Gender'].astype('int')
data['Age'] = data['Age'].astype('int')
data['Height'] = (data['Height'] * 100).astype('int')
data['Weight'] = (data['Weight'] * 10).astype('int')
data['FCVC'] = data['FCVC'].astype('int')
data['NCP'] = data['NCP'].astype('int')
data['CH2O'] = (data['CH2O'] * 100).astype('int')
data['FAF'] = (data['FAF'] * 100).astype('int')
data['TUE'] = (data['TUE'] * 100).astype('int')
data['family_history_with_overweight'] = data['family_history_with_overweight'].astype('int')
data['FAVC'] = data['FAVC'].astype('int')
data['CAEC'] = data['CAEC'].astype('int')
data['SMOKE'] = data['SMOKE'].astype('int')
data['SCC'] = data['SCC'].astype('int')
data['CALC'] = data['CALC'].astype('int')
data['MTRANS'] = data['MTRANS'].astype('int')
data['NObeyesdad'] = data['NObeyesdad'].astype('category')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype   
---  ------                          --------------  -----   
 0   Gender                          2111 non-null   int32   
 1   Age                             2111 non-null   int32   
 2   Height                          2111 non-null   int32   
 3   Weight                          2111 non-null   int32   
 4   family_history_with_overweight  2111 non-null   int32   
 5   FAVC                            2111 non-null   int32   
 6   FCVC                            2111 non-null   int32   
 7   NCP                             2111 non-null   int32   
 8   CAEC                            2111 non-null   int32   
 9   SMOKE                           2111 non-null   int32   
 10  CH2O                            2111 non-null   int32   
 11  SCC                             2111 non-null   int32   
 12  FAF                 

In [8]:
data

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,21,162,640,1,0,2,3,1,0,200,0,0,100,0,2,Normal_Weight
1,0,21,152,560,1,0,3,3,1,1,300,1,300,0,1,2,Normal_Weight
2,1,23,180,770,1,0,2,3,1,0,200,0,200,100,2,2,Normal_Weight
3,1,27,180,870,0,0,3,3,1,0,200,0,200,0,2,12,Overweight_Level_I
4,1,22,178,898,0,0,2,1,1,0,200,0,0,0,1,2,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,0,20,171,1314,1,1,3,3,1,0,172,0,167,90,1,2,Obesity_Type_III
2107,0,21,174,1337,1,1,3,3,1,0,200,0,134,59,1,2,Obesity_Type_III
2108,0,22,175,1336,1,1,3,3,1,0,205,0,141,64,1,2,Obesity_Type_III
2109,0,24,173,1333,1,1,3,3,1,0,285,0,113,58,1,2,Obesity_Type_III


In [9]:
#X1 = data[['Gender', 'Age', 'Height', 'Weight']]
y1 = data[['NObeyesdad']]
X1 = data.drop(['NObeyesdad'], axis=1)
#y1.replace({
#    'Overweight_Level_I': 'ow', 
#    'Overweight_Level_II': 'ow', 
#    'Obesity_Type_I': 'ow', 
#    'Obesity_Type_II': 'ow', 
#    'Obesity_Type_III': 'ow'
#}, inplace=True)

In [10]:
#sns.pairplot(pd.concat([X1, y1], axis=1), hue='NObeyesdad', diag_kind="kde", palette="colorblind");

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [12]:
from sklearn.tree import DecisionTreeClassifier

In [13]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=42)

In [14]:
gs1 = GridSearchCV(DecisionTreeClassifier(),
                  param_grid = {
                      'criterion': ['gini', 'enthropy'],
                      'max_depth': range(1, 20),
                      'splitter': ['best', 'random'],
                      'max_features': [None, 0.8]
                  })
gs1.fit(X1_train, y1_train)
print(classification_report(y1_test, gs1.predict(X1_test)))

                     precision    recall  f1-score   support

Insufficient_Weight       0.93      0.96      0.95        56
      Normal_Weight       0.92      0.89      0.90        62
     Obesity_Type_I       0.97      0.94      0.95        78
    Obesity_Type_II       0.93      0.98      0.96        58
   Obesity_Type_III       1.00      1.00      1.00        63
 Overweight_Level_I       0.90      0.93      0.91        56
Overweight_Level_II       0.96      0.92      0.94        50

           accuracy                           0.95       423
          macro avg       0.94      0.95      0.94       423
       weighted avg       0.95      0.95      0.95       423



In [15]:
gs1.best_params_

{'criterion': 'gini',
 'max_depth': 16,
 'max_features': None,
 'splitter': 'best'}

In [16]:
from sklearn.svm import LinearSVC

In [17]:
gs1 = GridSearchCV(LinearSVC(),
                  param_grid = {
                      'C': [0.001, 0.01, 0.1, 1, 10, 100]
                  })
gs1.fit(X1_train, y1_train)
print(classification_report(y1_test, gs1.predict(X1_test)))

                     precision    recall  f1-score   support

Insufficient_Weight       0.77      1.00      0.87        56
      Normal_Weight       0.75      0.39      0.51        62
     Obesity_Type_I       0.43      0.13      0.20        78
    Obesity_Type_II       0.25      1.00      0.40        58
   Obesity_Type_III       1.00      0.11      0.20        63
 Overweight_Level_I       0.53      0.48      0.50        56
Overweight_Level_II       0.00      0.00      0.00        50

           accuracy                           0.43       423
          macro avg       0.53      0.44      0.38       423
       weighted avg       0.54      0.43      0.38       423



In [18]:
from sklearn.linear_model import LogisticRegression

In [19]:
gs1 = GridSearchCV(LogisticRegression(),
                  param_grid = {
                      'C': [0.01, 0.1, 1],
                      'solver': ['newton-cg'],
                      'multi_class': ['ovr', 'multinomial']
                  })
gs1.fit(X1_train, y1_train)
print(classification_report(y1_test, gs1.predict(X1_test)))

                     precision    recall  f1-score   support

Insufficient_Weight       0.95      0.98      0.96        56
      Normal_Weight       0.98      0.89      0.93        62
     Obesity_Type_I       1.00      0.97      0.99        78
    Obesity_Type_II       0.97      1.00      0.98        58
   Obesity_Type_III       1.00      1.00      1.00        63
 Overweight_Level_I       0.90      0.96      0.93        56
Overweight_Level_II       0.96      0.96      0.96        50

           accuracy                           0.97       423
          macro avg       0.97      0.97      0.97       423
       weighted avg       0.97      0.97      0.97       423



In [20]:
gs1.best_params_

{'C': 0.1, 'multi_class': 'multinomial', 'solver': 'newton-cg'}

In [21]:
from sklearn.linear_model import SGDClassifier

In [22]:
gs1 = GridSearchCV(SGDClassifier(),
                  param_grid = {
                      'epsilon': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
                      'loss': ['hinge', 'log', 'huber'],
                      'alpha': [0.0001, 0.001, 0.01]
                  })
gs1.fit(X1_train, y1_train)
print(classification_report(y1_test, gs1.predict(X1_test)))

                     precision    recall  f1-score   support

Insufficient_Weight       0.79      1.00      0.88        56
      Normal_Weight       0.91      0.16      0.27        62
     Obesity_Type_I       0.00      0.00      0.00        78
    Obesity_Type_II       0.36      0.50      0.42        58
   Obesity_Type_III       0.64      1.00      0.78        63
 Overweight_Level_I       0.41      0.23      0.30        56
Overweight_Level_II       0.27      0.70      0.39        50

           accuracy                           0.49       423
          macro avg       0.48      0.51      0.43       423
       weighted avg       0.47      0.49      0.42       423



In [23]:
from sklearn.ensemble import ExtraTreesClassifier

et = ExtraTreesClassifier(n_estimators=50, min_samples_split=2, max_depth=14, max_features=0.8, max_leaf_nodes=16, min_samples_leaf=20, n_jobs=-1, random_state=42)
et.fit(X1_train, y1_train)

print(classification_report(y1_test, gs1.predict(X1_test)))

                     precision    recall  f1-score   support

Insufficient_Weight       0.79      1.00      0.88        56
      Normal_Weight       0.91      0.16      0.27        62
     Obesity_Type_I       0.00      0.00      0.00        78
    Obesity_Type_II       0.36      0.50      0.42        58
   Obesity_Type_III       0.64      1.00      0.78        63
 Overweight_Level_I       0.41      0.23      0.30        56
Overweight_Level_II       0.27      0.70      0.39        50

           accuracy                           0.49       423
          macro avg       0.48      0.51      0.43       423
       weighted avg       0.47      0.49      0.42       423

