In [3]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsOneClassifier
from sklearn.metrics import accuracy_score

In [4]:
# Location of the obesity dataset CSV. Set the OBESITY_DATA_PATH environment
# variable to point at your own copy of the file; when it is unset, the original
# local path is used so existing setups keep working unchanged.
DEFAULT_DATA_PATH = r"C:\Users\dipak\OneDrive\desktop\github_ai\ml_with_python\three_module\Multi-class-Classification\Obesity_level_prediction_dataset.csv"
file_path = os.environ.get("OBESITY_DATA_PATH", DEFAULT_DATA_PATH)

In [5]:
# Read the obesity dataset CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)


In [7]:
# Peek at five random rows; the fixed seed keeps the preview identical across re-runs
df.sample(5, random_state=42)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
494,Female,18.0,1.67,66.0,no,yes,3.0,3.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Normal_Weight
1469,Male,18.0,1.811738,108.897324,yes,yes,2.0,1.202179,Sometimes,no,2.36293,no,1.0,1.47574,no,Public_Transportation,Obesity_Type_I
770,Male,21.029633,1.607082,67.722222,yes,yes,2.0,3.691226,no,no,3.0,no,1.228136,0.3352,Sometimes,Public_Transportation,Overweight_Level_I
1526,Male,31.457413,1.87407,128.867444,yes,yes,2.956297,3.0,Sometimes,yes,1.2751,no,0.901924,1.875023,Sometimes,Automobile,Obesity_Type_II
1072,Male,19.443639,1.744733,87.27989,yes,yes,2.442536,3.0,Sometimes,no,2.825629,no,3.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [8]:
# Column names, non-null counts and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.3126,1.701677,86.586058,2.419043,2.685628,2.008011,1.010298,0.657866
std,6.345968,0.093305,26.191172,0.533927,0.778039,0.612953,0.850592,0.608927
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,19.947192,1.63,65.473343,2.0,2.658738,1.584812,0.124505,0.0
50%,22.77789,1.700499,83.0,2.385502,3.0,2.0,1.0,0.62535
75%,26.0,1.768464,107.430682,3.0,3.0,2.47742,1.666678,1.0
max,61.0,1.98,173.0,3.0,4.0,3.0,3.0,2.0


Feature Scaling:

In [11]:
# Standardize the continuous numerical features.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split further down, so test-set statistics influence the scaling.
# Consider fitting on the training rows only.
continous_columns = list(df.select_dtypes(include=["float64"]).columns)

scaler = StandardScaler()
scaler.fit(df[continous_columns])
scaled_features = scaler.transform(df[continous_columns])

In [15]:
# Wrap the scaled array in a DataFrame. Reusing df's index keeps the rows
# aligned when this frame is concatenated column-wise with the rest of df
# (pd.concat with axis=1 aligns on index labels, not on row position).
scaled_features_df = pd.DataFrame(
    scaled_features,
    columns=scaler.get_feature_names_out(continous_columns),
    index=df.index,
)
scaled_features_df.head()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
0,-0.522124,-0.875589,-0.862558,-0.785019,0.404153,-0.013073,-1.188039,0.561997
1,-0.522124,-1.947599,-1.168077,1.088342,0.404153,1.618759,2.33975,-1.080625
2,-0.206889,1.054029,-0.36609,-0.785019,0.404153,-0.013073,1.16382,0.561997
3,0.423582,1.054029,0.015808,1.088342,0.404153,-0.013073,1.16382,-1.080625
4,-0.364507,0.839627,0.12274,-0.785019,-2.167023,-0.013073,-1.188039,-1.080625


In [16]:
# Replace the raw continuous columns with their standardized counterparts
scaled_data = pd.concat(
    [df.drop(columns=continous_columns), scaled_features_df],
    axis="columns",
)

In [17]:
# Preview the combined frame: untouched non-float columns followed by the scaled ones
scaled_data.head()

Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS,NObeyesdad,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
0,Female,yes,no,Sometimes,no,no,no,Public_Transportation,Normal_Weight,-0.522124,-0.875589,-0.862558,-0.785019,0.404153,-0.013073,-1.188039,0.561997
1,Female,yes,no,Sometimes,yes,yes,Sometimes,Public_Transportation,Normal_Weight,-0.522124,-1.947599,-1.168077,1.088342,0.404153,1.618759,2.33975,-1.080625
2,Male,yes,no,Sometimes,no,no,Frequently,Public_Transportation,Normal_Weight,-0.206889,1.054029,-0.36609,-0.785019,0.404153,-0.013073,1.16382,0.561997
3,Male,no,no,Sometimes,no,no,Frequently,Walking,Overweight_Level_I,0.423582,1.054029,0.015808,1.088342,0.404153,-0.013073,1.16382,-1.080625
4,Male,no,no,Sometimes,no,no,Sometimes,Public_Transportation,Overweight_Level_II,-0.364507,0.839627,0.12274,-0.785019,-2.167023,-0.013073,-1.188039,-1.080625


In [18]:
# Identifying categorical columns

In [19]:
# Every object-typed column is treated as a categorical feature ...
categorical_columns = list(scaled_data.select_dtypes(include=["object"]).columns)
# ... except the target, which is encoded separately
categorical_columns.remove("NObeyesdad")

In [20]:
# Categorical feature columns that will be one-hot encoded (target excluded)
categorical_columns

['Gender',
 'family_history_with_overweight',
 'FAVC',
 'CAEC',
 'SMOKE',
 'SCC',
 'CALC',
 'MTRANS']

In [24]:
# One-hot encoder returning a dense array; drop="first" omits the first
# category of every feature from the encoded output
encoder = OneHotEncoder(
    drop="first",
    sparse_output=False,
)

In [25]:
# Learn the categories and encode the categorical columns in one step
encoded_features = encoder.fit_transform(scaled_data.loc[:, categorical_columns])

In [26]:
# Wrap the encoded array in a DataFrame with readable column names. Reusing
# scaled_data's index keeps the rows aligned when this frame is concatenated
# column-wise with it (pd.concat with axis=1 aligns on index labels).
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=scaled_data.index,
)

In [27]:
# Swap the raw categorical columns for their one-hot encoded columns
prepped_data = pd.concat(
    [scaled_data.drop(columns=categorical_columns), encoded_df],
    axis="columns",
)

In [28]:
# Encode the target as integer class codes.
# The labels are read from scaled_data (which still holds the original strings)
# rather than from prepped_data, so re-running this cell always yields the
# same code -> label mapping. The mapping is kept in target_class_names so
# that predicted codes can be translated back to obesity levels.
target_as_category = scaled_data["NObeyesdad"].astype("category")
target_class_names = dict(enumerate(target_as_category.cat.categories))
prepped_data["NObeyesdad"] = target_as_category.cat.codes
prepped_data.head()

Unnamed: 0,NObeyesdad,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Male,...,CAEC_no,SMOKE_yes,SCC_yes,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,1,-0.522124,-0.875589,-0.862558,-0.785019,0.404153,-0.013073,-1.188039,0.561997,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1,-0.522124,-1.947599,-1.168077,1.088342,0.404153,1.618759,2.33975,-1.080625,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,1,-0.206889,1.054029,-0.36609,-0.785019,0.404153,-0.013073,1.16382,0.561997,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,5,0.423582,1.054029,0.015808,1.088342,0.404153,-0.013073,1.16382,-1.080625,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,6,-0.364507,0.839627,0.12274,-0.785019,-2.167023,-0.013073,-1.188039,-1.080625,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [29]:
# Separate the feature matrix from the encoded target column
X = prepped_data.drop(columns=["NObeyesdad"])
y = prepped_data.loc[:, "NObeyesdad"]

In [31]:
# Hold out 20% of the rows for testing; stratify on y and fix the seed so the
# split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [1]:
# One-vs-all (one-vs-rest) logistic regression on the training split.
# This cell needs the imports cell to have run in the current kernel; the
# recorded NameError presumably comes from executing it right after a restart.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument in favour of wrapping the estimator in OneVsRestClassifier.
# Confirm against the installed version before upgrading.
model_ova = LogisticRegression(
    max_iter=1000,
    multi_class="ovr",
)
model_ova.fit(X_train, y_train)

NameError: name 'LogisticRegression' is not defined