# Data Preprocessing

In [12]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [13]:
# Step 1: Load the dataset
# Read the raw CSV from the working directory into the frame that every
# later cell operates on.
dataset_path = "dataset.csv"
df = pd.read_csv(dataset_path)

In [14]:
# Step 1 (cont.): Missing values
# Build a per-column flag that is True when the column holds at least one
# null, then keep the names of the flagged columns.
# This cell only identifies the columns; it does not impute or drop anything.
has_missing = df.isnull().any()
missing_columns = df.columns[has_missing]

In [20]:
# Summarise the frame: column names, non-null counts and dtypes.
# NOTE(review): this cell's execution count (20) is out of sequence and its
# saved output already lists `gender_male`, so it was run after the one-hot
# encoding cell further down. Re-run the notebook top to bottom to see the
# schema as it is at this point.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            1000 non-null   int64  
 1   income                         1000 non-null   int64  
 2   daily_hours_physical_activity  1000 non-null   int64  
 3   servings_fruits_veggies        1000 non-null   int64  
 4   BMI                            1000 non-null   float64
 5   self_reported_health_status    1000 non-null   object 
 6   gender_male                    1000 non-null   float64
dtypes: float64(2), int64(4), object(1)
memory usage: 62.5+ KB


In [16]:
# Step 2: Identifying and handling outliers
# Z-score rule on the numeric columns: a row is discarded as soon as any of
# its numeric values has an absolute Z-score above 3.
from scipy import stats
numeric_part = df.select_dtypes(include=np.number)
z_scores = np.abs(stats.zscore(numeric_part))
outliers = (z_scores > 3).any(axis=1)
# Keep only the rows that were not flagged.
df = df[~outliers]

In [17]:
# Step 3: Converting categorical variables
# One-hot encode 'gender'. drop='first' removes the first category, so a
# two-valued column yields a single indicator column.
# The encoder's default sparse result is densified with .toarray() rather than
# requesting dense output through `sparse=False`, a keyword that newer
# scikit-learn releases no longer accept.
onehot_encoder = OneHotEncoder(drop='first')
gender_encoded = onehot_encoder.fit_transform(df[['gender']]).toarray()
# Reuse df's own index: the outlier step may have removed rows, and a fresh
# 0..n-1 index would be misaligned by concat, producing NaN-filled rows.
# NOTE(review): the single column name assumes 'gender' has exactly two
# categories - confirm against the data.
gender_encoded_df = pd.DataFrame(
    gender_encoded,
    columns=['gender_male'],
    index=df.index,
)
df = pd.concat([df, gender_encoded_df], axis=1)
# The text column is now redundant; rebind instead of mutating in place.
df = df.drop(columns=['gender'])

In [19]:
# Print the cleaned dataset
# The label is emitted as plain text; the bare `df.head()` on the last line is
# what the notebook renders as the preview table (first five rows).
print("Cleaned dataset: ")
df.head()

Cleaned dataset:


Unnamed: 0,age,income,daily_hours_physical_activity,servings_fruits_veggies,BMI,self_reported_health_status,gender_male
0,56,49241,2,0,39.469877,Good,1.0
1,46,64569,2,7,36.064939,Fair,0.0
2,32,31745,1,9,36.82977,Poor,0.0
3,60,46029,0,5,38.666252,Very Good,0.0
4,25,33025,0,4,20.329725,Excellent,1.0


Encoding of `gender_male`:

- 1: male
- 0: female

In [21]:
from sklearn.preprocessing import LabelEncoder

# Label Encoding: learn the distinct health-status strings, then replace each
# one with its integer code in a new column.
# NOTE(review): LabelEncoder numbers the classes in sorted order, so the codes
# do not follow the Poor-to-Excellent ranking - confirm this is acceptable for
# downstream use.
label_encoder = LabelEncoder()
health_status_text = df['self_reported_health_status']
label_encoder.fit(health_status_text)
df['encoded_health_status'] = label_encoder.transform(health_status_text)

# Show the label, then preview the frame with the new column
print("Encoded dataset:")
df.head()

Encoded dataset:


Unnamed: 0,age,income,daily_hours_physical_activity,servings_fruits_veggies,BMI,self_reported_health_status,gender_male,encoded_health_status
0,56,49241,2,0,39.469877,Good,1.0,2
1,46,64569,2,7,36.064939,Fair,0.0,1
2,32,31745,1,9,36.82977,Poor,0.0,3
3,60,46029,0,5,38.666252,Very Good,0.0,4
4,25,33025,0,4,20.329725,Excellent,1.0,0


Encoding of `encoded_health_status`:

- 0: Excellent
- 1: Fair
- 2: Good
- 3: Poor
- 4: Very Good

In [22]:
# Check the schema now that `encoded_health_status` has been added alongside
# the original text column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            1000 non-null   int64  
 1   income                         1000 non-null   int64  
 2   daily_hours_physical_activity  1000 non-null   int64  
 3   servings_fruits_veggies        1000 non-null   int64  
 4   BMI                            1000 non-null   float64
 5   self_reported_health_status    1000 non-null   object 
 6   gender_male                    1000 non-null   float64
 7   encoded_health_status          1000 non-null   int32  
dtypes: float64(2), int32(1), int64(4), object(1)
memory usage: 66.4+ KB


In [23]:
# The text version of the health status is no longer needed once its integer
# encoding exists, so remove it from the frame (in place).
health_text_column = 'self_reported_health_status'
df.drop(labels=[health_text_column], axis=1, inplace=True)

In [24]:
# Confirm that `self_reported_health_status` is gone and only numeric columns
# remain.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            1000 non-null   int64  
 1   income                         1000 non-null   int64  
 2   daily_hours_physical_activity  1000 non-null   int64  
 3   servings_fruits_veggies        1000 non-null   int64  
 4   BMI                            1000 non-null   float64
 5   gender_male                    1000 non-null   float64
 6   encoded_health_status          1000 non-null   int32  
dtypes: float64(2), int32(1), int64(4)
memory usage: 58.6 KB


In [26]:
from scipy import stats

# Second outlier pass, run after the categorical columns were encoded.
# Z-scores are only meaningful for continuous measurements, so the 0/1 gender
# indicator and the integer health-status codes are excluded. If they were
# scored, every row of a rare category could exceed the threshold and be
# discarded as an "outlier".
encoded_columns = ['gender_male', 'encoded_health_status']
continuous_df = (
    df.select_dtypes(include=np.number)
    .drop(columns=encoded_columns, errors='ignore')  # tolerate absent columns
)

# Calculate Z-scores for the continuous numerical columns
z_scores = stats.zscore(continuous_df)

# Define a threshold for identifying outliers (e.g., |Z-score| > 3)
threshold = 3

# Identify outliers: True wherever a value's absolute Z-score exceeds the threshold
outliers = np.abs(z_scores) > threshold

# Remove rows containing at least one outlier; `df` itself is left unchanged
cleaned_df = df[~outliers.any(axis=1)]

# Print the cleaned dataset
print("Cleaned dataset after removing outliers:")
cleaned_df.head()

Cleaned dataset after removing outliers:


Unnamed: 0,age,income,daily_hours_physical_activity,servings_fruits_veggies,BMI,gender_male,encoded_health_status
0,56,49241,2,0,39.469877,1.0,2
1,46,64569,2,7,36.064939,0.0,1
2,32,31745,1,9,36.82977,0.0,3
3,60,46029,0,5,38.666252,0.0,4
4,25,33025,0,4,20.329725,1.0,0


In [27]:
# Write the frame to 'dataset_new.csv' without the index column.
# NOTE(review): this exports `df`, not the `cleaned_df` built by the preceding
# outlier-removal cell - confirm which frame is meant to be saved.
df.to_csv('dataset_new.csv', index=False)