In [None]:
# Activity 2 - Dataset Preparation

# 📌 Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# 📌 Step 2: Load the Dataset
# Location of the raw liver CSV, relative to this notebook — adjust path as needed
RAW_CSV_PATH = '../data/liver_dataset.csv'
df = pd.read_csv(filepath_or_buffer=RAW_CSV_PATH)

# Peek at the top five records
df.head(5)


Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [3]:
# 📌 Step 3: Understand Data Structure
# Prints the index range, every column with its non-null count and dtype,
# and the frame's memory usage. Columns whose non-null count is below the
# total row count contain missing values (handled in the following steps).
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         583 non-null    int64  
 1   Gender                      583 non-null    object 
 2   Total_Bilirubin             583 non-null    float64
 3   Direct_Bilirubin            583 non-null    float64
 4   Alkaline_Phosphotase        583 non-null    int64  
 5   Alamine_Aminotransferase    583 non-null    int64  
 6   Aspartate_Aminotransferase  583 non-null    int64  
 7   Total_Protiens              583 non-null    float64
 8   Albumin                     583 non-null    float64
 9   Albumin_and_Globulin_Ratio  579 non-null    float64
 10  Dataset                     583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


In [6]:
# 📌 Step 4: Check for Missing Values
# Per-column tally of NaN entries (isna is the canonical name for isnull)
df.isna().sum()


Unnamed: 0,0
Age,0
Gender,0
Total_Bilirubin,0
Direct_Bilirubin,0
Alkaline_Phosphotase,0
Alamine_Aminotransferase,0
Aspartate_Aminotransferase,0
Total_Protiens,0
Albumin,0
Albumin_and_Globulin_Ratio,4


In [7]:
# 📌 Step 5: Handle Missing Values
# Simplest strategy: discard every row that has at least one NaN.
# The defaults (row-wise, how='any') are spelled out so the intent is explicit.
df.dropna(axis=0, how='any', inplace=True)

# Every per-column NaN count should now be zero
df.isna().sum()


Unnamed: 0,0
Age,0
Gender,0
Total_Bilirubin,0
Direct_Bilirubin,0
Alkaline_Phosphotase,0
Alamine_Aminotransferase,0
Aspartate_Aminotransferase,0
Total_Protiens,0
Albumin,0
Albumin_and_Globulin_Ratio,0


In [8]:
# 📌 Step 6: Encode 'Gender' Column
# Binary encoding of the raw string labels: Male -> 1, Female -> 0.
GENDER_CODES = {'Male': 1, 'Female': 0}

# Only map while the column still holds the raw labels. Series.map turns every
# value that is missing from the dict into NaN, so re-running this cell on the
# already-encoded 0/1 column would otherwise silently replace it with NaN.
if not pd.api.types.is_numeric_dtype(df['Gender']):
    encoded_gender = df['Gender'].map(GENDER_CODES)

    # Fail loudly on labels the mapping does not know (e.g. different casing)
    # instead of letting NaN flow into scaling and the saved CSV.
    unmapped_labels = df.loc[encoded_gender.isna(), 'Gender'].unique()
    if len(unmapped_labels) > 0:
        raise ValueError(f"Unexpected Gender labels: {list(unmapped_labels)}")

    df['Gender'] = encoded_gender

# Double-check unique values (expected: only 0 and 1)
df['Gender'].unique()


array([0, 1])

In [9]:
# 📌 Step 7: Standardize Target Column (1 → Cirrhosis, 2 → Healthy)
# Recode the raw labels to a conventional binary target:
#   1 stays 1 (Cirrhosis), 2 becomes 0 (No Cirrhosis).
# replace() leaves values that are not listed untouched, so re-running this
# cell on the already-recoded 0/1 column is a no-op. The previous
# map({1: 1, 2: 0}) turned every 0 into NaN on a second run.
df['Dataset'] = df['Dataset'].replace({2: 0})

# Guard against labels other than the expected ones slipping through.
assert df['Dataset'].isin([0, 1]).all(), "Unexpected values in 'Dataset' column"

# Class balance after recoding
df['Dataset'].value_counts()


Unnamed: 0_level_0,count
Dataset,Unnamed: 1_level_1
1,414
0,165


In [10]:
# 📌 Step 8: Split Features and Target
# y is the label column; X is every remaining column.
y = df['Dataset']
X = df.drop(columns='Dataset')

# Report the dimensions of both pieces
print("Features Shape:", X.shape)
print("Target Shape:", y.shape)


Features Shape: (579, 10)
Target Shape: (579,)


In [11]:
# 📌 Step 8 (repeated): Split Features and Target
# NOTE(review): this cell performs the same split as the preceding Step 8 cell
# and yields the same X / y; it is redundant and can be removed.
X, y = df.drop('Dataset', axis='columns'), df.loc[:, 'Dataset']

# Show shape
for label, frame_part in (("Features Shape:", X), ("Target Shape:", y)):
    print(label, frame_part.shape)


Features Shape: (579, 10)
Target Shape: (579,)


In [12]:
# 📌 Step 9: Scale Features
# Learn per-column mean / standard deviation from X, then standardize X with them.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Wrap the scaled array in a DataFrame that reuses the feature names, and
# attach the labels positionally (y.values ignores y's index) as 'Target'.
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns).assign(Target=y.values)

# Preview cleaned dataset
X_scaled_df.head(5)


Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Target
0,1.247403,-1.770795,-0.42032,-0.495414,-0.42887,-0.355832,-0.319111,0.293722,0.203446,-0.14739,1
1,1.062306,0.564718,1.218936,1.423518,1.675083,-0.093573,-0.035962,0.939655,0.077462,-0.648461,1
2,1.062306,0.564718,0.640375,0.926017,0.816243,-0.115428,-0.146459,0.478274,0.203446,-0.178707,1
3,0.815511,0.564718,-0.372106,-0.388807,-0.449416,-0.36676,-0.312205,0.293722,0.329431,0.16578,1
4,1.679294,0.564718,0.093956,0.179766,-0.395996,-0.295731,-0.177537,0.755102,-0.930414,-1.713237,1


In [None]:
# 📌 Step 10: Save Cleaned Dataset
# Write the scaled features plus 'Target' to disk without the row index.
CLEANED_CSV_PATH = '../data/liver_dataset_cleaned.csv'
X_scaled_df.to_csv(CLEANED_CSV_PATH, index=False)
print("✅ Cleaned dataset saved successfully as 'liver_dataset_cleaned.csv'")


✅ Cleaned dataset saved successfully as 'liver_dataset_cleaned.csv'
