# Early Liver Disease Detection using AI
**Project solution**

## Step 1: Analyze the dataset

### Load the dataset

In [None]:
# Import pandas library for data manipulation
import pandas as pd

In [None]:
# Read the Indian Liver Patient Dataset into a DataFrame.
# The path is relative, so the CSV must be reachable from the directory
# the notebook is run from.
df = pd.read_csv(filepath_or_buffer="indian_liver_patient_dataset.csv")

# Reaching this line means read_csv did not raise.
print("Dataset loaded successfully!")

### Print total number of rows

In [None]:
# Count the records in the dataset.
# The length of a DataFrame is its number of rows.
total_rows = len(df)

# Report the row count.
print(f"Total number of rows: {total_rows}")

### Print top 5 rows

In [None]:
# Preview the first five records to get a feel for the columns and
# the kind of values they hold.
print("Top 5 rows of the dataset:")
df.head(n=5)

In [None]:
# Count the missing values in each column.
df.isna().sum()

## Step 2: Identify the features and target columns

In [None]:
### Identify the target (y) column

# The target is the column the model should predict.
# Print every column name first so the target can be picked out by eye.
print("All columns in the dataset:", df.columns.tolist(), sep="\n")

In [None]:
# Use the final column of the DataFrame as the prediction target.
# NOTE(review): this relies on the label being the last column of the CSV;
# confirm against the column list printed above.
target_column = df.columns[-1]

# Pull the target values out as their own Series.
y = df.loc[:, target_column]

In [None]:
# Show which column was picked as the target and the distinct values it takes.
print(
    f"\nTarget (y) column identified: '{target_column}'",
    f"Target column values: {y.unique()}",
    sep="\n",
)

In [None]:
### Identify the features (X) columns

# Every column other than the target becomes a model input.
X = df.drop(labels=[target_column], axis="columns")

# Summarize the resulting feature set: how many columns and which ones.
print(
    f"Features (X) columns identified:",
    f"Number of feature columns: {X.shape[1]}",
    f"Feature column names: {X.columns.tolist()}",
    sep="\n",
)

### Split the data into test and train sets

In [None]:
# Import train_test_split function from sklearn
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state=42 fixes the shuffle so the split is reproducible.
# stratify=y keeps the proportion of each target class the same in the
# training and testing subsets, so the per-class metrics computed later
# (sensitivity, specificity) are measured on a representative class mix
# rather than on whatever mix a purely random split happens to produce.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,      # 20% of data for testing
    random_state=42,    # For reproducible results
    stratify=y,         # Preserve class proportions in both subsets
)

In [None]:
# Report how many rows ended up in each subset, plus the full
# (rows, columns) shape of the two feature matrices.
print(
    f"Training set size: {X_train.shape[0]} rows",
    f"Testing set size: {X_test.shape[0]} rows",
    f"Training features shape: {X_train.shape}",
    f"Testing features shape: {X_test.shape}",
    sep="\n",
)

## Step 3: Choose the classifier and train the model

In [None]:
### Choose the classifier

# Bring the LogisticRegression estimator into scope.
from sklearn.linear_model import LogisticRegression

# State the choice and the reasoning behind it: the task is a two-class
# prediction problem, and the fitted model is easy to interpret.
print(
    "Classifier chosen: Logistic Regression",
    "Reason: Good for binary classification problems and provides interpretable results",
    sep="\n",
)

### Create the model

In [None]:
# Create the model: a preprocessing pipeline that ends in LogisticRegression.
#
# LogisticRegression cannot be fitted on missing values or on text columns,
# and its default solver converges poorly on features with very different
# scales. Wrapping it in a Pipeline lets every preprocessing step be learned
# from the training data only (no leakage from the test set) while `model`
# keeps the usual fit()/predict() interface used by the following cells.
#
# NOTE(review): the CSV's column dtypes are not visible from this notebook.
# Each branch below only touches the column types it selects, so a branch
# whose column type does not occur in the data simply has nothing to do.
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric columns: fill gaps with the column median, then standardize.
numeric_steps = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Non-numeric columns: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during training are encoded as all zeros.
categorical_steps = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Route each column to the matching branch based on its dtype.
preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_steps, make_column_selector(dtype_include="number")),
        ("categorical", categorical_steps, make_column_selector(dtype_exclude="number")),
    ]
)

# The classifier itself, with scikit-learn's default settings.
classifier = LogisticRegression()

model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("classifier", classifier),
    ]
)

# Display confirmation that model is created, along with the settings of
# the logistic-regression step.
print("Logistic Regression model created successfully!")
print(f"Model parameters: {classifier.get_params()}")

### Train the model

In [None]:
# Fit the model: learn from the training features (X_train) together with
# their known labels (y_train).
model.fit(X=X_train, y=y_train)

# Reaching this line means fit() finished without raising.
print(
    "Model training completed successfully!",
    "The model has learned patterns from the training data",
    sep="\n",
)

## Step 4: Evaluate the model

In [None]:
### Print the accuracy score

# Bring the accuracy metric into scope.
from sklearn.metrics import accuracy_score

# Predict a label for every row of the held-out test set.
y_pred = model.predict(X_test)

# Accuracy is the fraction of test rows whose predicted label matches the
# true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report it both as a fraction and as a percentage.
print(
    f"Accuracy Score: {accuracy:.4f}",
    f"Accuracy Percentage: {accuracy * 100:.2f}%",
    sep="\n",
)

In [None]:
### Print the sensitivity score

# recall_score computes recall for whichever class is named as positive.
from sklearn.metrics import recall_score

# Sensitivity (true positive rate) = TP / (TP + FN): the share of actual
# positive cases that the model also predicted as positive.
# Label 1 is treated as the positive class (liver disease patients).
sensitivity = recall_score(y_true=y_test, y_pred=y_pred, pos_label=1)

In [None]:
# Report sensitivity as a fraction and as a percentage, with a one-line
# reminder of what the metric means.
print(
    f"Sensitivity Score: {sensitivity:.4f}",
    f"Sensitivity Percentage: {sensitivity * 100:.2f}%",
    "Sensitivity measures the model's ability to correctly identify liver disease patients",
    sep="\n",
)

In [None]:
### Print the specificity score

# Specificity (True Negative Rate) = TN / (TN + FP)
# It measures how well the model identifies negative cases (healthy patients).
# Specificity is the recall of the negative class, so recall_score is called
# with the negative label passed as pos_label.
#
# The negative label is taken from the target values rather than hard-coded:
# recall_score raises if pos_label is not one of the labels in the data, so a
# fixed value only works for one particular label coding. With label 1 as the
# positive class (as in the sensitivity calculation), the negative class is
# the single remaining label.
positive_label = 1
negative_candidates = [label for label in y.unique() if label != positive_label]

if len(negative_candidates) != 1:
    # Specificity as defined above only makes sense for a two-class target.
    raise ValueError(
        "Specificity needs exactly one negative class besides label "
        f"{positive_label!r}; found {negative_candidates!r}"
    )

negative_label = negative_candidates[0]
specificity = recall_score(y_test, y_pred, pos_label=negative_label)

In [None]:
# Report specificity as a fraction and as a percentage, with a one-line
# reminder of what the metric means.
print(
    f"Specificity Score: {specificity:.4f}",
    f"Specificity Percentage: {specificity * 100:.2f}%",
    "Specificity measures the model's ability to correctly identify healthy patients",
    sep="\n",
)