<a href="https://colab.research.google.com/github/chidambarambaskaran/MachineLearning/blob/main/Untitled24.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
# The dataset contains patient records for cirrhosis disease progression
# Each row represents a patient with various features and the target variable

df = pd.read_csv("cirrhosis.csv")

# Drop irrelevant columns
# 'ID' is just a unique identifier, 'N_Days' represents days of follow-up (not needed for prediction)
df.drop(columns=['ID', 'N_Days'], inplace=True)

# Handling missing values
# Fill categorical columns with the mode (most frequent value)
for col in df.select_dtypes(include=['object']).columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Fill numerical columns with the median value to prevent skewness
for col in df.select_dtypes(include=['number']).columns:
    df[col].fillna(df[col].median(), inplace=True)

# Encode categorical variables
# Convert categorical string values into numerical values using Label Encoding
label_encoders = {}
for col in ['Status', 'Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store encoders for later use

# Create a binary target variable 'Advanced_Cirrhosis'
# If 'Stage' >= 3 or 'Status' indicates the patient died (Status=1), label it as Advanced Cirrhosis (1)
# Otherwise, label it as Early/Moderate Cirrhosis (0)
df['Advanced_Cirrhosis'] = np.where(
    (df['Stage'] >= 3) | (df['Status'] == 1),  # Condition for advanced cirrhosis
    1,  # Label as Advanced Cirrhosis
    0   # Label as Early/Moderate Cirrhosis
)

# Define features (X) and target variable (y)
# Drop 'Stage' as it was used to define the target variable
df.drop(columns=['Stage'], inplace=True)
X = df.drop(columns=['Advanced_Cirrhosis'])
y = df['Advanced_Cirrhosis']

# Split data into training (80%) and testing (20%) sets with stratification to maintain class balance
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize numerical features
# This ensures all numerical features have mean 0 and standard deviation 1
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train and evaluate multiple machine learning models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "SVM": SVC(kernel='linear'),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(eval_metric='logloss')  # Removed use_label_encoder=False
}

# Train each model and evaluate performance
for name, model in models.items():
    model.fit(X_train, y_train)  # Train the model
    y_pred = model.predict(X_test)  # Predict on test data

    # Display results
    print(f"\n🔹 Model: {name}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(classification_report(y_test, y_pred, zero_division=1))  # Fixes warning

# Function to allow user to predict cirrhosis severity for a selected range of rows
def predict_range():
    total_rows = len(df)  # Get total number of rows
    print(f"\n🔹 Total number of rows in dataset: {total_rows}")

    while True:
        try:
            # User inputs range of rows (e.g., "1-20")
            user_input = input("\nEnter row range (e.g., 1-20 or 33-54): ").strip()
            start, end = map(int, user_input.split('-'))

            # Validate range input
            if start < 1 or end > total_rows or start > end:
                print("❌ Invalid range. Please enter a valid range within dataset limits.")
                continue

            # Limit the number of rows for performance
            row_count = end - start + 1
            if row_count > 30:
                print("⚠️ Maximum limit is 30 rows. Please enter a smaller range.")
                continue

            break  # Exit loop if valid input

        except ValueError:
            print("❌ Invalid input format. Enter range like: 1-20")

    # Select data for prediction
    selected_data = X.iloc[start-1:end]  # Adjust index (1-based to 0-based)
    selected_labels = y.iloc[start-1:end]  # Get original labels
    selected_data_scaled = scaler.transform(selected_data)  # Standardize numerical features

    # Perform predictions
    print(f"\n🔹 Predicting for rows {start} to {end}:")
    for index, (row, original_value) in enumerate(zip(selected_data_scaled, selected_labels), start=start):
        print(f"\n📌 Row {index} Data: {X.iloc[index-1].to_dict()}")
        print(f"🔵 Original (Actual) Value: {'Advanced Cirrhosis' if original_value == 1 else 'Early/Moderate Cirrhosis'}")

        for name, model in models.items():
            prediction = model.predict([row])[0]
            result = "Advanced Cirrhosis" if prediction == 1 else "Early/Moderate Cirrhosis"
            correctness = "✅ Correct" if prediction == original_value else "❌ Incorrect"
            print(f"{name}: {result} ({correctness})")

# Run prediction function
predict_range()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)



🔹 Model: Logistic Regression
Accuracy: 0.7024
              precision    recall  f1-score   support

           0       0.40      0.27      0.32        22
           1       0.77      0.85      0.81        62

    accuracy                           0.70        84
   macro avg       0.58      0.56      0.57        84
weighted avg       0.67      0.70      0.68        84


🔹 Model: Random Forest
Accuracy: 0.7262
              precision    recall  f1-score   support

           0       0.43      0.14      0.21        22
           1       0.75      0.94      0.83        62

    accuracy                           0.73        84
   macro avg       0.59      0.54      0.52        84
weighted avg       0.67      0.73      0.67        84


🔹 Model: SVM
Accuracy: 0.6905
              precision    recall  f1-score   support

           0       0.33      0.18      0.24        22
           1       0.75      0.87      0.81        62

    accuracy                           0.69        84
   macro 