In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import zscore

# Load dataset
df = pd.read_csv("C:/Users/UMESH/Downloads/hepatitis_csv.csv")  # Replace with your actual path if different
df.head()

# Step q: Data Cleaning
df.replace('?', np.nan, inplace=True)
df.dropna(inplace=True)

# Convert to correct data types
df = df.apply(pd.to_numeric, errors='ignore')

# Remove negative values in numerical columns
num_cols = df.select_dtypes(include=[np.number]).columns
df = df[(df[num_cols] >= 0).all(axis=1)]

# Step r: Outlier Removal using Z-score
z_scores = df[num_cols].apply(zscore)
df = df[(np.abs(z_scores) < 3).all(axis=1)]

# Step s: Data Transformation (Label Encoding for categorical features)
le = LabelEncoder()
df['class'] = le.fit_transform(df['class'])  # Convert LIVE/DIE to 1/0

# Encode other categorical columns
cat_cols = df.select_dtypes(include='object').columns
for col in cat_cols:
    df[col] = le.fit_transform(df[col])

# Feature and target separation
X = df.drop('class', axis=1)
y = df['class']

# Scale numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step t: Split data and build models
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 1. Logistic Regression
log_model = LogisticRegression()
log_model.fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)
acc_log = accuracy_score(y_test, y_pred_log)

# 2. Naive Bayes
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)
acc_nb = accuracy_score(y_test, y_pred_nb)

# Print results
print("Logistic Regression Accuracy:", acc_log)
print("Naive Bayes Accuracy:", acc_nb)
print("\nClassification Report (Logistic Regression):\n", classification_report(y_test, y_pred_log))
print("\nClassification Report (Naive Bayes):\n", classification_report(y_test, y_pred_nb))


Logistic Regression Accuracy: 0.8
Naive Bayes Accuracy: 0.9333333333333333

Classification Report (Logistic Regression):
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.80      1.00      0.89        12

    accuracy                           0.80        15
   macro avg       0.40      0.50      0.44        15
weighted avg       0.64      0.80      0.71        15


Classification Report (Naive Bayes):
               precision    recall  f1-score   support

           0       0.75      1.00      0.86         3
           1       1.00      0.92      0.96        12

    accuracy                           0.93        15
   macro avg       0.88      0.96      0.91        15
weighted avg       0.95      0.93      0.94        15



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
