In [3]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('jaundice_risk_dataset.csv')

# Encode categorical columns (Blood Type, Baby Feeding Frequency, Diaper Wetting Frequency, Race, Jaundice Risk)
label_encoder = LabelEncoder()

# Encode categorical columns for input features
df["Blood Type of Baby"] = label_encoder.fit_transform(df["Blood Type of Baby"])
df["Blood Type of Mother"] = label_encoder.fit_transform(df["Blood Type of Mother"])
df["Baby Feeding Frequency"] = label_encoder.fit_transform(df["Baby Feeding Frequency"])
df["Diaper Wetting Frequency"] = label_encoder.fit_transform(df["Diaper Wetting Frequency"])
df["Race"] = label_encoder.fit_transform(df["Race"])

# Encode target column (Jaundice Risk)
df["Jaundice Risk"] = label_encoder.fit_transform(df["Jaundice Risk"])

# Separate features (X) and target (y)
X = df.drop("Jaundice Risk", axis=1)  # Features
y = df["Jaundice Risk"]  # Target

# Split the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

joblib.dump(rf_model, 'RandomForest_Model.pkl')

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Classification Report (includes precision, recall, f1-score)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature Importances
print("\nFeature Importances:")
importances = rf_model.feature_importances_
for feature, importance in zip(X.columns, importances):
    print(f"{feature}: {importance:.4f}")
    

Accuracy: 66.00%

Classification Report:
              precision    recall  f1-score   support

           0       0.33      0.11      0.17         9
           1       0.83      0.90      0.87       105
           2       0.43      0.47      0.45        55
           3       0.43      0.33      0.38        30
           4       0.00      0.00      0.00         1

    accuracy                           0.66       200
   macro avg       0.41      0.36      0.37       200
weighted avg       0.64      0.66      0.64       200


Feature Importances:
Weight (kg): 0.2387
Gestational Age (weeks): 0.1564
Family History (1=yes, 0=no): 0.0572
Blood Type of Baby: 0.0996
Blood Type of Mother: 0.1106
Baby Feeding Frequency: 0.0718
Diaper Wetting Frequency: 0.0836
Bruises (1=yes, 0=no): 0.0674
Race: 0.1148


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
