<a href="https://colab.research.google.com/github/gaciuki2402/PID_Model/blob/main/PID.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [189]:
from google.colab import drive
drive.mount("/content/drive")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Importing libraries**

In [190]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt

**Loading Dataset**

In [191]:
data =pd.read_csv("/content/drive/MyDrive/PID.csv")

In [192]:
data.head()

Unnamed: 0,Patient ID,Age,Race,STDs/UTI History,IUD Use,Past Pelvic Pain,Imaging Results,Abnormal Discharge,Fever Onset,Irregular Periods,Dyspareunia,Dysuria,WBC Count,ESR,CRP Level,PID Diagnosis
0,P001,25,White,No,Yes,No,Normal (US),,5/10/2025,No,Yes,No,Normal,Normal,Normal,Negative
1,P002,30,Black,Yes (2 years ago),No,Yes,Adnexal tenderness (US),Yellow discharge,4/20/2025,Yes,Yes,Yes,Elevated,Elevated,Slightly Elevated,Positive
2,P003,35,Asian,No,Yes,Yes,Pelvic adhesions (US),"Brown discharge, heavy bleeding",,Irregular,Yes,Yes,High,High,High,Positive
3,P004,28,Hispanic,No,No,No,Normal (US),"White, clumpy discharge, itching",,Regular,No,No,Normal,Normal,Normal,Negative
4,P005,33,Indigenous,Yes (current),Yes,No,Normal (US),,,Regular,No,No,Normal,Normal,Normal,Negative


**Data Cleaning **

In [193]:
data.drop(columns=["Patient ID"], inplace=True)
data.drop(columns=["Race"], inplace=True)
data.drop(columns=["Fever Onset"],inplace=True)

In [194]:
data.tail()

Unnamed: 0,Age,STDs/UTI History,IUD Use,Past Pelvic Pain,Imaging Results,Abnormal Discharge,Irregular Periods,Dyspareunia,Dysuria,WBC Count,ESR,CRP Level,PID Diagnosis
514,30,Chlamydia,Yes (3 years ago),Yes (3 years ago),Adnexal tenderness (US),"Yellowish discharge, itching",Irregular (30-35 days),Yes,Yes,Elevated,Elevated,Slightly Elevated,Positive
515,28,No,No,No,Normal (US),,Regular (28 days),No,No,Normal,Normal,Normal,Negative
516,32,No,No,Yes (2 years ago),Pelvic adhesions (US),"Yellowish discharge, odor",Irregular (30-35 days),Yes,No,Elevated,Elevated,Slightly Elevated,Positive
517,25,Chlamydia,Yes (1 year ago),Yes (1 year ago),Adnexal tenderness (US),"Yellow discharge, pain",Irregular (30-35 days),Yes,Yes,Elevated,Elevated,Slightly Elevated,Positive
518,34,No,No,Yes (1 year ago),Normal (US),"Greyish discharge, itching",Regular (28 days),Yes,No,Elevated,Elevated,Slightly Elevated,Positive


**Define Features and Target**

In [195]:
categorical_features = ["STDs/UTI History", "IUD Use", "Past Pelvic Pain", "Imaging Results", "Abnormal Discharge", "Irregular Periods", "Dyspareunia", "Dysuria", "PID Diagnosis"]
numerical_features = ["Age","WBC Count", "ESR", "CRP Level",]

x = data.drop(columns=["PID Diagnosis"])
y = data["PID Diagnosis"].map({"Positive":1, "Negative":0})


In [196]:
print(x)

     Age   STDs/UTI History            IUD Use   Past Pelvic Pain  \
0     25                 No                Yes                 No   
1     30  Yes (2 years ago)                 No                Yes   
2     35                 No                Yes                Yes   
3     28                 No                 No                 No   
4     33      Yes (current)                Yes                 No   
..   ...                ...                ...                ...   
514   30          Chlamydia  Yes (3 years ago)  Yes (3 years ago)   
515   28                 No                 No                 No   
516   32                 No                 No  Yes (2 years ago)   
517   25          Chlamydia   Yes (1 year ago)   Yes (1 year ago)   
518   34                 No                 No   Yes (1 year ago)   

             Imaging Results                Abnormal Discharge  \
0                Normal (US)                               NaN   
1    Adnexal tenderness (US)           

In [197]:
print(y)

0      0
1      1
2      1
3      0
4      0
      ..
514    1
515    0
516    1
517    1
518    1
Name: PID Diagnosis, Length: 519, dtype: int64


**Define Goals and Metrics**

In [198]:
# metrics comparison
metrics = {
    "accuracy":accuracy_score,
    "precision": lambda y_true, y_pred: precision_score(y_true, y_pred, pos_label=1),
    "recall": lambda y_true, y_pred: recall_score(y_true, y_pred, pos_label=1),
    "f1": lambda y_true, y_pred: f1_score(y_true, y_pred, pos_label=1),
    "roc_auc": lambda y_true, y_pred: roc_auc_score(y_true, y_pred)

}

**Split the Data**

In [199]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

**Preprocessing Pipelines**

In [200]:
# categorical data preprocessing
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])
# This pipeline for categorical data first imputes missing values with the string 'missing'
# and then applies one-hot encoding to convert categorical features into numerical format.

In [201]:
# Numerical Data Preprocessing
numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])
# This pipeline for numerical data first imputes missing values with the mean of the column
# and then scales the features to have a mean of 0
# and a standard deviation of 1.

**Combine Preprocessing Steps**

In [202]:
Preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)
# The ColumnTransformer combines both preprocessing pipelines,
# applying the appropriate transformations to the respective feature types.

**Evaluate Each Model**

In [205]:
models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42)
}

results = {}
confusion_matrices = {}
for model_name, model in models.items():
  pipeline = Pipeline(steps=[
      ("preprocessor", Preprocessor),
      ('classifier', models)
  ])

In [207]:
# Train the model
pipeline.fit(x_train, y_train)
results[model_name] = pipeline.score(x_test, y_test)

# predict on test data
y_pred = pipeline.predict(x_test)

# collect metrics
# results[model_name] = {metric_name: metrics(y_test, y_pred) for metrics in metrics.items()}


TypeError: Last step of Pipeline should implement fit or be the string 'passthrough'. '{'Logistic Regression': LogisticRegression(random_state=42), 'Decision Tree': DecisionTreeClassifier(random_state=42), 'Random Forest': RandomForestClassifier(random_state=42)}' (type <class 'dict'>) doesn't