In [18]:
import pandas as pd

df = pd.read_csv("parsed_pcos.csv")
print(df.shape)
df.head()
df.info()

(541, 46)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541 entries, 0 to 540
Data columns (total 46 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              541 non-null    int64  
 1   Sl. No                  541 non-null    int64  
 2   Patient File No.        541 non-null    int64  
 3   PCOS (Y/N)              541 non-null    int64  
 4    Age (yrs)              541 non-null    int64  
 5   Weight (Kg)             541 non-null    float64
 6   Height(Cm)              541 non-null    float64
 7   BMI                     541 non-null    float64
 8   Blood Group             541 non-null    int64  
 9   Pulse rate(bpm)         541 non-null    int64  
 10  RR (breaths/min)        541 non-null    int64  
 11  Hb(g/dl)                541 non-null    float64
 12  Cycle(R/I)              541 non-null    int64  
 13  Cycle length(days)      541 non-null    int64  
 14  Marraige Status (Yrs)   540 non-

In [19]:
DROP_COLS = [
    "Patient File No.",
    "Unnamed: 0" , # if exists,
    "Sl. No"
]

df = df.drop(columns=[c for c in DROP_COLS if c in df.columns])


In [20]:
TARGET = "PCOS (Y/N)"
df[TARGET].value_counts()


PCOS (Y/N)
0    364
1    177
Name: count, dtype: int64

In [21]:
df[TARGET] = df[TARGET].astype(int)


In [22]:
from sklearn.impute import SimpleImputer

num_cols = df.select_dtypes(include=["int64", "float64"]).columns
cat_cols = df.select_dtypes(include=["object"]).columns

df[num_cols] = SimpleImputer(strategy="median").fit_transform(df[num_cols])
df[cat_cols] = SimpleImputer(strategy="most_frequent").fit_transform(df[cat_cols])


In [23]:
df = pd.get_dummies(df, drop_first=True)


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

X = df.drop(columns=[TARGET])
y = df[TARGET]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

model = RandomForestClassifier(
    n_estimators=300,
    max_depth=8,
    random_state=42,
    class_weight="balanced"
)

model.fit(X_train, y_train)


0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,8
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [43]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

         0.0       0.89      0.96      0.92        73
         1.0       0.90      0.75      0.82        36

    accuracy                           0.89       109
   macro avg       0.89      0.85      0.87       109
weighted avg       0.89      0.89      0.89       109



In [None]:
import numpy as np

importance_df = pd.DataFrame({
    "feature": X.columns,
    "importance": model.feature_importances_
}).sort_values(by="importance", ascending=False)

importance_df.head(15)


ValueError: All arrays must be of the same length

In [55]:
TOP_K = 15
top_features = importance_df.head(TOP_K)["feature"].tolist()

X = X[top_features]
X.head()


Unnamed: 0,Follicle No. (R),Follicle No. (L),hair growth(Y/N),Skin darkening (Y/N),Weight gain(Y/N),Fast food (Y/N),Cycle length(days),Cycle(R/I),BMI,Weight (Kg),Hip(inch),FSH/LH,FSH(mIU/mL),LH(mIU/mL),Endometrium (mm)
0,3.0,3.0,0.0,0.0,0.0,1.0,5.0,2.0,19.3,44.6,36.0,2.16,7.95,3.68,8.5
1,5.0,3.0,0.0,0.0,0.0,0.0,5.0,2.0,24.9,65.0,38.0,6.17,6.73,1.09,3.7
2,15.0,13.0,0.0,0.0,0.0,1.0,5.0,2.0,25.3,68.8,40.0,6.3,5.54,0.88,10.0
3,2.0,2.0,0.0,0.0,0.0,0.0,5.0,2.0,29.7,65.0,42.0,3.42,8.06,2.36,7.5
4,4.0,3.0,0.0,0.0,0.0,0.0,5.0,2.0,20.1,52.0,37.0,4.42,3.98,0.9,7.0


In [67]:
columns_to_drop = ['Follicle No. (R)', 'Follicle No. (L)', 'FSH/LH', 'FSH(mIU/mL)', 'LH(mIU/mL)', 'Endometrium (mm)']

# X = X.drop(columns=columns_to_drop)
X.head()


Unnamed: 0,hair growth(Y/N),Skin darkening (Y/N),Weight gain(Y/N),Fast food (Y/N),Cycle length(days),Cycle(R/I),BMI,Weight (Kg),Hip(inch)
0,0.0,0.0,0.0,1.0,5.0,2.0,19.3,44.6,36.0
1,0.0,0.0,0.0,0.0,5.0,2.0,24.9,65.0,38.0
2,0.0,0.0,0.0,1.0,5.0,2.0,25.3,68.8,40.0
3,0.0,0.0,0.0,0.0,5.0,2.0,29.7,65.0,42.0
4,0.0,0.0,0.0,0.0,5.0,2.0,20.1,52.0,37.0


In [60]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(
    sampling_strategy={0: 1500, 1: 1500},
    random_state=42
)

X_resampled, y_resampled = smote.fit_resample(X, y)


In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=500,
    max_depth=10,
    class_weight={0: 2.5, 1: 1},  # penalize false PCOS
    random_state=42
)

SyntaxError: invalid syntax (1231635601.py, line 9)

In [65]:
# Split BEFORE SMOTE
X_train, X_test, y_train, y_test = train_test_split(
    X_original,
    y_original,
    test_size=0.2,
    stratify=y_original,
    random_state=42
)


NameError: name 'X_original' is not defined