In [None]:
!pip install -q scikit-learn pandas numpy joblib

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import joblib


In [None]:
# Seed NumPy's global generator at the top of the cell so that re-running
# the cell reproduces the same synthetic cohort.
np.random.seed(42)
n = 3000  # number of synthetic patients


def _draw_codes(probs):
    """Sample n integer codes 0..len(probs)-1 with the given probabilities."""
    return np.random.choice(list(range(len(probs))), n, p=probs)


# NOTE: every draw below consumes the shared global random stream, so the
# order of these statements determines the generated values.

# 1) age: integers from 18 to 70 (the upper bound of randint is exclusive)
age = np.random.randint(18, 71, n)

# 2) heavy_bleeding_level: 0=normal, 1=moderately heavy, 2=very heavy
heavy_bleeding_level = _draw_codes([0.4, 0.35, 0.25])

# 3) pain_pattern: 0=no pain, 1=pain only during periods, 2=pain most days
pain_pattern = _draw_codes([0.4, 0.35, 0.25])

# 4) family_history: 0/1
family_history = _draw_codes([0.8, 0.2])

# 5) bleed_between: 0/1
bleed_between = _draw_codes([0.75, 0.25])

# 6) period_length_cat: 0=1–3 days, 1=4–7 days, 2=>7 days
period_length_cat = _draw_codes([0.2, 0.6, 0.2])

# 7) post_menopause_bleed: 15% chance when age >= 45, always 0 below that
post_menopause_draw = _draw_codes([0.85, 0.15])
post_menopause_bleed = np.where(age >= 45, post_menopause_draw, 0)

# 8) abdominal_swelling: 0/1
abdominal_swelling = _draw_codes([0.7, 0.3])

# 9) anemia_symptoms: 60% chance with very heavy bleeding, 20% otherwise.
#    Both candidate vectors are drawn for all n rows (very-heavy case first),
#    then the appropriate one is picked per row.
anemia_if_very_heavy = _draw_codes([0.4, 0.6])
anemia_otherwise = _draw_codes([0.8, 0.2])
anemia_symptoms = np.where(
    heavy_bleeding_level == 2, anemia_if_very_heavy, anemia_otherwise
)

# 10) urinary_frequency: 0/1
urinary_frequency = _draw_codes([0.7, 0.3])

# 11) constipation: 0/1
constipation = _draw_codes([0.7, 0.3])

# combine to DataFrame in EXACT order used in app.py
# (the list order below is the resulting column order)
feature_columns = [
    ("age", age),
    ("heavy_bleeding_level", heavy_bleeding_level),
    ("pain_pattern", pain_pattern),
    ("family_history", family_history),
    ("bleed_between", bleed_between),
    ("period_length_cat", period_length_cat),
    ("post_menopause_bleed", post_menopause_bleed),
    ("abdominal_swelling", abdominal_swelling),
    ("anemia_symptoms", anemia_symptoms),
    ("urinary_frequency", urinary_frequency),
    ("constipation", constipation),
]
df = pd.DataFrame(dict(feature_columns))

# ------- create synthetic risk label (y) -------
# Each entry pairs a feature term with the points it contributes to the
# additive risk score (weights can be tweaked). Boolean terms count as 0/1;
# ordinal codes (0/1/2) are multiplied by their weight.
score_terms = [
    (age >= 45, 1),
    (age >= 50, 1),                 # stacks with the >=45 point, so 50+ gets 2
    (heavy_bleeding_level, 2),      # more points if heavier bleeding
    (pain_pattern, 1),              # chronic pain adds risk
    (family_history, 2),
    (bleed_between, 3),
    (period_length_cat == 2, 3),    # periods >7 days
    (post_menopause_bleed, 6),      # big red flag
    (abdominal_swelling, 3),
    (anemia_symptoms, 3),
    (urinary_frequency, 1),
    (constipation, 1),
]

# accumulate into a float vector that starts at 0 for every patient
score = np.zeros(n)
for term, weight in score_terms:
    score += term * weight

# convert score to binary label: high score = 1 (high risk), else 0
# threshold chosen empirically; you can adjust
HIGH_RISK_THRESHOLD = 8
y = (score >= HIGH_RISK_THRESHOLD).astype(int)

# Feature matrix as a plain NumPy array; columns follow df's column order.
X = df.values

# Hold out 20% of the rows for evaluation. The fixed random_state keeps the
# split identical across runs.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# fit() returns the estimator itself, so construction and training chain.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# score() reports mean accuracy on the held-out rows.
test_accuracy = model.score(X_test, y_test)
print("Accuracy on synthetic test set:", test_accuracy)


Accuracy on synthetic test set: 0.94


In [None]:
# Persist the fitted model to model.pkl in the current working directory.
joblib.dump(model, "model.pkl")

# Offer the file as a browser download when running inside Google Colab.
# The google.colab package is not importable in other environments, so guard
# the import: the model has already been written to disk above and the cell
# should not fail just because the download helper is unavailable.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; model saved locally as model.pkl")
else:
    files.download("model.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>