In [1]:
# --- Mount Google Drive ---
# The data files read and written by later cells live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [2]:
pip install gradio


Collecting gradio
  Downloading gradio-5.29.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.1 (from gradio)
  Downloading gradio_client-1.10.1-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.10-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.

In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, roc_auc_score, classification_report

In [6]:
import joblib


In [7]:
# === 1. LOAD DATA ===
# Short, code-friendly names for the patient table, in file column order.
PATIENT_COLUMNS = [
    'Patient_ID', 'Biological_Sex', 'Gender_Identity', 'Age', 'Weight_lb', 'Height_in',
    'Race', 'Ethnicity', 'Smoking_Status', 'Residence', 'Current_Medication',
    'Twin_or_Triplet', 'Health_Conditions', 'Regular_Menstruation', 'Willing_to_Travel',
    'Travel_Radius_mi', 'Interested_in_Parenting_Study', 'Survey_Only_Study',
    'Preferred_Language'
]

# The first line of the patient file is skipped; the header that pandas then
# picks up is replaced wholesale by PATIENT_COLUMNS below.
patients = pd.read_csv("/content/drive/MyDrive/patient_data-all.csv", skiprows=1)
trials = pd.read_csv("/content/drive/MyDrive/ctg-studies-modified.csv")

patients.columns = PATIENT_COLUMNS

# === 2. FEATURE ENGINEERING ===
YES_NO_CODES = {'yes': 1, 'no': 0}
SEX_CODES = {'female': 0, 'male': 1}


def _encode_text(text_column, codes):
    """Strip and lower-case a text column, then translate it through ``codes``."""
    return text_column.str.strip().str.lower().map(codes)


def _on_medication_flag(entry):
    """Return 1 for any text answer other than 'no' (case/space-insensitive), else 0."""
    if not isinstance(entry, str):
        return 0
    return 1 if entry.strip().lower() != 'no' else 0


def _patient_pcos_flag(entry):
    """Return 1 when the text mentions 'pcos' (case-insensitive), else 0."""
    if isinstance(entry, str) and 'pcos' in entry.lower():
        return 1
    return 0


def _trial_pcos_flag(entry):
    """Return 1 when the text mentions 'pcos' or 'polycystic' (case-insensitive), else 0."""
    if not isinstance(entry, str):
        return 0
    lowered = entry.lower()
    return 1 if ('pcos' in lowered or 'polycystic' in lowered) else 0


def _range_text_or_default(column, default):
    """Fill missing values with ``default``, cast to str, and replace any value without '-' by ``default``."""
    as_text = column.fillna(default).astype(str)
    return as_text.apply(lambda text: text if '-' in text else default)


# --- Patient-side features ---
height_squared = patients['Height_in'] ** 2
patients['BMI'] = (patients['Weight_lb'] / height_squared) * 703  # imperial BMI formula
patients['On_Medication'] = patients['Current_Medication'].apply(_on_medication_flag)
patients['PCOS'] = patients['Health_Conditions'].apply(_patient_pcos_flag)
patients['Regular_Menstruation'] = _encode_text(patients['Regular_Menstruation'], YES_NO_CODES)
patients['Smoking_Status'] = _encode_text(patients['Smoking_Status'], YES_NO_CODES)
patients['Biological_Sex'] = _encode_text(patients['Biological_Sex'], SEX_CODES)

# --- Trial-side features ---
trials['PCOS'] = trials['Health Conditions'].apply(_trial_pcos_flag)

# Ranges are stored as "min-max" text; anything missing or not in that shape
# falls back to a wide default range.
trials['Age'] = _range_text_or_default(trials['Age'], "18-99")
trials['BMI (kg/m^2)'] = _range_text_or_default(trials['BMI (kg/m^2)'], "0-100")

age_bounds = trials['Age'].str.split('-', expand=True).astype(int)
trials[['Age_Min', 'Age_Max']] = age_bounds
bmi_bounds = trials['BMI (kg/m^2)'].str.split('-', expand=True).astype(float)
trials[['BMI_Min', 'BMI_Max']] = bmi_bounds

trials['Regular menstruation?'] = _encode_text(trials['Regular menstruation?'], YES_NO_CODES)
trials['Smoking status'] = _encode_text(trials['Smoking status'], YES_NO_CODES)
trials['Current Medication'] = _encode_text(trials['Current Medication'], YES_NO_CODES)
trials['Biological Sex'] = _encode_text(trials['Biological Sex'], SEX_CODES)

# === 3. BUILD PAIRS ===
# A (patient, trial) pair is labelled a match when at least this many of the
# seven criteria below are satisfied.
MATCH_THRESHOLD = 6

# Convert each frame to a list of plain dicts once. Iterating with
# `iterrows()` builds a new Series per row, and the inner loop repeated that
# work for every patient; the dict records carry the same values.
patient_records = patients.to_dict('records')
trial_records = trials.to_dict('records')

pairs = []
labels = []

for patient in patient_records:
    for trial in trial_records:
        # Comparisons involving NaN evaluate to False, so a missing value on
        # either side counts as "not matched" for that criterion.
        age_match = trial['Age_Min'] <= patient['Age'] <= trial['Age_Max']
        sex_match = patient['Biological_Sex'] == trial['Biological Sex']
        bmi_match = trial['BMI_Min'] <= patient['BMI'] <= trial['BMI_Max']
        smoke_match = patient['Smoking_Status'] == trial['Smoking status']
        # NOTE(review): unlike the other criteria this is not an equality test --
        # it is only True when both the trial and the patient are flagged 1, so
        # "no medication" on both sides never counts as a match. Confirm this is
        # intended before changing it (doing so changes the labels and the model).
        med_match = trial['Current Medication'] == 1 and patient['On_Medication'] == 1
        pcos_match = trial['PCOS'] == patient['PCOS']
        menstruation_match = patient['Regular_Menstruation'] == trial['Regular menstruation?']

        score = sum([age_match, sex_match, bmi_match, smoke_match, med_match, pcos_match, menstruation_match])
        label = 1 if score >= MATCH_THRESHOLD else 0

        # Feature row: patient attributes plus the trial's categorical requirements.
        pairs.append({
            'Age': patient['Age'],
            'BMI': patient['BMI'],
            'Biological_Sex': patient['Biological_Sex'],
            'Smoking_Status': patient['Smoking_Status'],
            'On_Medication': patient['On_Medication'],
            'PCOS': patient['PCOS'],
            'Regular_Menstruation': patient['Regular_Menstruation'],
            'Trial_Sex': trial['Biological Sex'],
            'Trial_Smoking': trial['Smoking status'],
            'Trial_Medication': trial['Current Medication'],
            'Trial_PCOS': trial['PCOS'],
            'Trial_Menstruation': trial['Regular menstruation?']
        })

        labels.append(label)

pair_df = pd.DataFrame(pairs)
pair_df['Label'] = labels

# === 4. TRAIN MODEL ===
# Separate the target column from the feature columns.
y = pair_df['Label']
X = pair_df.drop(columns=['Label'])

# 80/20 split, stratified on the label so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

# Hard predictions feed the classification report; the positive-class
# probability feeds the AUC.
test_predictions = model.predict(X_test)
test_match_probability = model.predict_proba(X_test)[:, 1]

print("Classification Report:\n", classification_report(y_test, test_predictions))
print("AUC Score:", roc_auc_score(y_test, test_match_probability))

# === 5. SAVE MODEL ===
# NOTE(review): the model goes to the current working directory while the two
# CSVs go to Drive -- confirm the model does not also need to persist on Drive.
MODEL_PATH = "trial_match_model_v3_numeric.pkl"
PAIRS_CSV_PATH = "/content/drive/MyDrive/numeric_pair_df.csv"
TRIALS_CSV_PATH = "/content/drive/MyDrive/numeric_cleaned_trials.csv"

joblib.dump(model, MODEL_PATH)
pair_df.to_csv(PAIRS_CSV_PATH, index=False)
trials.to_csv(TRIALS_CSV_PATH, index=False)


Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.98      1380
           1       0.84      0.87      0.85       220

    accuracy                           0.96      1600
   macro avg       0.91      0.92      0.92      1600
weighted avg       0.96      0.96      0.96      1600

AUC Score: 0.9940892621870883


In [8]:
import gradio as gr
import pandas as pd
import numpy as np
import joblib

# Load the model and cleaned trial data
# Both artifacts are the ones written by the training cell's save step.
SAVED_MODEL_PATH = "trial_match_model_v3_numeric.pkl"
CLEANED_TRIALS_PATH = "/content/drive/MyDrive/numeric_cleaned_trials.csv"

model = joblib.load(SAVED_MODEL_PATH)
trials = pd.read_csv(CLEANED_TRIALS_PATH)

# Recommender function
def _score_feature_rows(clf, feature_frame, titles):
    """Return the positive-class probability for every row of ``feature_frame``.

    The whole frame is scored with a single ``predict_proba`` call. If that
    call raises, each row is scored on its own so that one bad row only drops
    its own trial; the entry for a row that still fails is ``None``.
    """
    try:
        return [float(p) for p in clf.predict_proba(feature_frame)[:, 1]]
    except Exception as batch_error:
        print(f"⚠️ Batch scoring failed ({batch_error}); retrying trial by trial.")

    scores = []
    for position, title in enumerate(titles):
        try:
            single_row = feature_frame.iloc[[position]]
            scores.append(float(clf.predict_proba(single_row)[0][1]))
        except Exception as e:
            print(f"⚠️ Skipped trial '{title}' due to error: {e}")
            scores.append(None)
    return scores


def recommend_ml_numeric(age, weight, height, bio_sex, smoking, medication, pcos, menstruation,
                         *, trials_df=None, clf=None, top_k=3):
    """Score every trial for one patient and return the best matches as Markdown.

    Parameters
    ----------
    age : number
        Patient age, compared against each trial's ``Age_Min``/``Age_Max``.
    weight, height : number
        Weight in pounds and height in inches; combined into BMI as
        ``weight / height**2 * 703``.
    bio_sex, smoking, medication, pcos, menstruation : int
        0/1 flags compared against the trial's corresponding columns.
    trials_df : pandas.DataFrame, optional
        Trial table to score against. Defaults to the notebook-level ``trials``.
    clf : object with ``predict_proba``, optional
        Classifier used for scoring. Defaults to the notebook-level ``model``.
    top_k : int, optional
        Number of highest-scoring trials to return (default 3).

    Returns
    -------
    str
        Markdown with one entry per returned trial (title link, match score and
        the criteria that matched), or a plain message when input is
        incomplete/invalid or no trial could be scored.
    """
    trials_df = trials if trials_df is None else trials_df
    clf = model if clf is None else clf

    # A blank form field can arrive as None, and a zero height would divide by
    # zero; both used to raise from the BMI arithmetic below instead of giving
    # the user something actionable.
    answers = (age, weight, height, bio_sex, smoking, medication, pcos, menstruation)
    if any(value is None for value in answers):
        return "⚠️ Please fill in every field before submitting."
    if height <= 0:
        return "⚠️ Height must be greater than zero."

    # Calculate BMI
    bmi = (weight / (height ** 2)) * 703

    # First pass: build the feature row and the explanation for each trial.
    # Key order matches the columns the model was fitted on.
    candidates = []  # (title, url, explanation, feature_row)
    for _, trial in trials_df.iterrows():
        try:
            features = {
                'Age': age,
                'BMI': bmi,
                'Biological_Sex': bio_sex,
                'Smoking_Status': smoking,
                'On_Medication': medication,
                'PCOS': pcos,
                'Regular_Menstruation': menstruation,
                'Trial_Sex': trial['Biological Sex'],
                'Trial_Smoking': trial['Smoking status'],
                'Trial_Medication': trial['Current Medication'],
                'Trial_PCOS': trial['PCOS'],
                'Trial_Menstruation': trial['Regular menstruation?']
            }

            # Explanation (simple match highlights)
            matched = []
            if trial['Age_Min'] <= age <= trial['Age_Max']:
                matched.append("✅ Age in range")
            if trial['BMI_Min'] <= bmi <= trial['BMI_Max']:
                matched.append("✅ BMI in range")
            if trial['Biological Sex'] == bio_sex:
                matched.append("✅ Sex")
            if trial['Smoking status'] == smoking:
                matched.append("✅ Smoking")
            # NOTE(review): the training labels treat medication as matched only
            # when both trial and patient are 1; this highlight uses plain
            # equality. Confirm which rule is intended.
            if trial['Current Medication'] == medication:
                matched.append("✅ Meds")
            if trial['PCOS'] == pcos:
                matched.append("✅ PCOS")
            if trial['Regular menstruation?'] == menstruation:
                matched.append("✅ Menstruation")

            candidates.append((trial['Clinical Trial'], trial['urls'], "; ".join(matched), features))

        except Exception as e:
            print(f"⚠️ Skipped trial '{trial.get('Clinical Trial', 'Unknown')}' due to error: {e}")
            continue

    if not candidates:
        return "😕 No strong match found. Try adjusting your input."

    # Second pass: score all trials at once instead of one model call per trial.
    feature_frame = pd.DataFrame([candidate[3] for candidate in candidates])
    scores = _score_feature_rows(clf, feature_frame, [candidate[0] for candidate in candidates])

    results = [
        (title, url, score, explanation)
        for (title, url, explanation, _), score in zip(candidates, scores)
        if score is not None
    ]

    # Sort and return the top matches (stable sort keeps table order on ties).
    top_matches = sorted(results, key=lambda x: x[2], reverse=True)[:top_k]

    if top_matches:
        return "\n\n".join([
            f"🔗 **[{title}]({url})**\nMatch Score: {score:.2f}\n**Why matched:** {explanation}"
            for title, url, score, explanation in top_matches
        ])
    else:
        return "😕 No strong match found. Try adjusting your input."



In [9]:
# Gradio interface
def _binary_radio(label):
    """Build a two-option (0 or 1) radio input carrying the given label."""
    return gr.Radio([0, 1], label=label)


# Free-form numeric fields, in the order the recommender expects them.
numeric_inputs = [
    gr.Number(label=text)
    for text in ("Age", "Weight (lbs)", "Height (in)")
]

# 0/1 questions; each label spells out what the two codes mean.
binary_inputs = [
    _binary_radio(text)
    for text in (
        "Biological Sex (0=female, 1=male)",
        "Smoking Status (0=no, 1=yes)",
        "Currently on Medication? (0=no, 1=yes)",
        "Diagnosed with PCOS? (0=no, 1=yes)",
        "Regular Menstruation? (0=no, 1=yes)",
    )
]

app = gr.Interface(
    fn=recommend_ml_numeric,
    inputs=numeric_inputs + binary_inputs,
    outputs="markdown",
    title="🧬 Clinical Trial Matcher (Numeric)",
    description="Fill in your info (all 0/1 values) to get matched to top trials using an ML model"
)

app.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://67dd56a0828c755643.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


