# BioMed: Real World Data - Staged Logistic Regression Model

---

**Group:**
- González Méndez, Alvaro ()
- Reyes Castro, Didier Yamil (didier.reyes.castro@alumnos.upm.es)
- Rodriguez Fernández, Cristina ()

**Course:** BioMedical Informatics - 2025/26

**Institution:** Polytechnic University of Madrid (UPM)

**Date:** October 2025

---

## Goals

The goal of the assignment is to implement a staged logistic regression model with real-world biomedical data. The model will be used to rank LOINC documents based on their relevance to specific clinical queries.

## 0 Setup

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
import joblib

## 1 Implementation

In [None]:
# Loading datasets

# Training data for the two stages of the model.
DATASET_FIRST_STAGE = 'data/first_stage_data.csv'
DATASET_SECOND_STAGE = 'data/second_stage_data.csv'

# Destination files used when the trained models are persisted with joblib.
MODEL_1_PATH = 'first_stage_logistic_regression_model.joblib'
MODEL_2_PATH = 'second_stage_logistic_regression_model.joblib'

try:
    df_first_stage = pd.read_csv(DATASET_FIRST_STAGE)
    df_second_stage = pd.read_csv(DATASET_SECOND_STAGE)
except FileNotFoundError as e:
    print(f"Error loading datasets: {e}")
    # `exit` is a convenience helper for interactive shells and is not meant
    # to be used in programs; raising SystemExit stops execution with the
    # same status code without depending on that helper.
    raise SystemExit(1) from e

### 1.1 Part A: Train First Logistic Regression Model (Intra-Clue)

The elementary clues taken into account for the first stage are: TF, IDF, is_in_component and is_in_system.

In [None]:
# Elementary-clue columns used as predictors by the first-stage model,
# and the label column it is trained against.
features_1 = ['TF', 'IDF', 'is_in_component', 'is_in_system']
target_1 = 'relevance'

# Design matrix and label vector for the first stage.
X1, Y1 = df_first_stage[features_1], df_first_stage[target_1]

In [None]:
# Fit the first-stage (intra-clue) model on the elementary clue features.
# Logistic Regression default parameters: penalty='l2', C=1.0, solver='lbfgs'
# The solver can be changed to 'liblinear', which works well for small datasets
# and binary classification.
# See: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
model_1 = LogisticRegression()
model_1.fit(X1, Y1)

# Save the trained model to a file (disabled by default; uncomment to persist)
# joblib.dump(model_1, MODEL_1_PATH)

### 1.2 Part B: Generate Second-Level Dataset

1. Get the log-odds from the first model: apply the trained model to the first-stage dataset to obtain $\log O(R \mid A_i)$ for each clue.
2. Sum the log-odds per document: group by `doc_id` and `query_id` and sum the log-odds to obtain the Z score of each document for each query.
3. Complete the second-stage dataset with the Z score (for documents with 0 clues, i.e. N = 0, fill Z with 0).

In [None]:
# 1. Get log-Odds
# Store the first-stage model's decision-function value for every clue row;
# this notebook uses that value as the per-clue log-odds of relevance.
df_first_stage['log_odds'] = model_1.decision_function(X1)

In [None]:
# 2. Calculate Z score per document
# Z is the sum of the first-stage log-odds over all clue rows that share the
# same (doc_id, query_id) pair.
Z = (
    df_first_stage
    .groupby(['doc_id', 'query_id'])['log_odds']
    .sum()
    .reset_index()
    .rename(columns={'log_odds': 'Z'})
)

In [None]:
# 3. Complete the second stage dataset with the Z score
# A left join keeps every second-stage row; rows with no first-stage clues
# (N = 0) get no match and therefore a missing Z, which is replaced by 0.
df_second_stage = pd.merge(df_second_stage, Z, how='left', on=['doc_id', 'query_id'])
df_second_stage = df_second_stage.fillna({'Z': 0})

### 1.3 Part C: Train Second Logistic Regression Model (Inter-Clue)

In [None]:
# Predictors of the second-stage (inter-clue) model: the summed log-odds Z and
# the number of clues N, plus the label column it is trained against.
features_2 = ['Z', 'N']
target_2 = 'relevance'

# The second-stage model must be trained on the second-stage dataset: that is
# the frame the Z column was merged into. The first-stage frame has no 'Z'
# column, so selecting from it raises a KeyError.
# NOTE(review): assumes 'N' and 'relevance' are columns of the second-stage
# CSV - confirm against the data file.
X2 = df_second_stage[features_2]
Y2 = df_second_stage[target_2]

In [None]:
# Fit the second-stage (inter-clue) model on the per-document Z and N features,
# using scikit-learn's default LogisticRegression settings.
model_2 = LogisticRegression()
model_2.fit(X2, Y2)

# Save the trained model to a file (disabled by default; uncomment to persist)
# joblib.dump(model_2, MODEL_2_PATH)

## 2 Retrieval

Let's rank the documents for a given query using the two-stage logistic regression model.

In [None]:
# Ideally, these structures should be generated from a large
# biomedical knowledge base. They are hardcoded here for simplicity.
#
# Maps each concept name to the list of surface terms (synonyms and
# abbreviations) that are searched for in the document fields.
THESAURUS = {
    'glucose': ['glucose'],
    'blood': ['blood', 'bld', 'serum', 'ser', 'plasma', 'plas'],
    'bilirubin': ['bilirubin'],
    'plasma': ['plasma', 'plas'],
    'white blood cells': ['white blood cells', 'wbc', 'leukocyte', 'lymphocyte', 'monocyte'],
}



# Ideally, this mapping should be generated at runtime from the THESAURUS.
# That would be part of a larger information retrieval module, which is
# out of scope for this example.
#
# Maps each supported query string to the THESAURUS concepts it contains.
# Queries that are not listed here cannot be ranked.
QUERY_TO_CONCEPTS = {
    'glucose in blood': ['glucose', 'blood'],
    'bilirubin in plasma': ['bilirubin', 'plasma'],
    'white blood cells count': ['white blood cells'],
}

# Getting our Corpus
# The corpus holds the LOINC documents that are searched and ranked.
CORPUS_PATH = 'data/loinc_docs.csv'
try:
    df_corpus = pd.read_csv(CORPUS_PATH)
except FileNotFoundError as e:
    print(f"Error loading corpus dataset: {e}")
    # `exit` is a convenience helper for interactive shells and is not meant
    # to be used in programs; raising SystemExit stops execution with the
    # same status code without depending on that helper.
    raise SystemExit(1) from e

In [None]:
import math

def check_match(field, concept_terms):
    """Return True if any concept term occurs in *field*, ignoring case.

    Matching is a plain substring test, so a short term such as 'ser' also
    matches inside longer words.

    Args:
        field: Text of one document field. Non-string values (for example the
            NaN that pandas uses for an empty CSV cell, or None) never match.
        concept_terms: Iterable of terms to look for. None or an empty
            collection never matches.

    Returns:
        bool: True when at least one term is contained in the field.
    """
    # Missing values read from a CSV arrive as float NaN, which has no
    # .lower(); treat anything that is not text as "no match" instead of
    # crashing.
    if not isinstance(field, str) or not concept_terms:
        return False

    # Lowercase the field once rather than once per term.
    haystack = field.lower()
    return any(term.lower() in haystack for term in concept_terms)

def check_appears_in_document(loinc_doc, concept_terms):
    """Return True if any concept term matches one of the document's text fields.

    The fields are tested in the order long common name, component, system,
    and the search stops at the first field that matches.

    Args:
        loinc_doc: Mapping-like document row providing the keys
            'long_common_name', 'component' and 'system'.
        concept_terms: Terms to look for, as accepted by check_match.

    Returns:
        bool: True when at least one field matches.
    """
    searched_fields = ('long_common_name', 'component', 'system')
    return any(
        check_match(loinc_doc[field_name], concept_terms)
        for field_name in searched_fields
    )

def build_first_stage_dataset(concepts, query=None):
    """Build the first-stage (intra-clue) feature rows for a set of concepts.

    One row is produced for every (concept, document) pair in which at least
    one of the concept's thesaurus terms appears in the document.

    Args:
        concepts: Iterable of concept names to look up in THESAURUS.
        query: Unused. Accepted so that callers which also pass the query
            text keep working.

    Returns:
        pandas.DataFrame with the columns 'loinc_num', 'concept', 'TF', 'IDF',
        'is_in_component' and 'is_in_system'. The columns are present even
        when no document matches.
    """
    columns = ['loinc_num', 'concept', 'TF', 'IDF', 'is_in_component', 'is_in_system']
    corpus_size = len(df_corpus)

    dataset_1_rows = []
    for concept in concepts:
        concept_terms = THESAURUS.get(concept)
        if not concept_terms:
            # Unknown concept: there is nothing to search for.
            print(f"No thesaurus terms found for concept: {concept}")
            continue

        # Scan the corpus once per concept. The matching documents are both
        # the rows to emit and the document frequency needed for the IDF, so
        # the IDF no longer has to be recomputed for every matching document.
        matching_docs = [
            doc for _, doc in df_corpus.iterrows()
            if check_appears_in_document(doc, concept_terms)
        ]
        if not matching_docs:
            continue

        idf = math.log(corpus_size / len(matching_docs))
        lowered_terms = [term.lower() for term in concept_terms]

        for loinc_doc in matching_docs:
            # TF counts term occurrences in the long common name only. A
            # missing name (non-string) contributes zero occurrences.
            name = loinc_doc['long_common_name']
            name_lower = name.lower() if isinstance(name, str) else ''
            tf = sum(name_lower.count(term) for term in lowered_terms)

            dataset_1_rows.append({
                'loinc_num': loinc_doc['loinc_num'],
                'concept': concept,
                'TF': tf,
                'IDF': idf,
                'is_in_component': int(check_match(loinc_doc['component'], concept_terms)),
                'is_in_system': int(check_match(loinc_doc['system'], concept_terms)),
            })

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(dataset_1_rows, columns=columns)

def build_second_stage_dataset(df, query):
    """Aggregate first-stage rows into one (Z, N) row per document.

    Args:
        df: First-stage frame with the columns 'loinc_num', 'concept' and
            'log_odds'.
        query: Query text. Not used by the aggregation.

    Returns:
        pandas.DataFrame with the columns 'loinc_num', 'Z' (sum of the
        log-odds of the document's rows) and 'N' (number of distinct concepts
        found in the document).
    """
    per_document = df.groupby('loinc_num')

    # Z: total log-odds of each document.
    z_scores = per_document['log_odds'].sum().rename('Z').reset_index()

    # N: how many different concepts matched each document.
    concept_counts = per_document['concept'].nunique().rename('N').reset_index()

    combined = z_scores.merge(concept_counts, how='left', on='loinc_num')

    # Replace any missing value with 0.
    for column in ('Z', 'N'):
        combined[column] = combined[column].fillna(0)

    return combined

def rank_documents(query):
    """Rank the corpus documents for *query* with the two-stage model.

    Args:
        query: Query text; it must be a key of QUERY_TO_CONCEPTS.

    Returns:
        pandas.DataFrame with the columns 'loinc_num', 'long_common_name' and
        'final_score', sorted by descending 'final_score'. The frame is empty
        when no document matches the query's concepts. Returns None when the
        query has no known concepts.
    """
    result_columns = ['loinc_num', 'long_common_name', 'final_score']

    # 1. Get the concepts for the query. Again this would be part of a larger
    # information retrieval module.
    concepts = QUERY_TO_CONCEPTS.get(query)

    if not concepts:
        print(f"No concepts found for query: {query}")
        return None

    # 2. Build dataset #1 for the query. Only the concepts are passed: the
    # builder derives every feature from the concepts and the corpus.
    df_first_stage_query = build_first_stage_dataset(concepts)

    # No matching document means there is nothing to score; the models cannot
    # be applied to an empty feature matrix.
    if df_first_stage_query.empty:
        return pd.DataFrame(columns=result_columns)

    # 3. Get log-odds from first model
    df_first_stage_query['log_odds'] = model_1.decision_function(df_first_stage_query[features_1])

    # 4. Build second stage dataset
    df_second_stage_query = build_second_stage_dataset(df_first_stage_query, query)

    # 5. Predict relevance using second model
    df_second_stage_query['final_score'] = model_2.decision_function(df_second_stage_query[features_2])

    # 6. Attach the document names. The second-stage frame only carries
    # 'loinc_num', so the name has to be looked up in the corpus. Duplicate
    # corpus ids are dropped so the join cannot multiply result rows.
    doc_names = df_corpus[['loinc_num', 'long_common_name']].drop_duplicates(subset='loinc_num')
    df_ranked = df_second_stage_query.merge(doc_names, on='loinc_num', how='left')

    # 7. Rank documents based on final score
    df_ranked = df_ranked.sort_values(by='final_score', ascending=False)

    return df_ranked[result_columns]


In [None]:
# Demo: rank the corpus for one of the supported queries and show the five
# best-scoring documents. rank_documents returns None for an unknown query,
# in which case nothing is printed here.
ranked_list_1 = rank_documents("glucose in blood")
if ranked_list_1 is not None:
    print("--- Top 5 Results for 'glucose in blood' ---")
    print(ranked_list_1.head(5))