# Kenya Clinical Reasoning Challenge

In [1]:
import numpy as np
import pandas as pd

### Data

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train_raw = pd.read_csv('train_raw.csv')

##### Train

In [23]:
train.columns

Index(['Master_Index', 'County', 'Health level', 'Years of Experience',
       'Prompt', 'Nursing Competency', 'Clinical Panel', 'Clinician', 'GPT4.0',
       'LLAMA', 'GEMINI', 'DDX SNOMED'],
      dtype='object')

In [25]:
# checking the volume of data we have for train
print(f"Shape of the training data: {train.shape}")
train.head()

Shape of the training data: (400, 12)


Unnamed: 0,Master_Index,County,Health level,Years of Experience,Prompt,Nursing Competency,Clinical Panel,Clinician,GPT4.0,LLAMA,GEMINI,DDX SNOMED
0,ID_VBWWP,uasin gishu,sub county hospitals and nursing homes,18.0,i am a nurse with 18 years of experience in ge...,pediatric emergency burns,surgery,summary a 4 year old with 5 superficial burns ...,given your vast experience as a nurse in uasin...,1 immediate treatment protocol for second degr...,here s a response addressing the questions reg...,288514009 burn involving 5 percent of body sur...
1,ID_XMBBY,uasin gishu,national referral hospitals,17.0,i am a nurse with 17 years of experience in ge...,child health,paediatrics,summary 6 year old present with vomiting and a...,clinical summary • a 6 year old girl with know...,based on the symptoms and signs you ve describ...,based on the presentation the 6 year old girl ...,420270002 ketoacidosis due to type 1 diabetes ...
2,ID_JZNZW,kiambu,sub county hospitals and nursing homes,12.0,i am a nurse with 12 years of experience in ge...,general emergency,internal medicine,summary a 47 year old man presents with severe...,in this case you re dealing with a 47 year old...,firstly i must commend you on your thorough hi...,this 47 year old male presenting with severe r...,13200003 peptic ulcer disorder 25458004 acute ...
3,ID_QOQTK,uasin gishu,national referral hospitals,12.0,i am a nurse with 12 years of experience in pr...,critical care,internal medicine,summary 72 year old female with inability to w...,given er s clinical presentation and vitals th...,to me with this query based on the information...,this 92 year old female patient er presents wi...,14760008 constipation finding 419284004 altere...
4,ID_ZFJBM,uasin gishu,national referral hospitals,16.0,i am a nurse with 16 years of experience in ge...,adult health,internal medicine,a 22 year old female presents with headache di...,the 22 year old female patient is presenting w...,thank you for presenting this case based on th...,this 22 year old female patient presents with ...,95874006 carbon monoxide poisoning from fire d...


In [8]:
# missing values
print(f"Missing values:\n {train.isnull().sum()}\n")

# percentage of missing values
print(f"Missing values percentage:\n {train.isnull().mean() * 100}\n")

# Show rows with any missing values
missing_rows = train[train.isnull().any(axis=1)]
print(f"Missing rows:\n {missing_rows}")

Missing values:
 Master_Index             0
County                   0
Health level             0
Years of Experience    100
Prompt                   0
Nursing Competency       0
Clinical Panel           0
Clinician                0
GPT4.0                   0
LLAMA                    0
GEMINI                   0
DDX SNOMED               1
dtype: int64

Missing values percentage:
 Master_Index            0.00
County                  0.00
Health level            0.00
Years of Experience    25.00
Prompt                  0.00
Nursing Competency      0.00
Clinical Panel          0.00
Clinician               0.00
GPT4.0                  0.00
LLAMA                   0.00
GEMINI                  0.00
DDX SNOMED              0.25
dtype: float64

Missing rows:
     Master_Index       County                            Health level  \
7       ID_SUOCB  uasin gishu             national referral hospitals   
10      ID_LPHGS     kakamega                          health centres   
16      ID_DBLHH  u

In [15]:
# Count total duplicate rows
print(f"Duplicated values: {train.duplicated().sum()}\n")

# View duplicate rows
duplicates = train[train.duplicated()]
print(f"Duplicates:\n {duplicates}\n")

# Optionally, drop duplicates
# df = df.drop_duplicates()

Duplicated values: 0

Duplicates:
 Empty DataFrame
Columns: [Master_Index, County, Health level, Years of Experience, Prompt, Nursing Competency, Clinical Panel, Clinician, GPT4.0, LLAMA, GEMINI, DDX SNOMED]
Index: []



##### Train_Raw

In [21]:
# check columns
train_raw.columns

Index(['Master_Index', 'County', 'Health level', 'Years of Experience',
       'Prompt', 'Nursing Competency', 'Clinical Panel', 'Clinician', 'GPT4.0',
       'LLAMA', 'GEMINI', 'DDX SNOMED'],
      dtype='object')

In [27]:
# checking the volume of data we have for train_raw
print(f"Shape of the training data: {train_raw.shape}")
train_raw.head()

Shape of the training data: (400, 12)


Unnamed: 0,Master_Index,County,Health level,Years of Experience,Prompt,Nursing Competency,Clinical Panel,Clinician,GPT4.0,LLAMA,GEMINI,DDX SNOMED
0,ID_VBWWP,Uasin Gishu,Sub-county Hospitals and Nursing Homes,18.0,I am a nurse with 18 years of experience in Ge...,Pediatric Emergency Burns,SURGERY,Summary:\nA 4 year old with 5% superficial bur...,Given your vast experience as a nurse in Uasin...,1. Immediate treatment protocol for second-deg...,Here's a response addressing the questions reg...,288514009 | Burn involving 5 percent of body s...
1,ID_XMBBY,Uasin Gishu,National Referral Hospitals,17.0,I am a nurse with 17 years of experience in Ge...,Child Health,PAEDIATRICS,Summary\n6-year-old present with vomiting and ...,Clinical Summary: • A 6-year-old girl w...,Based on the symptoms and signs you've describ...,"Based on the presentation, the 6-year-old girl...",420270002 | Ketoacidosis due to type 1 diabete...
2,ID_JZNZW,Kiambu,Sub-county Hospitals and Nursing Homes,12.0,I am a nurse with 12 years of experience in Ge...,General Emergency,INTERNAL MEDICINE,Summary\nA 47-year-old man presents with sever...,"In this case, you're dealing with a 47-year-ol...","Firstly, I must commend you on your thorough h...","This 47-year-old male presenting with severe, ...",13200003 | Peptic ulcer (disorder)\n25458004 |...
3,ID_QOQTK,Uasin Gishu,National Referral Hospitals,12.0,I am a nurse with 12 years of experience in Pr...,Critical Care,INTERNAL MEDICINE,SUMMARY\n\n72-year-old female with inability t...,"Given ER's clinical presentation and vitals, t...",to me with this query. Based on the informatio...,This 92-year-old female patient (ER) presents ...,14760008 | Constipation (finding)\n419284004 |...
4,ID_ZFJBM,Uasin Gishu,National Referral Hospitals,16.0,I am a nurse with 16 years of experience in Ge...,Adult Health,INTERNAL MEDICINE,"A 22 year old female presents with headache, d...",The 22-year-old female patient is presenting w...,Thank you for presenting this case. Based on t...,This 22-year-old female patient presents with ...,95874006 | Carbon monoxide poisoning from fire...


In [29]:
# missing values
print(f"Missing values:\n {train_raw.isnull().sum()}\n")

# percentage of missing values
print(f"Missing values percentage:\n {train_raw.isnull().mean() * 100}\n")

# Show rows with any missing values
missing_rows = train_raw[train_raw.isnull().any(axis=1)]
print(f"Missing rows:\n {missing_rows}")

Missing values:
 Master_Index             0
County                   0
Health level             0
Years of Experience    100
Prompt                   0
Nursing Competency       0
Clinical Panel           0
Clinician                0
GPT4.0                   0
LLAMA                    0
GEMINI                   0
DDX SNOMED               1
dtype: int64

Missing values percentage:
 Master_Index            0.00
County                  0.00
Health level            0.00
Years of Experience    25.00
Prompt                  0.00
Nursing Competency      0.00
Clinical Panel          0.00
Clinician               0.00
GPT4.0                  0.00
LLAMA                   0.00
GEMINI                  0.00
DDX SNOMED              0.25
dtype: float64

Missing rows:
     Master_Index       County                            Health level  \
7       ID_SUOCB  Uasin Gishu             National Referral Hospitals   
10      ID_LPHGS     Kakamega                          Health centres   
16      ID_DBLHH  U

In [31]:
# Count total duplicate rows
print(f"Duplicated values:\n {train_raw.duplicated().sum()}\n")

# View duplicate rows
duplicates = train_raw[train_raw.duplicated()]
print(f"Duplicates:\n {duplicates}\n")

# Optionally, drop duplicates
# df = df.drop_duplicates()

Duplicated values:
 0

Duplicates:
 Empty DataFrame
Columns: [Master_Index, County, Health level, Years of Experience, Prompt, Nursing Competency, Clinical Panel, Clinician, GPT4.0, LLAMA, GEMINI, DDX SNOMED]
Index: []



##### Test

In [36]:
# checking the columns:
test.columns

Index(['Master_Index', 'County', 'Health level', 'Years of Experience',
       'Prompt', 'Nursing Competency', 'Clinical Panel'],
      dtype='object')

In [38]:
# checking the volume of data we have for test
print(f"Shape of the training data: {test.shape}")
test.head()

Shape of the training data: (100, 7)


Unnamed: 0,Master_Index,County,Health level,Years of Experience,Prompt,Nursing Competency,Clinical Panel
0,ID_CUAOY,uasin gishu,sub county hospitals and nursing homes,2.0,i am a nurse with 2 years of experience in gen...,adult health,surgery ent
1,ID_OGSAY,kiambu,sub county hospitals and nursing homes,22.0,i am a nurse with 22 years of experience in ge...,child health,surgery
2,ID_TYHSA,uasin gishu,national referral hospitals,,i am a nurse working in a national referral ho...,general emergency,internal medicine
3,ID_CZXLD,kakamega,dispensaries and private clinics,,i am a nurse working in a dispensaries and pri...,child health,paediatrics
4,ID_ZJQUQ,kakamega,health centres,,i am a nurse working in a health centres in ka...,child health,paediatrics


In [40]:
# missing values
print(f"Missing values:\n {test.isnull().sum()}\n")

# percentage of missing values
print(f"Missing values percentage:\n {test.isnull().mean() * 100}\n")

# Show rows with any missing values
missing_rows = test[test.isnull().any(axis=1)]
print(f"Missing rows:\n {missing_rows}")

Missing values:
 Master_Index            0
County                  0
Health level            0
Years of Experience    25
Prompt                  0
Nursing Competency      0
Clinical Panel          0
dtype: int64

Missing values percentage:
 Master_Index            0.0
County                  0.0
Health level            0.0
Years of Experience    25.0
Prompt                  0.0
Nursing Competency      0.0
Clinical Panel          0.0
dtype: float64

Missing rows:
    Master_Index       County                            Health level  \
2      ID_TYHSA  uasin gishu             national referral hospitals   
3      ID_CZXLD     kakamega        dispensaries and private clinics   
4      ID_ZJQUQ     kakamega                          health centres   
7      ID_GDFDN     kakamega                          health centres   
8      ID_UFAFI       kiambu  sub county hospitals and nursing homes   
13     ID_KQFSM     kakamega        dispensaries and private clinics   
20     ID_VJVBS  uasin gishu

In [42]:
# Count total duplicate rows
print(f"Duplicated values:\n {test.duplicated().sum()}\n")

# View duplicate rows
duplicates = test[test.duplicated()]
print(f"Duplicates:\n {duplicates}\n")

# Optionally, drop duplicates
# df = df.drop_duplicates()

Duplicated values:
 0

Duplicates:
 Empty DataFrame
Columns: [Master_Index, County, Health level, Years of Experience, Prompt, Nursing Competency, Clinical Panel]
Index: []



### Analysis

In [48]:
# 1. First analysis - Clinician Response Patterns
def analyze_responses(df):
    # Count structured terms
    summary_count = df['Clinician'].str.lower().str.contains('summary').sum()
    dx_count = df['Clinician'].str.lower().str.contains('dx').sum()
    plan_count = df['Clinician'].str.lower().str.contains('plan').sum()
    print(f"Responses with 'summary': {summary_count}/{len(df)} ({summary_count/len(df)*100:.1f}%)")
    print(f"Responses with 'dx': {dx_count}/{len(df)} ({dx_count/len(df)*100:.1f}%)")
    print(f"Responses with 'plan': {plan_count}/{len(df)} ({plan_count/len(df)*100:.1f}%)")

    # Word count
    df['clinician_word_count'] = df['Clinician'].str.split().str.len()
    print(f"Avg word count: {df['clinician_word_count'].mean():.1f}, Min: {df['clinician_word_count'].min()}, Max: {df['clinician_word_count'].max()}")

In [50]:
# train analysis response
analyze_responses(train)

Responses with 'summary': 325/400 (81.2%)
Responses with 'dx': 85/400 (21.2%)
Responses with 'plan': 28/400 (7.0%)
Avg word count: 109.0, Min: 29, Max: 320


In [52]:
# train_raw analysis response
analyze_responses(train_raw)

Responses with 'summary': 325/400 (81.2%)
Responses with 'dx': 85/400 (21.2%)
Responses with 'plan': 28/400 (7.0%)
Avg word count: 107.7, Min: 26, Max: 314


In [56]:
# 2. Second Analysis - Contextual Features
def explore_context(df):
    # Group by Health level
    health_summary = df.groupby('Health level')['clinician_word_count'].agg(['mean', 'count']).round(1)
    print("\nWord count by Health level:\n", health_summary)

    # Group by Nursing Competency
    competency_summary = df.groupby('Nursing Competency')['clinician_word_count'].agg(['mean', 'count']).round(1)
    print("\nWord count by Nursing Competency:\n", competency_summary)

    # Years of Experience vs. word count
    exp_summary = df.groupby('Years of Experience')['clinician_word_count'].agg(['mean', 'count']).round(1)
    print("\nWord count by Years of Experience:\n", exp_summary)

In [58]:
# train contextual features
explore_context(train)


Word count by Health level:
                                          mean  count
Health level                                        
community health centers                104.8      6
county hospitals                         96.9      9
dispensaries and private clinics        106.4     54
health centers                          123.0      1
health centres                          106.3     74
national referral hospitals             110.7    125
sub county hospitals and nursing homes  111.0    131

Word count by Nursing Competency:
                                  mean  count
Nursing Competency                          
adult health                    108.9    123
child health                    104.3     56
critical care                   139.0      2
emergency care adult            102.0      2
emergency care burns             98.0      1
emergency care gbv               80.0      4
emergency care mental health     60.0      1
emergency care pediatric        109.3     23
emergen

In [60]:
# train_raw contextual features
explore_context(train_raw)


Word count by Health level:
                                          mean  count
Health level                                        
Community Health Centers                102.5      6
County Hospitals                         93.8      9
Dispensaries and Private Clinics        104.4     54
Health Centers                          122.0      1
Health Centres                          121.0      5
Health centres                          105.2     69
National Referral Hospitals             109.2    125
Sub-county Hospitals and Nursing Homes  109.4    131

Word count by Nursing Competency:
                                  mean  count
Nursing Competency                          
Adult Health                    107.0    123
Child Health                    103.5     56
Critical Care                   143.0      2
Emergency Care - Adult           99.5      2
Emergency Care - Burns           95.0      1
Emergency Care - GBV             79.8      4
Emergency Care - Mental Health   66.0      1

In [92]:
# 3. Preprocessing Function
def preprocess_clinician(text):
    # Convert to lowercase
    text = text.lower()
    # Keep only alphanumeric characters and spaces
    text = ''.join(c for c in text if c.isalnum() or c.isspace())
    # Replace multiple spaces/newlines with a single space
    text = ' '.join(text.split())
    return text

In [94]:
# Test preprocessing
sample_response = train['Clinician'][0]
print(f"Original:\n {sample_response}\n")
print(f"Preprocessed:\n{preprocess_clinician(sample_response)}")

Original:
 summary a 4 year old with 5 superficial burns no other injuries immediate management paracetamol analgesics to to ensure child has minimal or no pain cleaning and frosting of wound with silver sulpha fizika topical prophylactic can be considered in this case good nutrition high protein diet

Preprocessed:
summary a 4 year old with 5 superficial burns no other injuries immediate management paracetamol analgesics to to ensure child has minimal or no pain cleaning and frosting of wound with silver sulpha fizika topical prophylactic can be considered in this case good nutrition high protein diet


In [96]:
# Test preprocessing
sample_response = train_raw['Clinician'][0]
print(f"Original:\n {sample_response}\n")
print(f"Preprocessed:\n{preprocess_clinician(sample_response)}")

Original:
 Summary:
A 4 year old with 5% superficial burns. No other injuries

Immediate Management:

 * Paracetamol analgesics to to ensure child has minimal
 or no pain
 * Cleaning and frosting of wound with silver sulpha-fizika
* Topical prophylactic can be considered in this case
 * Good nutrition - high protein diet

Preprocessed:
summary a 4 year old with 5 superficial burns no other injuries immediate management paracetamol analgesics to to ensure child has minimal or no pain cleaning and frosting of wound with silver sulphafizika topical prophylactic can be considered in this case good nutrition high protein diet


### Handle missing value

In [100]:
def update_prompt(row):
    if pd.isna(row['Years of Experience']):
        # Insert "with unknown years of experience" after "I am a nurse"
        return row['Prompt'].replace("i am a nurse", "i am a nurse with unknown years of experience")
    return row['Prompt']

In [102]:
train['Prompt'] = train.apply(update_prompt, axis=1)
train_raw['Prompt'] = train_raw.apply(update_prompt, axis=1)
test['Prompt'] = test.apply(update_prompt, axis=1)

In [106]:
train.isnull().sum()

Master_Index              0
County                    0
Health level              0
Years of Experience     100
Prompt                    0
Nursing Competency        0
Clinical Panel            0
Clinician                 0
GPT4.0                    0
LLAMA                     0
GEMINI                    0
DDX SNOMED                1
clinician_word_count      0
dtype: int64

In [89]:
# Check abbreviations in a few responses
for i in [1, 2, 3]:
    print(f"\nIndex {i}:\n", train_raw['Clinician'][i])


Index 1:
 Summary
6-year-old present with vomiting and abdominal pains. Known diabetic on insulin but doesn’t take it as scheduled due to lack of funds.
He is confused, Kussmaul breathing, fruity-scented breath, dry tongue & blurry vision.

Vitals: Temp (N), pulse ↑ (120), BP ↓ (48), rapid laboured SpO₂ 90%.

Diagnosis:
Diabetic Ketoacidosis (DKA) in known DM type 1 patient due to insulin insufficiency and uncompliance.

Differentials 
Sepsis in a knon diabetic patient
Investigations:

1. Laboratory investigations:

Urinalysis 
Blood gas analysis
RBS
HbA1c 
UECs 
CBC



Management:

1. Insert IV large bore cannula.

2. Give normal saline bolus and maintenance fluids.

3. Check RBS regularly.

4. Give insulin 0.1 u/kg/hr (IV).

5. Monitor blood glucose levels.

6. Check UECs for hypokalemia – if present, add it to the IV fluid.

7. Treat underlying infection with antibiotics.

8. Do regular ketone, pH, and bicarbonate checks.

Index 2:
 Summary
A 47-year-old man presents with severe pe

## Baseline Modelling

In [109]:
pip install transformers datasets torch pandas numpy

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-19.0.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m215.3 kB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m[36m0:00:01[0m:01[0m
[?25hDownloading multiprocess-0.70.16-py312-none-any.whl (146 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m146.7/146.7 kB[0m [31m224.2 kB/s[0m eta [36m0:00:00[0m1m205.9 kB/s[0m eta [36m0:00:01[0m
[?25hDownloading pyarrow-19.0.1-cp312-cp312-manylinux_2_28_x86_64.whl

### Preparing the Data (using train_raw)

In [None]:
import pandas as pd
from datasets import Dataset

# Load data
train_raw = pd.read_csv('train_raw.csv')

# Preprocessing function (fixed from Phase 1)
def preprocess_clinician(text):
    text = text.lower()
    text = ''.join(c for c in text if c.isalnum() or c.isspace())
    text = ' '.join(text.split())
    return text

# Handle missing Years of Experience in Prompt
def update_prompt(row):
    if pd.isna(row['Years of Experience']):
        return row['Prompt'].replace("i am a nurse", "i am a nurse with unknown years of experience")
    return row['Prompt']

# Apply preprocessing
train_raw['Prompt'] = train_raw.apply(update_prompt, axis=1)
train_raw['Clinician'] = train_raw['Clinician'].apply(preprocess_clinician)

# Create a Hugging Face Dataset
data = {'input_text': train_raw['Prompt'], 'target_text': train_raw['Clinician']}
dataset = Dataset.from_dict(data)

# Split into train (80%) and validation (20%)
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
val_dataset = train_test_split['test']

# Verify
print("Train sample:", train_dataset[0])
print("Validation sample:", val_dataset[0])