# Maternal Risk Classification with GPT-2

Fine-tuning a pre-trained GPT-2 on the maternal risk dataset.

### Data Processing and Tokenization



In [1]:
# Read dataset
import pandas as pd

df = pd.read_csv('Maternal Health Risk Data Set.csv')
df.head()

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel
0,25,130,80,15.0,98.0,86,high risk
1,35,140,90,13.0,98.0,70,high risk
2,29,90,70,8.0,100.0,80,high risk
3,30,140,85,7.0,98.0,70,high risk
4,35,120,60,6.1,98.0,76,low risk


In [2]:
# Functions to clean dataframe
# Keeping diastoliclic BP and heart rate as features here to see if there's any effect

def feature_engineering(data):
    """Perform feature engineering on the dataset."""
    # Turn risk level from categories to numbers

    RiskLevel = {'low risk':0,
        'mid risk':1,
        'high risk':2}

    # apply using map
    data['RiskLevel'] = data['RiskLevel'].map(RiskLevel).astype(int)

    # Remove outlier point
    data = data.drop(data.index[data.HeartRate == 7])

    return data

def clean_data(data, drop_dup = False):
    """Clean and handle missing and duplicate values."""

    # Drop Na/missing values
    data = data.dropna()

    #Drop duplicate values?
    if drop_dup == True:
        data = data.drop_duplicates()

    return data

In [3]:
df_proc = clean_data(df)
df_proc = feature_engineering(df_proc)

In [None]:
# Tokenization
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

df_proc['tokens'] = df_proc.apply(lambda row: tokenizer(
    f"Age {row['Age']}, Systolic BP {row['SystolicBP']}, DiastolicBP {row['DiastolicBP']}, "
    f"Blood Sugar {row['BS']}, Body Temperature {row['BodyTemp']}, Heart Rate {row['HeartRate']}, "
    f"Health Risk Level {row['RiskLevel']}",

    return_tensors="pt"
), axis=1)

In [5]:
df_proc['labels'] = df_proc['RiskLevel']

### Define Dataset and Model


In [6]:
# Create dataset class
import torch
from torch.utils.data import Dataset, DataLoader

class MaternalHealthDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        return {
            'input_ids': self.df.iloc[idx]['tokens']['input_ids'].squeeze(),
            'attention_mask': self.df.iloc[idx]['tokens']['attention_mask'].squeeze(),
            'label': torch.tensor(self.df.iloc[idx]['labels'])
        }


In [7]:
# Split dataset
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df_proc, test_size=0.2, random_state=42)
train_dataset = MaternalHealthDataset(train_df)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)

In [None]:
# Define GPT2 model

model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=3,output_attentions=True)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
model.config.pad_token_id = model.config.eos_token_id

### Model Training

In [9]:
# Training loop for fine-tuning GPT2
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss}")

Epoch 1/3: 100%|██████████| 203/203 [11:09<00:00,  3.30s/it]


Epoch 1/3, Average Loss: 0.41977314366214685


Epoch 2/3: 100%|██████████| 203/203 [10:58<00:00,  3.24s/it]


Epoch 2/3, Average Loss: 0.028727984441123903


Epoch 3/3: 100%|██████████| 203/203 [11:53<00:00,  3.51s/it]

Epoch 3/3, Average Loss: 0.0052866821380600495





### Model Evaluation  

In [10]:
from sklearn.metrics import accuracy_score, classification_report

# Evaluate on the test set
model.eval()
test_dataset = MaternalHealthDataset(test_df)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False)

all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].numpy()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        predictions = torch.argmax(logits, dim=1).cpu().numpy()

        all_predictions.extend(predictions)
        all_labels.extend(labels)

Evaluating: 100%|██████████| 51/51 [00:36<00:00,  1.39it/s]


In [11]:
# Calculate accuracy and other metrics
accuracy = accuracy_score(all_labels, all_predictions)
classification_report_str = classification_report(all_labels, all_predictions, target_names=['0','1','2'])

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report_str)


Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        83
           1       1.00      1.00      1.00        72
           2       1.00      1.00      1.00        48

    accuracy                           1.00       203
   macro avg       1.00      1.00      1.00       203
weighted avg       1.00      1.00      1.00       203



### Save Fine-Tuned Model

In [12]:
output_directory = "fine_tuned_gpt2_model"

# Save model
model.save_pretrained(output_directory)

# Save tokenizer
tokenizer.save_pretrained(output_directory)

('fine_tuned_gpt2_model/tokenizer_config.json',
 'fine_tuned_gpt2_model/special_tokens_map.json',
 'fine_tuned_gpt2_model/vocab.json',
 'fine_tuned_gpt2_model/merges.txt',
 'fine_tuned_gpt2_model/added_tokens.json')