Add CSV to Compute Engine

In [None]:
import os

# Allocator configuration is read from the environment, so it is set here, ahead of the
# cell that imports torch. The value opts in to the CUDA caching allocator's
# expandable-segments mode (see the PyTorch CUDA memory-management notes).
os.environ.update(PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True")

Import/Install Libraries

In [None]:
# NOTE(review): no visible cell imports `pytorch_tabular` -- confirm this install is still
# needed, and pin a version if it is.
%pip install pytorch-tabular



In [None]:
# Pinned to the release recorded in this cell's captured output, so a later upstream
# release cannot silently change the FTTransformer API the model cell relies on.
%pip install tab_transformer_pytorch==0.3.0

Collecting tab_transformer_pytorch
  Downloading tab_transformer_pytorch-0.3.0-py3-none-any.whl.metadata (690 bytes)
Downloading tab_transformer_pytorch-0.3.0-py3-none-any.whl (6.9 kB)
Installing collected packages: tab_transformer_pytorch
Successfully installed tab_transformer_pytorch-0.3.0


In [None]:
import torch
import torch.nn as nn
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from tab_transformer_pytorch import FTTransformer
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error, accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, r2_score
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from transformers import BertTokenizer, BertModel
import matplotlib.pyplot as plt

Connect to Drive

In [None]:
# Colab-only setup: mount Google Drive into the runtime, then make the project folder the
# working directory so the relative paths used by later cells (the input CSV, the saved
# weights file) resolve inside it.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/472 Project

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1JIvKEjbFyoXJCJYSHeYkQm_kukfoyNBd/472 Project


Load Data from CSVs

In [None]:
# Column roles: numeric measurements and the free-text report are the model inputs,
# and IRS is the regression target.
numerical_features = ["HGT", "WGT", "BMI", "BF", "WNGSPN", "STNDRCH", "HANDL", "HANDW", "BAR", "PAN"]
text_column = "REPORT"
label_column = "IRS"

# Columns kept from the raw CSV. "POS" stays in the frame but is not passed to the
# datasets built in the visible cells.
selected_columns = ["POS", *numerical_features, text_column, label_column]

file_path = "FINAL_DATA_imputed.csv"
data = pd.read_csv(file_path)[selected_columns]

# Rows with no report get a placeholder string so the tokenizer always receives text.
data[text_column] = data[text_column].fillna("No report available").astype(str)

# Hold out 20% of the rows; the fixed random_state keeps the split stable between runs.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)


In [None]:
class InjuryRiskDataset(Dataset):
    """Map-style dataset pairing each row's numeric features with its tokenized report.

    Args:
        tabular_data: row-indexable (``.iloc``) table of numeric features.
        text_data: row-indexable (``.iloc``) collection of report strings.
        labels: row-indexable (``.iloc``) collection of target values; its length
            defines the dataset length.
        tokenizer: callable invoked as ``tokenizer(text, max_length=..., padding=...,
            truncation=..., return_tensors="pt")`` and returning a mapping with
            ``"input_ids"`` and ``"attention_mask"`` entries.
        max_len: length every report is padded or truncated to.
    """

    def __init__(self, tabular_data, text_data, labels, tokenizer, max_len=128):
        self.tabular, self.text, self.labels = tabular_data, text_data, labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, features, target)`` for row ``idx``."""
        features = torch.tensor(self.tabular.iloc[idx].values, dtype=torch.float)
        target = torch.tensor(self.labels.iloc[idx], dtype=torch.float)

        tokens = self.tokenizer(
            self.text.iloc[idx],
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # The tokenizer output carries a leading axis for the single text passed in;
        # drop it so the DataLoader can add the real batch axis.
        return (
            tokens["input_ids"].squeeze(0),
            tokens["attention_mask"].squeeze(0),
            features,
            target,
        )

# Tokenizer matching the pretrained text encoder used by the model.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def _build_dataset(frame):
    """Wrap one split (train or test) of the data frame in an InjuryRiskDataset."""
    return InjuryRiskDataset(
        frame[numerical_features], frame[text_column], frame[label_column], tokenizer
    )


train_dataset = _build_dataset(train_data)
test_dataset = _build_dataset(test_data)

# Only the training batches are shuffled; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)



In [None]:
class InjuryRiskModel(nn.Module):
    """Two-branch regressor: BERT over the report text, FT-Transformer over numeric features.

    The pooled BERT vector (768 values) is concatenated with the single value produced
    by the FT-Transformer, and a linear layer maps the 769-wide result to one score.

    Args:
        tabular_input_dim: number of continuous features supplied per sample.
    """

    def __init__(self, tabular_input_dim):
        super().__init__()

        # Text branch: pretrained encoder.
        self.bert = AutoModel.from_pretrained("bert-base-uncased")

        # Tabular branch: continuous inputs only (no categorical columns), reduced to
        # one output value per sample.
        ft_settings = dict(
            categories=(),
            num_continuous=tabular_input_dim,
            dim=32,
            dim_out=1,
            depth=6,
            heads=8,
            attn_dropout=0.1,
            ff_dropout=0.1,
        )
        self.ft_transformer = FTTransformer(**ft_settings)

        # Fusion head over [pooled BERT vector | FT-Transformer output].
        self.fc = nn.Linear(768 + 1, 1)

    def forward(self, input_ids, attention_mask, tabular_data):
        """Return one score per sample, shape (batch_size, 1)."""
        text_vec = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output

        # FTTransformer takes a categorical input as well; hand it a zero-width
        # placeholder on the same device as the numeric features.
        no_categories = tabular_data.new_zeros((tabular_data.size(0), 0), dtype=torch.long)
        tab_score = self.ft_transformer(x_categ=no_categories, x_numer=tabular_data)

        fused = torch.cat([text_vec, tab_score], dim=1)
        return self.fc(fused)



In [None]:
def train_model(model, train_loader, val_loader, epochs, device, lr=1e-4):
    """Train ``model`` with AdamW and MSE loss, printing mean train/val loss per epoch.

    Args:
        model: module called as ``model(input_ids, attention_mask, tabular_data)``
            whose output has a trailing dimension of size 1.
        train_loader: iterable of ``(input_ids, attention_mask, tabular_data, labels)``
            batches used for optimisation.
        val_loader: iterable of batches in the same layout, used only to report loss.
        epochs: number of passes over ``train_loader``.
        device: device the model and every batch are moved to.
        lr: AdamW learning rate.

    Returns:
        None. The model is updated in place.
    """
    # Move the model before building the optimizer, as the torch.optim docs recommend.
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    def _mean(total, n_batches):
        # An empty loader reports NaN instead of raising ZeroDivisionError.
        return total / n_batches if n_batches else float("nan")

    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0.0
        for input_ids, attention_mask, tabular_data, labels in tqdm(train_loader):
            input_ids, attention_mask, tabular_data, labels = (
                input_ids.to(device),
                attention_mask.to(device),
                tabular_data.to(device),
                labels.to(device),
            )

            optimizer.zero_grad()
            # Drop only the trailing output dimension. A bare squeeze() would also
            # remove the batch dimension when the last batch holds a single sample,
            # leaving a 0-d prediction that no longer matches the (1,) label tensor.
            outputs = model(input_ids, attention_mask, tabular_data).squeeze(-1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for input_ids, attention_mask, tabular_data, labels in val_loader:
                input_ids, attention_mask, tabular_data, labels = (
                    input_ids.to(device),
                    attention_mask.to(device),
                    tabular_data.to(device),
                    labels.to(device),
                )

                outputs = model(input_ids, attention_mask, tabular_data).squeeze(-1)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

        print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {_mean(train_loss, len(train_loader))}, Val Loss: {_mean(val_loss, len(val_loader))}")


In [None]:
# Seed PyTorch's RNG first so weight initialisation, dropout and DataLoader shuffling
# repeat from run to run (some GPU kernels may still be nondeterministic).
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = InjuryRiskModel(tabular_input_dim=len(numerical_features))

# NOTE: the held-out test split is passed as the validation loader, so the per-epoch
# "Val Loss" printed during training is computed on the same rows that are evaluated later.
train_model(model, train_loader, test_loader, epochs=10, device=device)

100%|██████████| 10/10 [00:06<00:00,  1.58it/s]


Epoch 1/10, Train Loss: 98.07840309143066, Val Loss: 37.96341196695963


100%|██████████| 10/10 [00:06<00:00,  1.59it/s]


Epoch 2/10, Train Loss: 75.69899806976318, Val Loss: 32.26097043355306


100%|██████████| 10/10 [00:06<00:00,  1.55it/s]


Epoch 3/10, Train Loss: 73.02829399108887, Val Loss: 33.67170079549154


100%|██████████| 10/10 [00:06<00:00,  1.55it/s]


Epoch 4/10, Train Loss: 71.06875877380371, Val Loss: 33.397433598836265


100%|██████████| 10/10 [00:06<00:00,  1.52it/s]


Epoch 5/10, Train Loss: 72.90549774169922, Val Loss: 31.84684944152832


100%|██████████| 10/10 [00:06<00:00,  1.53it/s]


Epoch 6/10, Train Loss: 74.73967456817627, Val Loss: 32.64815012613932


100%|██████████| 10/10 [00:06<00:00,  1.54it/s]


Epoch 7/10, Train Loss: 70.81602230072022, Val Loss: 32.64227803548177


100%|██████████| 10/10 [00:06<00:00,  1.58it/s]


Epoch 8/10, Train Loss: 73.96493530273438, Val Loss: 32.367563247680664


100%|██████████| 10/10 [00:06<00:00,  1.57it/s]


Epoch 9/10, Train Loss: 70.08533458709717, Val Loss: 32.6087958017985


100%|██████████| 10/10 [00:06<00:00,  1.60it/s]


Epoch 10/10, Train Loss: 70.91068019866944, Val Loss: 32.37890752156576


In [None]:
# Persist only the learned parameters (the state dict), not the module object itself.
# The relative path resolves against the current working directory.
trained_weights = model.state_dict()
torch.save(trained_weights, "model.pth")

In [None]:
def evaluate_model(model, test_loader, device, threshold=20):
    """Print classification and regression metrics for ``model`` on ``test_loader``.

    The model's continuous outputs are scored directly (MAE, RMSE). They are also
    binarised at ``threshold``, as are the labels, to score the model as an
    injury-prone / not-injury-prone classifier.

    Args:
        model: module called as ``model(input_ids, attention_mask, tabular_data)``
            whose output has a trailing dimension of size 1.
        test_loader: iterable of ``(input_ids, attention_mask, tabular_data, labels)``
            batches.
        device: device the model and every batch are moved to.
        threshold: values strictly above this count as the positive
            (injury prone) class, for predictions and labels alike.

    Returns:
        None. Metrics are printed.
    """
    # Do not rely on an earlier training call having moved the model.
    model.to(device)
    model.eval()  # Set the model to evaluation mode

    predictions = []
    true_labels = []
    continuous_predictions = []
    continuous_true_labels = []

    with torch.no_grad():
        for input_ids, attention_mask, tabular_data, labels in test_loader:
            input_ids, attention_mask, tabular_data, labels = (
                input_ids.to(device),
                attention_mask.to(device),
                tabular_data.to(device),
                labels.to(device),
            )

            # Drop only the trailing output dimension. A bare squeeze() turns a batch
            # of one sample into a 0-d tensor, and list.extend() on the resulting 0-d
            # array raises TypeError.
            outputs = model(input_ids, attention_mask, tabular_data).squeeze(-1)

            # 1 = injury prone, 0 = not injury prone
            predictions.extend((outputs > threshold).cpu().numpy())
            true_labels.extend((labels > threshold).cpu().numpy())

            # Raw values for the regression metrics
            continuous_predictions.extend(outputs.cpu().numpy())
            continuous_true_labels.extend(labels.cpu().numpy())

    # Classification metrics. zero_division=0 reports 0 instead of warning when no
    # sample is predicted (or labelled) positive.
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions, zero_division=0)
    recall = recall_score(true_labels, predictions, zero_division=0)
    f1 = f1_score(true_labels, predictions, zero_division=0)

    # Regression metrics
    mae = mean_absolute_error(continuous_true_labels, continuous_predictions)
    rmse = np.sqrt(mean_squared_error(continuous_true_labels, continuous_predictions))

    print(f"Classification Metrics:")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

    print(f"Regression Metrics:")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")

# Score the trained model on the held-out split, using the function's default threshold.
evaluate_model(model=model, test_loader=test_loader, device=device)

Classification Metrics:
Accuracy: 0.96
Regression Metrics:
Mean Absolute Error (MAE): 4.4721360206604
Root Mean Squared Error (RMSE): 6.049713134765625


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
