In [None]:
%pip install pytorch_tabular



In [None]:
%pip install torch -U



In [None]:
# Mount Google Drive into the Colab filesystem (this import only exists on Colab).
from google.colab import drive
drive.mount('/content/drive')
# Switch the working directory to the project folder so the relative data path
# used later ("FINAL_DATA_imputed.csv") resolves against it.
%cd /content/drive/MyDrive/472 Project

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1JIvKEjbFyoXJCJYSHeYkQm_kukfoyNBd/472 Project


In [None]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from transformers import BertTokenizer, BertModel
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular import TabularModel
from pytorch_tabular.models import FTTransformerConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig
from torch.cuda.amp import GradScaler, autocast

In [None]:
# Load data
data_path = "FINAL_DATA_imputed.csv"  # Replace with your file path
df = pd.read_csv(data_path)

# Preprocessing
# Handle missing text data. The filled column is assigned back explicitly:
# `df['REPORT'].fillna(..., inplace=True)` operates on an intermediate Series,
# which pandas warns about and which stops updating `df` in pandas 3.0.
df['REPORT'] = df['REPORT'].fillna("No scouting report available")

# Select columns to use
features = ['POS', 'HGT', 'WGT', 'BMI', 'BF', 'WNGSPN', 'STNDRCH', 'HANDL', 'HANDW', 'BAR', 'PAN', 'REPORT']
target = 'IRS'

# Split data into train and test (fixed seed so the split is reproducible)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Loss handed to `tabular_model.fit` for the tabular-only training run.
loss = nn.L1Loss()

# Data schema: one target, ten continuous measurements, one categorical column.
data_config = DataConfig(
    target=[target],  # the target must always be given as a list
    continuous_cols=[
        'HGT', 'WGT', 'BMI', 'BF', 'WNGSPN',
        'STNDRCH', 'HANDL', 'HANDW', 'BAR', 'PAN',
    ],
    categorical_cols=['POS'],
)

# Head: no extra hidden layers (just the mapping layer to output_dim),
# dropout 0.2, Kaiming He initialisation. Passed on as a plain dict.
head_config = vars(
    LinearHeadConfig(
        layers="",
        dropout=0.2,
        initialization="kaiming",
    )
)

# FT-Transformer regression model using the linear head configured above.
model_config = FTTransformerConfig(
    task="regression",
    head="LinearHead",
    head_config=head_config,
    target_range=[(0, 100)],  # adjusted for the IRS range
    input_embed_dim=32,  # customise based on dataset
)

# Trainer: LR finder on, early stopping and checkpointing both keyed on valid_loss.
trainer_config = TrainerConfig(
    auto_lr_find=True,            # run the LRFinder to derive a learning rate
    batch_size=256,
    max_epochs=30,
    early_stopping="valid_loss",  # metric monitored for early stopping
    early_stopping_mode="min",    # lower valid_loss is better
    early_stopping_patience=5,    # epochs to wait before terminating
    checkpoints="valid_loss",     # keep the best model by validation loss
    load_best=True,               # reload the best checkpoint after training
    accelerator="auto",           # pick cpu / gpu / other automatically
)
optimizer_config = OptimizerConfig()

# Assemble the TabularModel from the four configs.
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['REPORT'].fillna("No scouting report available", inplace=True)  # Handle missing text data
INFO:pytorch_tabular.tabular_model:Experiment Tracking is turned off


In [None]:
# Train the tabular model on the training split with the loss defined earlier.
tabular_model.fit(train=train_df, loss=loss)

# Pretrained BERT tokenizer and encoder used for the scouting-report text.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Freeze every BERT parameter, then re-enable gradients for the last encoder
# layer only, so that layer is the only part of BERT that gets fine-tuned.
bert_model.requires_grad_(False)
bert_model.encoder.layer[-1].requires_grad_(True)


def tokenize_reports(reports, tokenizer, max_length=128):
    """Tokenise a collection of report texts into fixed-length model inputs.

    Args:
        reports: Object exposing ``.tolist()`` that yields the report strings
            (the notebook passes a DataFrame column).
        tokenizer: Callable tokenizer; invoked with the list of texts plus
            ``max_length``, ``truncation``, ``padding`` and ``return_tensors``.
        max_length: Length every sequence is truncated / padded to.

    Returns:
        Tuple ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    texts = reports.tolist()
    tokenizer_options = {
        'max_length': max_length,
        'truncation': True,
        'padding': 'max_length',
        'return_tensors': "pt",
    }
    batch = tokenizer(texts, **tokenizer_options)
    return batch['input_ids'], batch['attention_mask']


# Tokenise the REPORT text of each split with the same tokenizer and default max_length.
train_reports_input_ids, train_reports_attention_mask = tokenize_reports(
    reports=train_df['REPORT'],
    tokenizer=tokenizer,
)
test_reports_input_ids, test_reports_attention_mask = tokenize_reports(
    reports=test_df['REPORT'],
    tokenizer=tokenizer,
)

class InjuryRiskModel(nn.Module):
    """Fuses a BERT text encoder with a tabular model for regression.

    The pooled BERT output and the tabular model's feature vector are
    concatenated, passed through dropout, and mapped to ``output_dim`` by a
    single linear layer.

    Args:
        bert_model: Module called with ``input_ids`` / ``attention_mask`` whose
            result exposes ``pooler_output``.
        ft_transformer_model: Module called with a dict holding ``'continuous'``
            and ``'categorical'`` tensors. It may return a tensor or a dict
            containing ``'backbone_features'``.
        input_dim: Width of the concatenated feature vector fed to the final
            linear layer (text features + tabular features).
        output_dim: Number of regression outputs.
    """

    def __init__(self, bert_model, ft_transformer_model, input_dim, output_dim=1):
        super().__init__()
        self.bert = bert_model
        self.ft_transformer = ft_transformer_model
        self.fc = nn.Linear(input_dim, output_dim)
        self.dropout = nn.Dropout(0.3)

    def forward(self, tabular_data, input_ids, attention_mask):
        """Return predictions for one batch.

        Args:
            tabular_data: Dict with ``'continuous'`` and ``'categorical'`` tensors.
            input_ids: Token ids for the text input.
            attention_mask: Attention mask matching ``input_ids``.

        Returns:
            Tensor produced by the final linear layer.

        Raises:
            KeyError: If the tabular model returns a dict without
                ``'backbone_features'``.
        """
        # BERT outputs
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = bert_outputs.pooler_output

        # Process tabular data with FT-Transformer
        tabular_output = self.ft_transformer({
            'continuous': tabular_data['continuous'],
            'categorical': tabular_data['categorical'],
        })

        # A dict cannot be concatenated with the text features. When the tabular
        # model returns one, use its pre-head feature vector instead.
        # NOTE(review): pytorch_tabular models appear to return
        # {'logits', 'backbone_features'}; confirm against the installed version
        # and check that the feature width matches `input_dim`.
        if isinstance(tabular_output, dict):
            tabular_output = tabular_output['backbone_features']

        # Combine outputs
        combined_features = torch.cat((pooled_output, tabular_output), dim=1)
        return self.fc(self.dropout(combined_features))


# Width of the fused feature vector: tabular embedding width plus BERT hidden size.
input_dim = model_config.input_embed_dim + bert_model.config.hidden_size

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the fused model around the fitted tabular network and move it to the device.
combined_model = InjuryRiskModel(bert_model, tabular_model.model, input_dim).to(device)

# Define loss and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(params=combined_model.parameters(), lr=1e-4)

# Update the dataset class

# Ensure that the categorical data is passed correctly in the format expected by the FT-Transformer
class InjuryDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tabular features, tokenised text and targets.

    Args:
        tabular_data: Continuous features, indexed by sample on the first axis.
        categorical_data: Categorical codes, indexed by sample on the first
            axis. A 1-D tensor (one code per sample) is reshaped to ``(N, 1)``
            so that batches have an explicit feature axis.
        input_ids: Token ids, indexed by sample.
        attention_mask: Attention masks, indexed by sample.
        targets: Regression targets, indexed by sample. Also defines the length.
    """

    def __init__(self, tabular_data, categorical_data, input_ids, attention_mask, targets):
        # A 1-D code vector would collate to shape (batch,). Consumers that read
        # `categorical.shape[1]` then fail with "tuple index out of range", so
        # always keep a column axis.
        if torch.is_tensor(categorical_data) and categorical_data.dim() == 1:
            categorical_data = categorical_data.unsqueeze(1)

        self.tabular_data = tabular_data
        self.categorical_data = categorical_data
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.targets = targets

    def __len__(self):
        """Number of samples, taken from ``targets``."""
        return len(self.targets)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a nested dict.

        Keys: ``'tabular'`` (with ``'continuous'`` and ``'categorical'``),
        ``'input_ids'``, ``'attention_mask'`` and ``'target'``.
        """
        return {
            'tabular': {
                'continuous': self.tabular_data[idx],
                'categorical': self.categorical_data[idx],
            },
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'target': self.targets[idx],
        }

# Separate categorical and continuous data
# Encode POS with ONE category set shared by both splits. Calling
# `.astype('category')` on each split independently derives the codes from that
# split's own values, so the same position could map to different integers in
# train and test.
pos_categories = pd.concat([train_df['POS'], test_df['POS']]).astype('category').cat.categories
pos_dtype = pd.CategoricalDtype(categories=pos_categories)
train_df['POS'] = train_df['POS'].astype(pos_dtype)
test_df['POS'] = test_df['POS'].astype(pos_dtype)

# NOTE(review): the fitted TabularModel presumably applied its own categorical
# encoding and continuous-feature scaling during `fit`. The raw codes/values
# built here may not match what its embedding layers were trained on; confirm.

# Categorical codes get an explicit column axis, shape (N, 1). A 1-D tensor has
# no `shape[1]`, which fails with "tuple index out of range" downstream.
train_categorical_data = torch.tensor(train_df['POS'].cat.codes.values, dtype=torch.long).unsqueeze(1)
train_continuous_data = torch.tensor(train_df[data_config.continuous_cols].values, dtype=torch.float32)
test_categorical_data = torch.tensor(test_df['POS'].cat.codes.values, dtype=torch.long).unsqueeze(1)
test_continuous_data = torch.tensor(test_df[data_config.continuous_cols].values, dtype=torch.float32)

# Create dataset instances

train_dataset = InjuryDataset(
    tabular_data=train_continuous_data,
    categorical_data=train_categorical_data,
    input_ids=train_reports_input_ids,
    attention_mask=train_reports_attention_mask,
    targets=torch.tensor(train_df[target].values, dtype=torch.float32)
)

test_dataset = InjuryDataset(
    tabular_data=test_continuous_data,
    categorical_data=test_categorical_data,
    input_ids=test_reports_input_ids,
    attention_mask=test_reports_attention_mask,
    targets=torch.tensor(test_df[target].values, dtype=torch.float32)
)

# Only the training loader shuffles; evaluation order stays fixed.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32)

# Training loop
for epoch in range(trainer_config.max_epochs):
    combined_model.train()
    for batch in train_loader:
        optimizer.zero_grad()

        # Move every tensor of the nested 'tabular' dict to the model's device and
        # pass THAT dict to the model. Passing `batch['tabular']` directly would
        # hand CPU tensors to a model that may live on the GPU.
        tabular_data = {
            name: tensor.to(device) for name, tensor in batch['tabular'].items()
        }

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Targets get a trailing axis so they line up with the (batch, 1) output.
        targets = batch['target'].to(device).unsqueeze(1)

        outputs = combined_model(tabular_data, input_ids, attention_mask)
        # Named `batch_loss` so the global `loss` (the nn.L1Loss passed to
        # `tabular_model.fit`) is not overwritten and the fit cell can be re-run.
        batch_loss = criterion(outputs, targets)
        batch_loss.backward()
        optimizer.step()

    # Reports the loss of the last batch of the epoch, not an epoch average.
    print(f"Epoch {epoch + 1}, Loss: {batch_loss.item()}")


INFO:lightning_fabric.utilities.seed:Seed set to 42
INFO:pytorch_tabular.tabular_model:Preparing the DataLoaders
INFO:pytorch_tabular.tabular_datamodule:Setting up the datamodule for regression task
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the ori

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=30` reached.
INFO:pytorch_lightning.tuner.lr_finder:LR finder stopped early after 30 steps due to diverging loss.
INFO:pytorch_lightning.tuner.lr_finder:Learning rate set to 1.7378008287493761e-06
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/drive/.shortcut-targets-by-id/1JIvKEjbFyoXJCJYSHeYkQm_kukfoyNBd/472 Project/.lr_find_7521f05e-996b-466a-9291-7fed840645c1.ckpt
INFO:pytorch_lightning.utilities.rank_zero:Restored all states from the checkpoint at /content/drive/.shortcut-targets-by-id/1JIvKEjbFyoXJCJYSHeYkQm_kukfoyNBd/472 Project/.lr_find_7521f05e-996b-466a-9291-7fed840645c1.ckpt
INFO:pytorch_tabular.tabular_model:Suggested LR: 1.7378008287493761e-06. For plot and detailed analysis, use `find_learning_rate` method.
INFO:pytorch_tabular.tabular_model:Training Started
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=30` reached.


INFO:pytorch_tabular.tabular_model:Training the model completed
INFO:pytorch_tabular.tabular_model:Loading the best model
  return torch.load(f, map_location=map_location)


IndexError: tuple index out of range

In [None]:
# Evaluation
combined_model.eval()
all_predictions, all_targets = [], []
with torch.no_grad():
    for batch in test_loader:
        # `batch['tabular']` is a dict of tensors and has no `.to()` method, so
        # each tensor inside it is moved to the device individually.
        tabular_data = {
            name: tensor.to(device) for name, tensor in batch['tabular'].items()
        }
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Targets are only collected for the metric, so they stay on the CPU.
        targets = batch['target']

        predictions = combined_model(tabular_data, input_ids, attention_mask).cpu().numpy()
        all_predictions.extend(predictions.flatten())
        all_targets.extend(targets.numpy().flatten())

mae = mean_absolute_error(all_targets, all_predictions)
print(f"Mean Absolute Error: {mae}")
