# All needed libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import StandardScaler
import gc 

In [None]:
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Show every column when a DataFrame is rendered (no "..." truncation).
pd.set_option('display.max_columns', None)

In [None]:
# Compact per-column dtypes applied when the competition tables are loaded.
dtypes_test = {
    'id': 'int32',
    'breath_id': 'int32',
    'R': 'int8',
    'C': 'int8',
    'time_step': 'float64',
    'u_in': 'float64',
    'u_out': 'int8',
}

# The train mapping is the test mapping plus the `pressure` target column.
dtypes_train = {**dtypes_test, 'pressure': 'float64'}

def read_train():
    """Load the competition training table with compact column dtypes.

    Returns
    -------
    pandas.DataFrame
        Contents of ``train.csv`` with columns typed per ``dtypes_train``.
    """
    # Typing the columns while parsing avoids materialising a default-width
    # (64-bit) frame first and then copying all of it with ``astype``.
    return pd.read_csv(
        '/kaggle/input/ventilator-pressure-prediction/train.csv',
        dtype=dtypes_train,
    )

def read_test():
    """Load the competition test table with compact column dtypes.

    Returns
    -------
    pandas.DataFrame
        Contents of ``test.csv`` with columns typed per ``dtypes_test``.
    """
    # Typing the columns while parsing avoids materialising a default-width
    # (64-bit) frame first and then copying all of it with ``astype``.
    return pd.read_csv(
        '/kaggle/input/ventilator-pressure-prediction/test.csv',
        dtype=dtypes_test,
    )

# Load both competition tables once; the following cells read these globals.
train = read_train()
test = read_test()

In [None]:
# Submission template; its `pressure` column is overwritten by later cells.
sample_submission = pd.read_csv('/kaggle/input/ventilator-pressure-prediction/sample_submission.csv')

# One line per table: caption followed by its (rows, columns) shape.
shape_report = (
    ("Train shape:", train),
    ("Test shape:", test),
    ("Sample submission shape:", sample_submission),
)
for caption, frame in shape_report:
    print(caption, frame.shape)

# Last expression of the cell: rendered by the notebook as a preview.
train.head()

In [None]:
# train.head(200000).describe().round(2)

In [None]:
# Column dtypes of the training table.
print(train.dtypes)

# Number of distinct breaths.
n_breaths = train['breath_id'].nunique()
print("\nUnique values of breath_id:", n_breaths)

# Distinct values taken by the R and C columns.
for caption, column in (("R values:", 'R'), ("C values:", 'C')):
    print(caption, train[column].unique())

In [None]:
# Rows of breath 500, plotted as three traces over time_step.
example = train.loc[train['breath_id'] == 500]

plt.figure(figsize=(12,6))
example_traces = (
    ('u_in', 'u_in (valve control)'),
    ('pressure', 'pressure'),
    ('u_out', 'u_out (exhalation phase)'),
)
for column, legend_label in example_traces:
    plt.plot(example['time_step'], example[column], label=legend_label)
plt.xlabel("time_step")
plt.ylabel("Value")
plt.title("Example 500 of a single breath")
plt.legend()
plt.show()

In [None]:
# Histogram (50 bins) of the target with a KDE curve overlaid.
plt.figure(figsize=(10, 6))
sns.histplot(
    data=train,
    x='pressure',
    bins=50,
    kde=True,
)
plt.title('Distribution of Target Variable (Pressure)')
plt.show()

In [None]:
# First five breath ids in order of appearance; one figure per breath.
sample_breaths = train['breath_id'].unique()[:5]

# (column, legend label, extra plot keyword arguments)
breath_traces = (
    ('u_in', 'u_in (Inspiratory Flow)', {}),
    ('u_out', 'u_out (Expiratory Phase)', {}),
    ('pressure', 'Pressure (Target)', {'linestyle': '--'}),
)

for breath_id in sample_breaths:
    breath_df = train[train['breath_id'] == breath_id]

    plt.figure(figsize=(12, 5))
    for column, legend_label, extra in breath_traces:
        plt.plot(breath_df['time_step'], breath_df[column], label=legend_label, **extra)

    plt.title(f'Profile for Breath ID: {breath_id}')
    plt.xlabel('Time Step')
    plt.ylabel('Value')
    plt.legend()
    plt.grid(True)
    plt.show()

# 1. First simple submission

# Submitting mean

In [None]:
# Constant baseline: predict the mean pressure of the rows where u_out == 0
# for every row of the submission file.
u_out_is_zero = train['u_out'] == 0
mean_pressure = train.loc[u_out_is_zero, 'pressure'].mean()

sample_submission['pressure'] = mean_pressure
sample_submission.to_csv('/kaggle/working/submission.csv', index=False)

print("Mean pressure used for prediction:", mean_pressure)

# Linear regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit only on rows where u_out == 0, then predict every test row.
features = ['u_in', 'time_step', 'R', 'C']
train_filtered = train[train['u_out'] == 0]

X_train = train_filtered[features]
y_train = train_filtered['pressure']
X_test = test[features]

# Ordinary least squares on the four raw input columns.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Overwrites the submission file written by the mean baseline.
sample_submission['pressure'] = y_pred
sample_submission.to_csv('/kaggle/working/submission.csv', index=False)

# Feature engineering

In [None]:
def add_feats(train):
    """Return the frame enriched with per-breath engineered features.

    Adds lag/lead values, differences, exponentially weighted and rolling
    statistics, per-breath aggregates, cumulative sums and one-hot encodings
    of ``R``, ``C`` and their combination.  ``pressure`` is never read, so
    the same function serves the train and the test table.

    Ideas taken from:
      https://www.kaggle.com/patrick0302/add-lag-u-in-as-new-feat
      https://www.kaggle.com/mst8823/google-brain-lightgbm-baseline
      https://www.kaggle.com/yasufuminakama/ventilator-pressure-lstm-starter
      https://www.kaggle.com/c/ventilator-pressure-prediction/discussion/273974

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``breath_id``, ``time_step``, ``u_in``, ``u_out``, ``R``
        and ``C``.  Lags and diffs follow row order, so rows are assumed to
        be in chronological order inside each breath -- TODO confirm for
        inputs other than the raw competition CSVs.

    Returns
    -------
    pandas.DataFrame
        A new frame without NaNs (gaps are filled with 0) in which ``R``,
        ``C`` and the helper ``R__C`` are replaced by dummy columns.

    Notes
    -----
    * Columns created before the first ``fillna(0)`` are also written into
      the frame passed in; use the returned frame.
    * ``15_out_std`` / ``45_out_std`` hold the rolling std of ``u_in``; the
      historical (misleading) names are kept so existing column lists stay
      valid.
    * ``ewm_u_in_corr`` is the EWM correlation of ``u_in`` with itself.
    """
    # `display` is injected by notebooks; fall back to `print` elsewhere so
    # the function also runs as plain Python.
    try:
        from IPython.display import display
    except ImportError:
        display = print

    # One groupby per frame instead of re-grouping for every feature.
    by_breath = train.groupby('breath_id')
    u_in_by_breath = by_breath['u_in']

    train['last_value_u_in'] = u_in_by_breath.transform('last')

    # Positive periods look back (lag), negative ones look ahead (lead);
    # the order reproduces the established column order.
    shift_specs = (
        ('lag1', 1), ('lag_back1', -1), ('lag2', 2), ('lag3', 3),
        ('lag_back2', -2), ('lag_back3', -3), ('lag_back10', -10),
    )
    for suffix, periods in shift_specs:
        for signal in ('u_in', 'u_out'):
            train[f'{signal}_{suffix}'] = by_breath[signal].shift(periods)

    ## time since last step
    train['time_step_diff'] = by_breath['time_step'].diff().fillna(0)

    ### exponentially weighted stats of u_in within the breath
    train['ewm_u_in_mean'] = u_in_by_breath.ewm(halflife=9).mean().reset_index(level=0, drop=True)
    train['ewm_u_in_std'] = u_in_by_breath.ewm(halflife=10).std().reset_index(level=0, drop=True)
    train['ewm_u_in_corr'] = u_in_by_breath.ewm(halflife=15).corr().reset_index(level=0, drop=True)

    ## rolling windows of 15 and 45 steps over u_in
    rolling_stats = (
        ('in_sum', 'sum'), ('in_min', 'min'), ('in_max', 'max'),
        ('in_mean', 'mean'),
        ('out_std', 'std'),  # std of u_in despite the column name (see Notes)
    )
    for window in (15, 45):
        rolling_u_in = u_in_by_breath.rolling(window=window, min_periods=1)
        for suffix, stat in rolling_stats:
            # The result is indexed by (breath_id, row); dropping the group
            # level lets the assignment align on the original row index.
            train[f'{window}_{suffix}'] = getattr(rolling_u_in, stat)().reset_index(level=0, drop=True)

    train['15_out_mean'] = (
        by_breath['u_out'].rolling(window=15, min_periods=1).mean().reset_index(level=0, drop=True)
    )

    print(train.shape[0])
    display(train)
    # Breath-edge gaps of the shifted columns (and undefined first-row
    # statistics) become 0 before any difference is computed.
    train = train.fillna(0) # ORIG
    by_breath = train.groupby('breath_id')
    u_in_by_breath = by_breath['u_in']

    # max, mean, min value of u_in for each breath.  `transform` broadcasts
    # the aggregate back to every row; a plain `.mean()` would be indexed by
    # breath_id and get misaligned against the row index on assignment.
    train['breath_id__u_in__max'] = u_in_by_breath.transform('max')
    train['breath_id__u_in__mean'] = u_in_by_breath.transform('mean')
    train['breath_id__u_in__min'] = u_in_by_breath.transform('min')

    train['R_div_C'] = train["R"].div(train["C"])
    train['R__C'] = train["R"].astype(str) + '__' + train["C"].astype(str)

    # difference between the current value and the 1/2/3-step lags
    for step in (1, 2, 3):
        for signal in ('u_in', 'u_out'):
            train[f'{signal}_diff{step}'] = train[signal] - train[f'{signal}_lag{step}']

    ## diff between last 2 steps / next 2 steps
    train['u_in_diff_1_2'] = train['u_in_lag1'] - train['u_in_lag2']
    train['u_out_diff_1_2'] = train['u_out_lag1'] - train['u_out_lag2']
    train['u_in_lagback_diff_1_2'] = train['u_in_lag_back1'] - train['u_in_lag_back2']
    train['u_out_lagback_diff_1_2'] = train['u_out_lag_back1'] - train['u_out_lag_back2']

    # difference between the current value and the 1/2-step leads
    for step in (1, 2):
        for signal in ('u_in', 'u_out'):
            train[f'{signal}_lagback_diff{step}'] = train[signal] - train[f'{signal}_lag_back{step}']

    # A breath's first step has no predecessor (its lag was filled with 0),
    # so its one-step difference is set to 0 instead of echoing the raw
    # value.  Targets the real `*_diff1` columns; writing to non-existent
    # `u_in_diff` / `u_out_diff` only produced two all-zero columns.
    first_step = train['time_step'] == 0
    train.loc[first_step, ['u_in_diff1', 'u_out_diff1']] = 0

    # difference between the breath's max / mean of u_in and the current value
    train['breath_id__u_in__diffmax'] = train['breath_id__u_in__max'] - train['u_in']
    train['breath_id__u_in__diffmean'] = train['breath_id__u_in__mean'] - train['u_in']

    print("before OHE")
    display(train)

    # OHE: one call appends the R_*, C_* and R__C_* dummies (in that order)
    # and drops the encoded source columns.
    train = pd.get_dummies(train, columns=['R', 'C', 'R__C'])
    by_breath = train.groupby('breath_id')

    # running totals within the breath
    train['u_in_cumsum'] = by_breath['u_in'].cumsum()
    train['time_step_cumsum'] = by_breath['time_step'].cumsum()

    # total u_in of the breath, separately for each u_out state
    train['u_in_partition_out_sum'] = train.groupby(['breath_id', "u_out"])['u_in'].transform("sum")

    train = train.fillna(0) # same NaN handling for train and test

    return train

# Adding new features into the dataset

In [None]:
# Reload the raw tables and run both through the same feature pipeline.
# NOTE(review): this re-reads the CSVs with pandas' default dtypes instead of
# reusing read_train()/read_test() -- confirm that is intended.
train = pd.read_csv("../input/ventilator-pressure-prediction/train.csv")
test = pd.read_csv("../input/ventilator-pressure-prediction/test.csv")

print("Engineering features for training data...")
train = add_feats(train)

print("Engineering features for test data...")
test = add_feats(test)

# Model inputs: every column except the identifiers and the target.
non_feature_columns = {'id', 'breath_id', 'pressure'}
columns = [name for name in train.columns if name not in non_feature_columns]
target = train["pressure"]

# LGBM

In [None]:
# Cross-validation setup: breaths are kept whole by grouping on breath_id.
n_splits = 2
folds = GroupKFold(n_splits=n_splits)

# Per-fold results collected by the training loop below.
scores, all_train_losses, all_val_losses = [], [], []

# `folds.split` yields positional row numbers, so rows are selected with
# `.iloc` (label-based `.loc` is only equivalent while the index happens to
# be 0..n-1).  Feature columns are resolved to positions once up front.
feature_pos = train.columns.get_indexer(columns)

for fold, (train_idx, val_idx) in enumerate(folds.split(train, target, groups=train['breath_id'])):
    print(f"--- Fold {fold} ---")

    X_train, y_train = train.iloc[train_idx, feature_pos], target.iloc[train_idx]
    X_val, y_val = train.iloc[val_idx, feature_pos], target.iloc[val_idx]

    # L1 objective matches the MAE used for evaluation below.
    model = lgb.LGBMRegressor(
        objective='regression_l1',
        n_estimators=500,
        learning_rate=0.05,
        n_jobs=-1,
        random_state=42
    )

    # Both sets are evaluated every round so train/validation curves can be
    # compared; their histories are read back from `evals_result_` under the
    # 'training' and 'valid_1' keys.
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        eval_metric='mae',
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=False),
            lgb.log_evaluation(period=100)
        ]
    )
    results = model.evals_result_
    all_train_losses.append(results['training']['l1'])
    all_val_losses.append(results['valid_1']['l1'])

    # Out-of-fold MAE for this fold.
    preds = model.predict(X_val)
    score = mean_absolute_error(y_val, preds)
    scores.append(score)
    print(f"Fold {fold} MAE: {score:.4f}\n")

print(f"Average CV MAE with Engineered Features: {np.mean(scores):.4f} (+/- {np.std(scores):.4f})")

# Early stopping can end folds after different numbers of rounds, which
# makes the per-fold curves ragged; trim them to the shortest one so they
# can be averaged round by round.
n_rounds = min((len(curve) for curve in all_train_losses + all_val_losses), default=0)
mean_train_loss = np.mean([curve[:n_rounds] for curve in all_train_losses], axis=0)
mean_val_loss = np.mean([curve[:n_rounds] for curve in all_val_losses], axis=0)

plt.figure(figsize=(12, 7))
plt.plot(mean_train_loss, label='Average Training Loss')
plt.plot(mean_val_loss, label='Average Validation Loss')
# Report the number of rounds actually plotted rather than a fixed figure.
plt.title(f'Average Training & Validation Loss Across All Folds (first {n_rounds} rounds)')
plt.xlabel('Boosting Round')
plt.ylabel('Loss (MAE)')
plt.legend()
plt.grid(True)
plt.show()

# MLP in PyTorch

In [None]:
# Setup device 
# Prefer the GPU when PyTorch can see one, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# PyTorch Dataset
class VentilatorDataset(Dataset):
    """Row-wise dataset: one sample is a feature vector and its target.

    Both inputs are copied into float32 tensors at construction time.
    """

    def __init__(self, features, labels):
        to_float32 = dict(dtype=torch.float32)
        self.features = torch.tensor(features, **to_float32)
        self.labels = torch.tensor(labels, **to_float32)

    def __len__(self):
        # Number of samples = size of the leading feature dimension.
        return self.features.shape[0]

    def __getitem__(self, idx):
        sample = self.features[idx]
        # Append a trailing axis so the target is shaped like the model's
        # single-unit output.
        target_value = self.labels[idx].unsqueeze(-1)
        return sample, target_value

class CustomLinearLayer(nn.Module):
    """Hand-written fully connected layer computing ``x @ weight + bias``.

    Parameters
    ----------
    in_features : int
        Size of the last dimension of the input.
    out_features : int
        Size of the last dimension of the output.

    Attributes
    ----------
    weight : nn.Parameter of shape (in_features, out_features)
    bias : nn.Parameter of shape (out_features,)
    """

    def __init__(self, in_features, out_features):
        super().__init__()
        # Fan-in scaled uniform init, U(-1/sqrt(in), 1/sqrt(in)) -- the same
        # range nn.Linear uses.  Unscaled N(0, 1) weights make the output
        # scale grow with sqrt(in_features), which hampers early training.
        bound = in_features ** -0.5 if in_features > 0 else 0.0
        self.weight = nn.Parameter(
            torch.empty(in_features, out_features).uniform_(-bound, bound)
        )
        self.bias = nn.Parameter(torch.empty(out_features).uniform_(-bound, bound))

    def forward(self, x):
        """Apply the affine map to ``x`` (last dimension ``in_features``)."""
        return x @ self.weight + self.bias

# MLP Model 
class MLP(nn.Module):
    """Two-hidden-layer regressor: input_dim -> 128 -> 64 -> 1.

    Each hidden stage is linear -> ReLU -> BatchNorm1d -> Dropout(0.2); the
    first linear map is the hand-written ``CustomLinearLayer``.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_1, hidden_2, drop_p = 128, 64, 0.2
        stack = [
            CustomLinearLayer(input_dim, hidden_1),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_1),
            nn.Dropout(drop_p),
            nn.Linear(hidden_1, hidden_2),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_2),
            nn.Dropout(drop_p),
            nn.Linear(hidden_2, 1),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the stack and return the single-unit output."""
        prediction = self.layers(x)
        return prediction
                
# Cross-validation setup: breaths are kept whole by grouping on breath_id.
n_splits = 2
folds = GroupKFold(n_splits=n_splits)

# Per-fold results collected by the training loop below.
scores, all_train_losses, all_val_losses = [], [], []

# Training hyperparameters.
BATCH_SIZE, EPOCHS = 512, 5
LEARNING_RATE = 1e-3
# Early-stopping patience in epochs; with PATIENCE larger than EPOCHS the
# stop condition cannot be reached.
PATIENCE = 10

# `folds.split` yields positional row numbers, so features and target are
# both selected with `.iloc`; feature columns are resolved to positions once.
feature_pos = train.columns.get_indexer(columns)
# Pinned host memory only helps host-to-GPU copies; skip it on CPU runs.
use_pinned_memory = device.type == "cuda"

for fold, (train_idx, val_idx) in enumerate(folds.split(train, target, groups=train['breath_id'])):
    print(f"--- Fold {fold} ---")

    # Split data
    X_train, y_train = train.iloc[train_idx, feature_pos], target.iloc[train_idx]
    X_val, y_val = train.iloc[val_idx, feature_pos], target.iloc[val_idx]

    # Scale features: statistics come from the training fold only
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    # datasets and dataLoaders
    train_dataset = VentilatorDataset(X_train, y_train.values)
    val_dataset = VentilatorDataset(X_val, y_val.values)
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=2,
        pin_memory=use_pinned_memory
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=2,
        pin_memory=use_pinned_memory
    )

    # model, loss and optimizer
    model = MLP(input_dim=X_train.shape[1]).to(device)
    criterion = nn.L1Loss() # MAE Loss
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    train_losses_fold = []
    val_losses_fold = []

    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(EPOCHS):
        model.train()
        train_loss = 0.0
        # Batch variables are named batch_x/batch_y so the notebook-level
        # `features` list is not overwritten.
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            # Weight each batch mean by its size so the epoch figure is a
            # true per-sample MAE even when the last batch is smaller.
            train_loss += loss.item() * batch_y.size(0)

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)
                val_loss += loss.item() * batch_y.size(0)

        avg_train_loss = train_loss / len(train_dataset)
        avg_val_loss = val_loss / len(val_dataset)

        train_losses_fold.append(avg_train_loss)
        val_losses_fold.append(avg_val_loss)

        print(f"Epoch {epoch+1}/{EPOCHS}, Train MAE: {avg_train_loss:.4f}, Val MAE: {avg_val_loss:.4f}")

        # Early stopping on the validation MAE.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= PATIENCE:
                print("Early stopping triggered.")
                break

    all_train_losses.append(train_losses_fold)
    all_val_losses.append(val_losses_fold)

    scores.append(best_val_loss)
    print(f"Fold {fold} Best MAE: {best_val_loss:.4f}\n")

    # memory cleaning: the datasets hold the tensor copies and the optimizer
    # references the model parameters, so they must be dropped as well
    del model, optimizer, train_dataset, val_dataset
    del X_train, y_train, X_val, y_val, train_loader, val_loader
    gc.collect()
    torch.cuda.empty_cache()

print(f"Average CV MAE with PyTorch MLP: {np.mean(scores):.4f} (+/- {np.std(scores):.4f})")

# Folds that stop early record fewer epochs, which makes the per-fold
# curves ragged; trim them to the shortest one so they can be averaged
# epoch by epoch.
n_epochs_common = min((len(curve) for curve in all_train_losses + all_val_losses), default=0)
mean_train_loss = np.mean([curve[:n_epochs_common] for curve in all_train_losses], axis=0)
mean_val_loss = np.mean([curve[:n_epochs_common] for curve in all_val_losses], axis=0)

plt.figure(figsize=(12, 7))
plt.plot(mean_train_loss, label='Average Training Loss')
plt.plot(mean_val_loss, label='Average Validation Loss')
plt.title('Average Training & Validation Loss Across All Folds')
plt.xlabel('Epoch')
plt.ylabel('Loss (MAE)')
plt.legend()
plt.grid(True)
plt.show()

# LSTM in PyTorch

## Training a model to define the most important features

In [None]:
# Full training table: this model is fit only to rank features, so no
# validation split is made here.
X, y = train[columns], target

print("Training a baseline model to get feature importances...")
baseline_params = {
    'n_estimators': 500,
    'learning_rate': 0.1,
    'random_state': 42,
    'n_jobs': -1,
}
baseline_model = lgb.LGBMRegressor(**baseline_params)

baseline_model.fit(X, y)
print("Baseline model training complete.")

In [None]:
# One row per feature, most important first.
importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': baseline_model.feature_importances_
})
feature_importance_df = importance_table.sort_values('importance', ascending=False)

# Show both ends of the ranking.
ranking_views = (
    ("Top 10 most important features:", feature_importance_df.head(10)),
    ("\nTop 10 least important features:", feature_importance_df.tail(10)),
)
for heading, view in ranking_views:
    print(heading)
    display(view)

## Selecting the most important features 

In [None]:
# Keep the names of the features whose baseline importance is above 85.
is_important = feature_importance_df['importance'] > 85
important_features = feature_importance_df.loc[is_important, 'feature'].tolist()

In [None]:
# Alias: the sequence-model cells below read the feature list under this name.
columns_selected = important_features

In [None]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


# Data preprocessing for sequences
# Standardise the selected features in place, before reshaping.
# NOTE(review): the scaler is fit on all rows before the fold split below, so
# validation rows contribute to the scaling statistics.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(train[columns_selected])
train[columns_selected] = scaled_values

# Cut the flat table into one 80-step sequence per breath.
# NOTE(review): assumes every breath has exactly 80 consecutive rows -- confirm.
num_breaths = len(train) // 80
sequence_shape = (num_breaths, 80, -1)
features_reshaped = train[columns_selected].to_numpy().reshape(sequence_shape)
targets_reshaped = target.to_numpy().reshape(num_breaths, 80, 1)
# One group label per sequence, in order of first appearance.
groups = train['breath_id'].unique()


# PyTorch dataset for sequences 
class VentilatorLSTMDataset(Dataset):
    """Sequence dataset: one sample is a whole feature/target sequence pair.

    Both inputs are copied into float32 tensors at construction time and
    indexed along their leading dimension.
    """

    def __init__(self, features, labels):
        to_float32 = dict(dtype=torch.float32)
        self.features = torch.tensor(features, **to_float32)
        self.labels = torch.tensor(labels, **to_float32)

    def __len__(self):
        # Number of sequences = size of the leading feature dimension.
        return self.features.shape[0]

    def __getitem__(self, idx):
        sequence = self.features[idx]
        sequence_targets = self.labels[idx]
        return sequence, sequence_targets


# LSTM model 
class LSTMModel(nn.Module):
    """Bidirectional LSTM that emits one prediction per timestep.

    Parameters
    ----------
    input_dim : int
        Number of features per timestep.
    hidden_dim : int
        Hidden size of each LSTM direction.
    num_layers : int
        Number of stacked LSTM layers (dropout 0.2 between layers).
    """

    def __init__(self, input_dim, hidden_dim=128, num_layers=2):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.2,
            bidirectional=True,
        )
        # Forward and backward states are concatenated, hence 2 * hidden_dim.
        self.fc = nn.Linear(2 * hidden_dim, 1)

    def forward(self, x):
        """Map (batch, seq_len, input_dim) to (batch, seq_len, 1)."""
        # Only the per-timestep outputs are needed; the final hidden/cell
        # states are discarded.
        per_step_states = self.lstm(x)[0]
        # The linear head is applied to every timestep independently.
        return self.fc(per_step_states)

# Cross-validation setup: one group per breath sequence.
n_splits = 2
folds = GroupKFold(n_splits=n_splits)
scores = []

# Training hyperparameters (batch size counts sequences, not rows).
BATCH_SIZE, EPOCHS = 64, 10
LEARNING_RATE = 1e-3
# Early-stopping patience in epochs.
PATIENCE = 10

for fold, (train_idx, val_idx) in enumerate(folds.split(features_reshaped, targets_reshaped, groups=groups)):
    print(f"--- Fold {fold} ---")

    # Splitting reshaped data (indices address whole breath sequences)
    X_train, y_train = features_reshaped[train_idx], targets_reshaped[train_idx]
    X_val, y_val = features_reshaped[val_idx], targets_reshaped[val_idx]

    # datasets and dataLoaders
    train_dataset = VentilatorLSTMDataset(X_train, y_train)
    val_dataset = VentilatorLSTMDataset(X_val, y_val)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=1)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=1)

    # model, loss and optimizer
    model = LSTMModel(input_dim=X_train.shape[2]).to(device)
    criterion = nn.L1Loss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    train_losses_fold = []
    val_losses_fold = []

    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(EPOCHS):
        model.train()
        train_loss = 0.0
        # Batch variables are named batch_x/batch_y so the notebook-level
        # `features` list is not overwritten.
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            # Weight each batch mean by its number of sequences so the epoch
            # figure is a true MAE even when the last batch is smaller.
            train_loss += loss.item() * batch_y.size(0)

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)
                val_loss += loss.item() * batch_y.size(0)

        avg_train_loss = train_loss / len(train_dataset)
        avg_val_loss = val_loss / len(val_dataset)

        train_losses_fold.append(avg_train_loss)
        val_losses_fold.append(avg_val_loss)

        print(f"Epoch {epoch+1}/{EPOCHS}, Train MAE: {avg_train_loss:.4f}, Val MAE: {avg_val_loss:.4f}")

        # Early stopping on the validation MAE.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= PATIENCE:
                print("Early stopping triggered.")
                break

    scores.append(best_val_loss)
    print(f"Fold {fold} Best MAE: {best_val_loss:.4f}\n")

    # Per-fold learning curves, with epochs labelled from 1.
    plt.figure(figsize=(10, 6))
    plt.plot(train_losses_fold, label='Training Loss')
    plt.plot(val_losses_fold, label='Validation Loss')
    plt.title(f'Fold {fold} - Training & Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss (MAE)')
    plt.legend()
    plt.grid(True)
    epochs_run = len(train_losses_fold)
    plt.xticks(np.arange(epochs_run), np.arange(1, epochs_run + 1))
    plt.show()

    # memory cleaning: the datasets hold the tensor copies and the optimizer
    # references the model parameters, so they must be dropped as well
    del model, optimizer, train_dataset, val_dataset
    del X_train, y_train, X_val, y_val, train_loader, val_loader
    gc.collect()
    torch.cuda.empty_cache()

print(f"Average CV MAE with PyTorch LSTM: {np.mean(scores):.4f} (+/- {np.std(scores):.4f})")