In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Load the raw dataset (path is relative to the notebook's working directory).
data = pd.read_csv('Breast_Cancer.csv')

# Work on a copy so the raw frame stays untouched.
data_encoded = data.copy()

# Encode the outcome as an integer code: 'Alive' -> 0, 'Dead' -> 1.
# Any value outside these two categories (including missing values) gets code -1.
data_encoded['Status'] = data_encoded['Status'].astype(pd.CategoricalDtype(categories=['Alive', 'Dead'], ordered=True))
data_encoded['Status'] = data_encoded['Status'].cat.codes

# Divide features into numeric and categorical ones (after encoding 'Status').
# Every numeric dtype is selected, not only int8/int64: an integer column that
# contains missing values is read as float64, and it must still be imputed and
# scaled as a number instead of being one-hot encoded as a category.
numeric_features = list(data_encoded.select_dtypes(include='number').columns)
categorical_features = [item for item in data_encoded.columns if item not in numeric_features]

# Create transformers for data preprocessing
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Processing of missing values
    ('scaler', StandardScaler())  # Scaling of numerical features
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Processing of missing values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Converting categorical variables to binary numeric ones
])

# sparse_threshold=0 forces a dense ndarray regardless of how sparse the one-hot
# block is; pd.DataFrame below cannot be built from a SciPy sparse matrix.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    sparse_threshold=0)

# Apply transformers to the data.
# NOTE(review): the scaler is fitted on the full dataset, and the numeric columns
# include 'Status' and 'Survival Months', so any target taken from this frame is
# standardised with statistics that also cover rows later used for testing.
data_preprocessed = preprocessor.fit_transform(data_encoded)

# encoded categorical feature names after one-hot encoding
encoded_categorical_feature_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(input_features=categorical_features)

# Combine numeric and categorical feature names (same order as the transformer output)
encoded_feature_names = numeric_features + list(encoded_categorical_feature_names)

# Verify the shapes match
print(f"Shape of data_preprocessed: {data_preprocessed.shape}")
print(f"Number of encoded feature names: {len(encoded_feature_names)}")

# Check details of generated features
print(f"Numeric features after preprocessing: {numeric_features}")
print(f"Categorical features after one-hot encoding: {list(encoded_categorical_feature_names)}")

# Fail loudly on a column/name mismatch instead of printing a message and then
# crashing inside the DataFrame constructor anyway.
if data_preprocessed.shape[1] != len(encoded_feature_names):
    raise ValueError(f"Mismatch! Data shape: {data_preprocessed.shape}, Feature names length: {len(encoded_feature_names)}")

# Creating a DataFrame from encoded data (built once)
data_preprocessed_df = pd.DataFrame(data_preprocessed, columns=encoded_feature_names)
print(data_preprocessed_df.head())


Shape of data_preprocessed: (4024, 40)
Number of encoded feature names: 40
Numeric features after preprocessing: ['Age', 'Tumor Size', 'Regional Node Examined', 'Reginol Node Positive', 'Survival Months', 'Status']
Categorical features after one-hot encoding: ['Race_Black', 'Race_Other', 'Race_White', 'Marital Status_Divorced', 'Marital Status_Married', 'Marital Status_Separated', 'Marital Status_Single ', 'Marital Status_Widowed', 'T Stage _T1', 'T Stage _T2', 'T Stage _T3', 'T Stage _T4', 'N Stage_N1', 'N Stage_N2', 'N Stage_N3', '6th Stage_IIA', '6th Stage_IIB', '6th Stage_IIIA', '6th Stage_IIIB', '6th Stage_IIIC', 'differentiate_Moderately differentiated', 'differentiate_Poorly differentiated', 'differentiate_Undifferentiated', 'differentiate_Well differentiated', 'Grade_ anaplastic; Grade IV', 'Grade_1', 'Grade_2', 'Grade_3', 'A Stage_Distant', 'A Stage_Regional', 'Estrogen Status_Negative', 'Estrogen Status_Positive', 'Progesterone Status_Negative', 'Progesterone Status_Positive'

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

# Fully connected feed-forward network: three ReLU hidden layers and a single linear output
class SurvivalPredictionNN(nn.Module):
    """Multilayer perceptron with three ReLU hidden layers and one linear output unit.

    Args:
        input_size: Number of input features per sample.
        hidden_size1: Width of the first hidden layer.
        hidden_size2: Width of the second hidden layer.
        hidden_size3: Width of the third hidden layer.
    """

    def __init__(self, input_size, hidden_size1, hidden_size2, hidden_size3):
        super().__init__()
        # Layers are created in order fc1..fc4, each mapping the previous width to the next.
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size1)
        self.fc2 = nn.Linear(in_features=hidden_size1, out_features=hidden_size2)
        self.fc3 = nn.Linear(in_features=hidden_size2, out_features=hidden_size3)
        self.fc4 = nn.Linear(in_features=hidden_size3, out_features=1)

    def forward(self, x):
        """Return the network output for ``x``; the last dimension of the result has size 1."""
        # ReLU follows every hidden layer; the output layer stays linear.
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            x = torch.relu(hidden_layer(x))
        return self.fc4(x)

# Define the function to train the model with different subsets of data
def RegressionNN(feature, data, train_sizes, batch_size=32, hidden_size1=64, hidden_size2=32, hidden_size3=16, lr=0.0001, num_epochs=30, seed=None):
    """Train one ``SurvivalPredictionNN`` per training-set fraction and score each on a fixed test set.

    ``data`` is split once (``random_state=42``) into 75% train+validation and 25%
    test; 13.33% of the train+validation part is then held out for validation.
    For every entry of ``train_sizes`` a new model is trained with Adam and MSE
    loss on that fraction of the remaining training pool. The validation loss is
    printed after each epoch, and MSE / RMSE / MAE are computed on the test set
    once training ends.

    Args:
        feature: Name of the target column in ``data``.
        data: DataFrame holding the target column and the model inputs; every
            column is converted to float32.
        train_sizes: Iterable of fractions in ``(0, 1]`` of the training pool.
            ``1.0`` trains on the whole pool.
        batch_size: Mini-batch size for all data loaders.
        hidden_size1: Width of the first hidden layer.
        hidden_size2: Width of the second hidden layer.
        hidden_size3: Width of the third hidden layer.
        lr: Learning rate passed to Adam.
        num_epochs: Number of passes over the training subset.
        seed: Optional integer. When given, ``torch.manual_seed(seed)`` is called
            before each model is built, so every run starts from the same weights
            and batch order (GPU kernels may still add small nondeterminism).
            ``None`` leaves the global RNG untouched, as before.

    Returns:
        A list with one ``(train_size, mse, rmse, mae)`` tuple per entry of
        ``train_sizes``, in the same order. Errors are in the units of
        ``data[feature]``.

    Raises:
        ValueError: If a fraction is outside ``(0, 1]`` or selects no rows.
    """
    print('Target Variable -', feature)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def _to_tensor(frame, as_column=False):
        # Convert a pandas object to a float32 tensor on the target device.
        tensor = torch.tensor(frame.values.astype(np.float32))
        if as_column:
            tensor = tensor.reshape(-1, 1)
        return tensor.to(device)

    # Separate the target column from the rest of the data
    X = data.drop(columns=feature)
    y = data[feature]

    # Initial split into 75% training data and 25% test data
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    # From the remaining 75%, allocate 10% of the original dataset to validation (which is 13.33% of the 75%)
    X_train_full, X_val, y_train_full, y_val = train_test_split(X_train_val, y_train_val, test_size=0.1333, random_state=42)

    # Validation and test data do not depend on the training fraction, so their
    # tensors and loaders are built once, outside the loop.
    X_test = _to_tensor(X_test)
    y_test = _to_tensor(y_test, as_column=True)
    X_val = _to_tensor(X_val)
    y_val = _to_tensor(y_val, as_column=True)

    val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=batch_size, shuffle=False)
    y_test_np = y_test.cpu().numpy()

    pool_size = len(X_train_full)
    results = []
    for train_size in train_sizes:
        print(f'\nTraining with {train_size*100}% of the data:')

        if not 0 < train_size <= 1:
            raise ValueError(f"train_size must be in (0, 1], got {train_size!r}")
        current_train_size = int(pool_size * train_size)
        if current_train_size < 1:
            raise ValueError(
                f"train_size={train_size!r} selects no rows from a training pool of {pool_size}"
            )

        if current_train_size >= pool_size:
            # train_test_split rejects a train size that leaves nothing for the
            # complementary part, so the full pool is used directly.
            X_train, y_train = X_train_full, y_train_full
        else:
            # Draw the subset for the current size
            X_train, _, y_train, _ = train_test_split(X_train_full, y_train_full, train_size=current_train_size, random_state=42)

        # Convert training data to PyTorch tensors and move to device
        X_train = _to_tensor(X_train)
        y_train = _to_tensor(y_train, as_column=True)
        train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)

        if seed is not None:
            # Reseed before building the model so each fraction starts from the
            # same initial weights and shuffle sequence.
            torch.manual_seed(seed)

        # Initializing the model and the optimizer
        input_size = X_train.shape[1]
        model = SurvivalPredictionNN(input_size, hidden_size1, hidden_size2, hidden_size3).to(device)
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=lr)

        # Training with validation
        for _ in tqdm(range(num_epochs), desc=f"Epochs (Train Size: {train_size*100}%)"):
            model.train()
            # Batches are already on `device` because the source tensors are.
            for inputs, labels in train_loader:
                optimizer.zero_grad()
                loss = criterion(model(inputs), labels)
                loss.backward()
                optimizer.step()

            # Validation: sample-weighted mean of the per-batch MSE
            model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for val_inputs, val_labels in val_loader:
                    val_outputs = model(val_inputs)
                    val_loss += criterion(val_outputs, val_labels).item() * val_inputs.size(0)
            val_loss /= len(val_loader.dataset)
            print(f'Validation Loss: {val_loss:.4f}')

        # Evaluate on test data after training
        model.eval()
        test_loss = 0.0
        predictions = []
        with torch.no_grad():
            for inputs, labels in test_loader:
                outputs = model(inputs)
                test_loss += criterion(outputs, labels).item() * inputs.size(0)
                predictions.extend(outputs.squeeze(1).cpu().tolist())

        test_loss /= len(test_loader.dataset)
        print(f'Test Loss: {test_loss:.4f}')

        predictions_np = np.array(predictions)
        mse = mean_squared_error(y_test_np, predictions_np)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test_np, predictions_np)

        print(f'MSE: {mse:.4f}, RMSE: {rmse:.4f}, MAE: {mae:.4f}')

        # Collect results
        results.append((train_size, mse, rmse, mae))

    return results



# Define train sizes and target feature
# Target column to predict.
feature = 'Survival Months'

# Fractions of the training pool (the rows left after RegressionNN sets aside its
# test and validation parts) to train on, one model per entry.
# NOTE(review): the last value is presumably kept just below 1.0 on purpose -
# confirm whether training on the complete pool is what is intended.
train_sizes = [0.2, 0.4, 0.6, 0.8, 0.999668655]

# Use the preprocessed frame built in the preprocessing cell as the model input.
data = data_preprocessed_df

# Train one model per fraction and collect (train_size, mse, rmse, mae) tuples.
results = RegressionNN(feature=feature, data=data, train_sizes=train_sizes)


cuda
Target Variable - Survival Months

Training with 20.0% of the data:


Epochs (Train Size: 20.0%):  19%|█▊        | 13/70 [00:00<00:01, 29.50it/s]

Epoch [10/70], Loss: 0.9303


Epochs (Train Size: 20.0%):  36%|███▌      | 25/70 [00:00<00:01, 31.98it/s]

Epoch [20/70], Loss: 0.7792


Epochs (Train Size: 20.0%):  47%|████▋     | 33/70 [00:01<00:01, 33.22it/s]

Epoch [30/70], Loss: 0.6924


Epochs (Train Size: 20.0%):  64%|██████▍   | 45/70 [00:01<00:00, 33.70it/s]

Epoch [40/70], Loss: 0.6697


Epochs (Train Size: 20.0%):  81%|████████▏ | 57/70 [00:01<00:00, 34.13it/s]

Epoch [50/70], Loss: 0.6565


Epochs (Train Size: 20.0%):  93%|█████████▎| 65/70 [00:02<00:00, 34.03it/s]

Epoch [60/70], Loss: 0.6441


Epochs (Train Size: 20.0%): 100%|██████████| 70/70 [00:02<00:00, 31.96it/s]


Epoch [70/70], Loss: 0.6336
Test Loss: 0.7487
MSE: 0.7487, RMSE: 0.8653, MAE: 0.7080

Training with 40.0% of the data:


Epochs (Train Size: 40.0%):  17%|█▋        | 12/70 [00:00<00:03, 16.99it/s]

Epoch [10/70], Loss: 0.9034


Epochs (Train Size: 40.0%):  31%|███▏      | 22/70 [00:01<00:02, 17.24it/s]

Epoch [20/70], Loss: 0.7728


Epochs (Train Size: 40.0%):  46%|████▌     | 32/70 [00:01<00:02, 17.11it/s]

Epoch [30/70], Loss: 0.7602


Epochs (Train Size: 40.0%):  60%|██████    | 42/70 [00:02<00:01, 17.21it/s]

Epoch [40/70], Loss: 0.7518


Epochs (Train Size: 40.0%):  74%|███████▍  | 52/70 [00:03<00:01, 17.18it/s]

Epoch [50/70], Loss: 0.7422


Epochs (Train Size: 40.0%):  89%|████████▊ | 62/70 [00:03<00:00, 17.13it/s]

Epoch [60/70], Loss: 0.7330


Epochs (Train Size: 40.0%): 100%|██████████| 70/70 [00:04<00:00, 17.08it/s]


Epoch [70/70], Loss: 0.7243
Test Loss: 0.7339
MSE: 0.7339, RMSE: 0.8567, MAE: 0.7036

Training with 60.0% of the data:


Epochs (Train Size: 60.0%):  16%|█▌        | 11/70 [00:00<00:05, 11.23it/s]

Epoch [10/70], Loss: 0.8244


Epochs (Train Size: 60.0%):  30%|███       | 21/70 [00:01<00:04, 11.34it/s]

Epoch [20/70], Loss: 0.7744


Epochs (Train Size: 60.0%):  44%|████▍     | 31/70 [00:02<00:03, 11.15it/s]

Epoch [30/70], Loss: 0.7644


Epochs (Train Size: 60.0%):  59%|█████▊    | 41/70 [00:03<00:02, 11.11it/s]

Epoch [40/70], Loss: 0.7556


Epochs (Train Size: 60.0%):  73%|███████▎  | 51/70 [00:04<00:01, 11.04it/s]

Epoch [50/70], Loss: 0.7469


Epochs (Train Size: 60.0%):  87%|████████▋ | 61/70 [00:05<00:00, 11.14it/s]

Epoch [60/70], Loss: 0.7377


Epochs (Train Size: 60.0%): 100%|██████████| 70/70 [00:06<00:00, 11.12it/s]


Epoch [70/70], Loss: 0.7286
Test Loss: 0.7365
MSE: 0.7365, RMSE: 0.8582, MAE: 0.7085

Training with 80.0% of the data:


Epochs (Train Size: 80.0%):  16%|█▌        | 11/70 [00:01<00:07,  8.32it/s]

Epoch [10/70], Loss: 0.7847


Epochs (Train Size: 80.0%):  30%|███       | 21/70 [00:02<00:05,  8.26it/s]

Epoch [20/70], Loss: 0.7686


Epochs (Train Size: 80.0%):  44%|████▍     | 31/70 [00:03<00:04,  8.37it/s]

Epoch [30/70], Loss: 0.7583


Epochs (Train Size: 80.0%):  59%|█████▊    | 41/70 [00:04<00:03,  8.39it/s]

Epoch [40/70], Loss: 0.7503


Epochs (Train Size: 80.0%):  73%|███████▎  | 51/70 [00:06<00:02,  8.08it/s]

Epoch [50/70], Loss: 0.7392


Epochs (Train Size: 80.0%):  87%|████████▋ | 61/70 [00:07<00:01,  8.32it/s]

Epoch [60/70], Loss: 0.7297


Epochs (Train Size: 80.0%): 100%|██████████| 70/70 [00:08<00:00,  8.26it/s]


Epoch [70/70], Loss: 0.7187
Test Loss: 0.7444
MSE: 0.7444, RMSE: 0.8628, MAE: 0.7088

Training with 99.9668655% of the data:


Epochs (Train Size: 99.9668655%):  16%|█▌        | 11/70 [00:01<00:08,  6.74it/s]

Epoch [10/70], Loss: 0.7753


Epochs (Train Size: 99.9668655%):  30%|███       | 21/70 [00:03<00:07,  6.66it/s]

Epoch [20/70], Loss: 0.7621


Epochs (Train Size: 99.9668655%):  44%|████▍     | 31/70 [00:04<00:05,  6.61it/s]

Epoch [30/70], Loss: 0.7535


Epochs (Train Size: 99.9668655%):  59%|█████▊    | 41/70 [00:06<00:04,  6.93it/s]

Epoch [40/70], Loss: 0.7467


Epochs (Train Size: 99.9668655%):  73%|███████▎  | 51/70 [00:07<00:02,  6.72it/s]

Epoch [50/70], Loss: 0.7390


Epochs (Train Size: 99.9668655%):  87%|████████▋ | 61/70 [00:09<00:01,  6.67it/s]

Epoch [60/70], Loss: 0.7309


Epochs (Train Size: 99.9668655%): 100%|██████████| 70/70 [00:10<00:00,  6.68it/s]

Epoch [70/70], Loss: 0.7227
Test Loss: 0.7436
MSE: 0.7436, RMSE: 0.8623, MAE: 0.7058



