# --------------------
# Clinical Data Preprocessing
# --------------------

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load clinical data from Excel
clinical_data_path = '/Users/ellaquan/Project/CMMD-data/CMMD_clinicaldata_revision.xlsx'
clinical_df = pd.read_excel(clinical_data_path)

# Check the column names to ensure they match expectations
print("Columns in clinical_df:", clinical_df.columns)

# Select the rows where 'ID1' starts with 'D2'
clinical_df = clinical_df[clinical_df['ID1'].str.startswith('D2')]

# Drop 'ID1' duplicates
clinical_df = clinical_df.drop_duplicates(subset='ID1')

# Check the shape of the data to ensure there are still rows
print("Data after filtering 'ID1' starts with 'D2':", clinical_df.shape)

# Filter for malignant classification
malignant_data = clinical_df[clinical_df['classification'] == 'Malignant']

# Check the data to ensure there are still rows after filtering for malignant cases
print("Data after filtering for malignant cases:", malignant_data.shape)
print(malignant_data.head())  # Display a few rows for debugging

# Define indolent cancer as Luminal A or Luminal B (target = 1 for indolent, 0 for non-indolent)
malignant_data['target'] = malignant_data['subtype'].apply(lambda x: 1 if x in ['Luminal A', 'Luminal B'] else 0)

# Drop rows with missing data in key columns
malignant_data.dropna(subset=['classification', 'subtype', 'Age'], inplace=True)

# Check if any rows are left after dropping NaN values
print("Data after dropping NaNs:", malignant_data.shape)
print(malignant_data[['classification', 'subtype', 'Age']].head())  # Check the key columns

# Ensure there's still data for scaling 'Age'
if not malignant_data.empty:
    # Min-Max scaling for 'Age'
    scaler = MinMaxScaler()
    malignant_data['Age'] = scaler.fit_transform(malignant_data[['Age']])

    # Save the preprocessed data to an Excel file
    output_excel_path = '/Users/ellaquan/Project/preprocessed_clinical_data.xlsx'
    malignant_data.to_excel(output_excel_path, index=False)
    print(f"Preprocessed clinical data saved to {output_excel_path}")
else:
    print("No data available after filtering and dropping missing values.")

Columns in clinical_df: Index(['ID1', 'LeftRight', 'Age', 'number', 'abnormality', 'classification',
       'subtype'],
      dtype='object')
Data after filtering 'ID1' starts with 'D2': (749, 7)
Data after filtering for malignant cases: (743, 7)
          ID1 LeftRight  Age  number    abnormality classification  \
1107  D2-0001         L   64       2  calcification      Malignant   
1108  D2-0002         R   69       2  calcification      Malignant   
1109  D2-0003         L   44       2  calcification      Malignant   
1110  D2-0004         L   38       2  calcification      Malignant   
1111  D2-0005         R   41       2  calcification      Malignant   

            subtype  
1107      Luminal B  
1108      Luminal B  
1109      Luminal B  
1110      Luminal B  
1111  HER2-enriched  
Data after dropping NaNs: (739, 8)
     classification        subtype  Age
1107      Malignant      Luminal B   64
1108      Malignant      Luminal B   69
1109      Malignant      Luminal B   44
1110 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  malignant_data['target'] = malignant_data['subtype'].apply(lambda x: 1 if x in ['Luminal A', 'Luminal B'] else 0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  malignant_data.dropna(subset=['classification', 'subtype', 'Age'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  malignant_data['Age'] = scaler.fit_transform(malignant_data[['Age']

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import MinMaxScaler

# Load clinical data from Excel
clinical_data_path = '/Users/ellaquan/Project/CMMD-data/CMMD_clinicaldata_revision.xlsx'
clinical_df = pd.read_excel(clinical_data_path)

# Check the column names to ensure they match expectations
print("Columns in clinical_df:", clinical_df.columns)

# Select the rows where 'ID1' starts with 'D2'
clinical_df = clinical_df[clinical_df['ID1'].str.startswith('D2')]

# Drop 'ID1' duplicates
clinical_df = clinical_df.drop_duplicates(subset='ID1')

# Check the shape of the data to ensure there are still rows
print("Data after filtering 'ID1' starts with 'D2':", clinical_df.shape)

# Filter for malignant classification
malignant_data = clinical_df[clinical_df['classification'] == 'Malignant']

# Check the data to ensure there are still rows after filtering for malignant cases
print("Data after filtering for malignant cases:", malignant_data.shape)
print(malignant_data.head())  # Display a few rows for debugging

# Define indolent cancer as Luminal A or Luminal B (target = 1 for indolent, 0 for non-indolent)
malignant_data['target'] = malignant_data['subtype'].apply(lambda x: 1 if x in ['Luminal A', 'Luminal B'] else 0)

# Drop rows with missing data in key columns
malignant_data.dropna(subset=['classification', 'subtype', 'Age'], inplace=True)

# Check if any rows are left after dropping NaN values
print("Data after dropping NaNs:", malignant_data.shape)
print(malignant_data[['classification', 'subtype', 'Age']].head())  # Check the key columns

# Ensure there's still data for scaling 'Age'
if not malignant_data.empty:
    # Min-Max scaling for 'Age'
    scaler = MinMaxScaler()
    malignant_data['Age'] = scaler.fit_transform(malignant_data[['Age']])

    # Define continuous and categorical features
    continuous_features = ['Age']
    
    # Convert continuous features to numpy array
    preprocessed_clinical_data = malignant_data[continuous_features].values

    # Convert the combined clinical data to a PyTorch tensor
    preprocessed_clinical_data_tensor = torch.tensor(preprocessed_clinical_data, dtype=torch.float32)

    # Output shape of tensor
    print(preprocessed_clinical_data_tensor.shape)
else:
    print("No data available after filtering and dropping missing values.")