<a href="https://colab.research.google.com/github/gyasifred/NLP-Techniques/blob/main/ai_malnutrion_TabPFN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Fetch the TabPFN extensions repository that the next cell installs in
# editable mode. The directory check makes the cell safe to re-run: a second
# `git clone` into an existing directory would fail.
![ -d tabpfn-extensions ] || git clone https://github.com/priorlabs/tabpfn-extensions.git

Cloning into 'tabpfn-extensions'...
remote: Enumerating objects: 570, done.[K
remote: Counting objects: 100% (98/98), done.[K
remote: Compressing objects: 100% (63/63), done.[K
remote: Total 570 (delta 44), reused 36 (delta 32), pack-reused 472 (from 1)[K
Receiving objects: 100% (570/570), 296.35 KiB | 7.06 MiB/s, done.
Resolving deltas: 100% (295/295), done.


In [2]:

# Editable install of the cloned extensions together with the optional extras
# used by this notebook. `%pip` installs into the environment of the running
# kernel, and the quotes stop the shell from interpreting the [...] extras
# list as a glob pattern.
%pip install -e "tabpfn-extensions[post_hoc_ensembles,interpretability,hpo]"

Obtaining file:///content/tabpfn-extensions
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting seaborn==0.12.2 (from tabpfn-extensions==0.0.4)
  Downloading seaborn-0.12.2-py3-none-any.whl.metadata (5.4 kB)
Collecting shapiq (from tabpfn-extensions==0.0.4)
  Downloading shapiq-1.2.1-py3-none-any.whl.metadata (19 kB)
Collecting kditransform>=0.2.0 (from tabpfn-extensions==0.0.4)
  Downloading kditransform-0.2.0-py3-none-any.whl.metadata (6.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->tabpfn-extensions==0.0.4)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->tabpfn-extensions==0.0.4)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux201

In [6]:
# Install TabPFN itself. `%pip` (rather than `!pip`) guarantees the package
# lands in the environment of the running kernel.
%pip install tabpfn

Collecting tabpfn
  Downloading tabpfn-2.0.6-py3-none-any.whl.metadata (20 kB)
Downloading tabpfn-2.0.6-py3-none-any.whl (124 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m124.9/124.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tabpfn
Successfully installed tabpfn-2.0.6


In [3]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from typing import List, Union, Tuple, Dict

nltk.download('stopwords', quiet=True)

class ClinicalTextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that normalises free-text clinical notes.

    Depending on the flags passed at construction, each note is lower-cased,
    has dates and numbers replaced by the placeholders ``<DATE>`` and
    ``<NUM>``, and is stripped of punctuation (angle brackets are kept so the
    placeholders survive). Whitespace is always collapsed to single spaces.
    """

    def __init__(self,
                 remove_punctuation: bool = True,
                 lowercase: bool = True,
                 standardize_numbers: bool = True,
                 standardize_dates: bool = True):
        # Regexes used by _preprocess_text: numeric dates such as 1/2/2020 or
        # 01-02-20, written dates such as "jan 5, 2020", and plain numbers
        # with optional thousands separators and decimals.
        self.date_pattern = (r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b|'
                           r'\b(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]* \d{1,2},? \d{4}\b')
        self.number_pattern = r'\b\d+(?:,\d{3})*(?:\.\d+)?\b'

        # Behaviour switches, stored under the same names as the arguments.
        self.standardize_dates = standardize_dates
        self.standardize_numbers = standardize_numbers
        self.lowercase = lowercase
        self.remove_punctuation = remove_punctuation

    def fit(self, X: Union[List[str], pd.Series], y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X: Union[List[str], pd.Series]) -> List[str]:
        """Return one cleaned string per input item (items are passed through ``str``)."""
        notes = X.tolist() if isinstance(X, pd.Series) else X
        cleaned_notes = []
        for note in notes:
            cleaned_notes.append(self._preprocess_text(str(note)))
        return cleaned_notes

    def _preprocess_text(self, text: str) -> str:
        """Apply the enabled normalisation steps, in a fixed order, to one note."""
        cleaned = text.lower() if self.lowercase else text

        # Dates are handled before numbers so their digits are not consumed
        # by the number pattern first.
        if self.standardize_dates:
            cleaned = re.compile(self.date_pattern, re.IGNORECASE).sub('<DATE>', cleaned)
        if self.standardize_numbers:
            cleaned = re.compile(self.number_pattern).sub('<NUM>', cleaned)
        if self.remove_punctuation:
            cleaned = re.sub(r'[^\w\s<>]', '', cleaned)

        # Collapse any run of whitespace into a single space.
        pieces = cleaned.split()
        return ' '.join(pieces)

class StopWordsRemover(BaseEstimator, TransformerMixin):
    """Transformer that drops NLTK stop words from whitespace-tokenised text.

    The stop-word set for ``language`` is loaded once, at construction time.
    Tokens are compared with that set exactly as they appear in the text; no
    case folding is done here.
    """

    def __init__(self, language: str = 'english'):
        self.language = language
        self.stop_words = set(stopwords.words(language))

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X: List[str]) -> List[str]:
        """Return each document with its stop-word tokens removed."""
        def _strip(document) -> str:
            # Non-string items are converted with str() before tokenising.
            as_text = document if isinstance(document, str) else str(document)
            kept = [word for word in as_text.split() if word not in self.stop_words]
            return " ".join(kept)

        return [_strip(document) for document in X]

class TextStemmer(BaseEstimator, TransformerMixin):
    """Transformer that stems every whitespace-separated token with NLTK's PorterStemmer."""

    def __init__(self):
        self.stemmer = PorterStemmer()

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X: List[str]) -> List[str]:
        """Return each document with every token replaced by its stem."""
        stem = self.stemmer.stem

        def _stem_document(document) -> str:
            # Non-string items are converted with str() before tokenising.
            as_text = document if isinstance(document, str) else str(document)
            return " ".join(stem(word) for word in as_text.split())

        return [_stem_document(document) for document in X]

def process_csv(
    file_path: str,
    text_column: str,
    label_column: str,
    id_column: str,
    max_features: int = 8000,
    remove_stop_words: bool = True,
    apply_stemming: bool = False
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Pipeline, Dict]:
    """
    Read a CSV of clinical notes and turn the note text into a TF-IDF feature table.

    The text goes through ClinicalTextPreprocessor, optionally StopWordsRemover
    and TextStemmer, then CountVectorizer and TfidfTransformer.

    Note: the pipeline is fitted on every row of the file. If the returned
    features are later split into train/validation/test sets, the vocabulary
    and IDF weights will already have seen the held-out rows.

    Args:
        file_path: Path to the CSV file
        text_column: Name of the column containing note text
        label_column: Name of the column containing labels
        id_column: Name of the column containing patient IDs
        max_features: Maximum number of features for vectorization
        remove_stop_words: Whether to remove stop words
        apply_stemming: Whether to apply stemming

    Returns:
        Tuple containing:
        - X_df: DataFrame with features (terms as columns), indexed like the CSV rows
        - complete_df: DataFrame with the ID column, features, and label column
        - y: Series with labels
        - pipeline: Fitted pipeline for future use
        - feature_dict: Dictionary mapping feature names to their column indices

    Raises:
        ValueError: If any of the three named columns is missing from the CSV.
        Exception: Any other error raised while reading or vectorising is
            printed and then re-raised unchanged.
    """
    try:
        # Read the CSV file
        df = pd.read_csv(file_path)

        # Validate required columns exist
        required_columns = [text_column, label_column, id_column]
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            print(f"Available columns: {list(df.columns)}")
            raise ValueError(f"Missing required columns: {missing_columns}")

        # Missing notes would otherwise be stringified to "nan" by the
        # preprocessor and end up as a spurious vocabulary term; treat them as
        # empty documents instead.
        notes = df[text_column].astype(object).where(df[text_column].notna(), "")

        # Create preprocessing steps
        steps = [('preprocessor', ClinicalTextPreprocessor())]

        if remove_stop_words:
            steps.append(('stopword_remover', StopWordsRemover()))

        if apply_stemming:
            steps.append(('stemmer', TextStemmer()))

        # Create vectorizer and TF-IDF transformer
        vectorizer = CountVectorizer(max_features=max_features)
        tfidf = TfidfTransformer()

        steps.extend([
            ('vectorizer', vectorizer),
            ('tfidf', tfidf)
        ])

        # Create and fit pipeline
        pipeline = Pipeline(steps)
        features_sparse = pipeline.fit_transform(notes)

        # Get feature names from the vectorizer
        feature_names = vectorizer.get_feature_names_out()

        # Dense DataFrame with one column per term, aligned to the CSV's row index
        X_df = pd.DataFrame(
            features_sparse.toarray(),
            columns=feature_names,
            index=df.index
        )

        # Create y Series
        y = df[label_column]

        # Create complete DataFrame with ID, features, and label
        complete_df = pd.concat([
            df[[id_column]],  # Keep patient ID
            X_df,             # Add features
            df[[label_column]]  # Add label
        ], axis=1)

        # Create feature names dictionary
        feature_dict = {name: idx for idx, name in enumerate(feature_names)}

        return X_df, complete_df, y, pipeline, feature_dict

    except Exception as e:
        # Log for the notebook reader, then let the caller see the original error.
        print(f"Error processing CSV file: {str(e)}")
        raise

def split_and_shuffle_data(
    X: pd.DataFrame,
    y: pd.Series,
    test_size: float = 0.2,
    valid_size: float = 0.5,
    random_state: int = 42
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.Series]:
    """
    Split features and labels into training, validation, and test sets.

    The data is first split into a training set and a held-out set; the
    held-out set is then split into validation and test sets. Both splits are
    stratified on the labels whenever the labels being split contain more than
    one distinct value.

    Args:
        X: Feature DataFrame
        y: Labels Series
        test_size: Proportion of all data held out for validation + test
        valid_size: Proportion of the held-out data used for validation
            (the remainder becomes the test set)
        random_state: Random seed for reproducibility

    Returns:
        Tuple of (X_train, X_valid, X_test, y_train, y_valid, y_test)

    Raises:
        ValueError: If X or y is empty.
    """
    if X.empty or y.empty:
        raise ValueError("Features or labels are empty")

    def _stratify_on(labels: pd.Series):
        # Stratification needs at least two distinct label values.
        return labels if len(labels.unique()) > 1 else None

    # First split: separate training set from the held-out set
    X_train, X_temp, y_train, y_temp = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=_stratify_on(y)
    )

    # Second split: held-out set -> validation + test. valid_size is passed as
    # train_size so that it controls the *validation* share, as documented;
    # passing it as test_size would make it the test share instead.
    X_valid, X_test, y_valid, y_test = train_test_split(
        X_temp,
        y_temp,
        train_size=valid_size,
        random_state=random_state,
        stratify=_stratify_on(y_temp)
    )

    return X_train, X_valid, X_test, y_train, y_valid, y_test

# Main execution
if __name__ == "__main__":
    # Input file and the column names it is expected to contain
    file_path = '/content/malnutrition_cases.csv'
    text_column = 'Note_Column'
    label_column = 'Malnutrition_Label'
    id_column = 'Patient_ID'

    try:
        # Show the columns up front so a wrong column name is easy to spot
        df = pd.read_csv(file_path)
        print("Available columns in the CSV file:")
        print(df.columns.tolist())

        # Process the CSV and get all outputs
        X_df, complete_df, y, fitted_pipeline, feature_dict = process_csv(
            file_path=file_path,
            text_column=text_column,
            label_column=label_column,
            id_column=id_column,
            max_features=8000,
            remove_stop_words=True,
            apply_stemming=False
        )

        # Split the features and labels
        X_train, X_valid, X_test, y_train, y_valid, y_test = split_and_shuffle_data(X_df, y)

        # Print information about the outputs
        print("\nFeatures Matrix (X) shape:", X_df.shape)
        print("Sample features (first 5 columns):")
        print(X_df.iloc[:, :5].head())

        print("\nLabels (y) shape:", y.shape)
        print("Sample labels:")
        print(y.head())

        print("\nComplete DataFrame shape:", complete_df.shape)
        print("Sample complete data (first few columns):")
        print(complete_df.iloc[:, :5].head())

        print("\nData split sizes:")
        print(f"Training set: {len(X_train)} samples, Features shape: {X_train.shape}, Labels shape: {y_train.shape}")
        print(f"Validation set: {len(X_valid)} samples, Features shape: {X_valid.shape}, Labels shape: {y_valid.shape}")
        print(f"Test set: {len(X_test)} samples, Features shape: {X_test.shape}, Labels shape: {y_test.shape}")

        # Print label distribution in each split
        print("\nLabel distribution:")
        print("Training set:", y_train.value_counts(normalize=True))
        print("Validation set:", y_valid.value_counts(normalize=True))
        print("Test set:", y_test.value_counts(normalize=True))

        # Print some sample feature names
        print("\nSample terms (first 10):")
        print(list(feature_dict.keys())[:10])

    except Exception as e:
        # Report the problem, then re-raise. Later cells rely on the split
        # variables created above; swallowing the error here would only
        # surface it there as an unrelated NameError.
        print(f"Error: {str(e)}")
        raise

Available columns in the CSV file:
['Patient_ID', 'Note_Column', 'Encounter_ID', 'Malnutrition_Label']

Features Matrix (X) shape: (118, 344)
Sample features (first 5 columns):
   112mm     114mm  115mm  116mm  117mm
0    0.0  0.000000    0.0    0.0    0.0
1    0.0  0.000000    0.0    0.0    0.0
2    0.0  0.163657    0.0    0.0    0.0
3    0.0  0.000000    0.0    0.0    0.0
4    0.0  0.000000    0.0    0.0    0.0

Labels (y) shape: (118,)
Sample labels:
0    yes
1     no
2    yes
3     no
4    yes
Name: Malnutrition_Label, dtype: object

Complete DataFrame shape: (118, 346)
Sample complete data (first few columns):
  Patient_ID  112mm     114mm  115mm  116mm
0       P001    0.0  0.000000    0.0    0.0
1       P002    0.0  0.000000    0.0    0.0
2       P003    0.0  0.163657    0.0    0.0
3       P004    0.0  0.000000    0.0    0.0
4       P005    0.0  0.000000    0.0    0.0

Data split sizes:
Training set: 94 samples, Features shape: (94, 344), Labels shape: (94,)
Validation set: 12 sa

In [7]:
from sklearn.metrics import accuracy_score, roc_auc_score
from tabpfn import TabPFNClassifier

# Fit a TabPFN classifier on the training split. X_train / y_train / X_test /
# y_test are globals created by the preprocessing cell above, so that cell
# must have run successfully first. X_valid / y_valid are not used here.
# NOTE(review): the TF-IDF pipeline in process_csv is fitted on the whole CSV
# before the split, so the test rows influenced the vocabulary and IDF
# weights; the scores printed below may be optimistic.
clf = TabPFNClassifier()
clf.fit(X_train, y_train)

# Predict class probabilities for the test split and score them with ROC AUC.
# NOTE(review): column 1 is assumed to be the positive class's probability;
# confirm against clf.classes_.
prediction_probabilities = clf.predict_proba(X_test)
print("ROC AUC:", roc_auc_score(y_test, prediction_probabilities[:, 1]))

# Predict hard labels for the test split and report plain accuracy.
predictions = clf.predict(X_test)
print("Accuracy", accuracy_score(y_test, predictions))

  model, _, config_ = load_model_criterion_config(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tabpfn-v2-classifier.ckpt:   0%|          | 0.00/29.0M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/37.0 [00:00<?, ?B/s]

ROC AUC: 1.0
Accuracy 1.0
