In [1]:
# For General Data Wrangling and Calculations
import pandas as pd
import numpy as np

# For NLP
import nltk

# For stop word filtering
#nltk.download("stopwords")
from nltk.corpus import stopwords

# For tokenizing
from nltk.tokenize import RegexpTokenizer, word_tokenize
#nltk.download('punkt_tab')
#nltk.download('wordnet')

# For lemmatizing
from nltk.stem import WordNetLemmatizer

# For TF-IDF transformation
from sklearn.feature_extraction.text import TfidfVectorizer

# For Word Embedding
import sent2vec

# For plotting
import matplotlib.pyplot as plt
import seaborn as sns

# For PCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# For UMAP
import umap
%matplotlib inline

# DATA VISUALIZATION
## 1. Distribution of number of characters in the text
We will explore how many characters each medical abstract contains, before and after text processing.
### 1.1. Pre-Processing
Define a function to count the characters (or words) of each abstract

In [None]:
def text_counter(data_df, counter_type, dataset_type="train"):
    """
    Count the characters or words of every medical abstract in a dataset.

    Rows whose "medical_abstract" value is not a string (e.g. missing values)
    are skipped.

    Parameters:
        data_df: pandas.DataFrame
            Must contain a "medical_abstract" column and, when dataset_type is
            'train', a "condition_name" column.
        counter_type: str
            'char' to count characters, 'word' to count whitespace-separated tokens.
        dataset_type: str
            'train' groups the counts by condition; any other value only
            collects the overall counts.

    Returns:
        dict or list
            For 'train', a dictionary mapping every condition name (plus 'total')
            to its list of counts. Otherwise the list of counts for all rows.

    Raises:
        ValueError: if counter_type is neither 'char' nor 'word'.
        KeyError: if a training row has a condition that is not one of the
            dictionary keys below.
    """
    # Map each supported counter_type to the function that measures one text
    counters = {
        "char": len,
        "word": lambda text: len(text.split()),
    }

    # Fail fast on an unsupported counter_type, even if no row contains text
    if counter_type not in counters:
        raise ValueError(f"Unknown counter_type {counter_type}, please choose 'char' or 'word'.")
    count_text = counters[counter_type]

    is_train = dataset_type == "train"

    # Initialize a dictionary to store the text lengths per condition
    length_dict = {
        "neoplasms": [],
        "digestive system diseases": [],
        "nervous system diseases": [],
        "cardiovascular diseases": [],
        "general pathological conditions": [],
        "total": []
    }

    texts = data_df["medical_abstract"]
    # Only labelled data has conditions; pad with None otherwise so zip() lines up
    conditions = data_df["condition_name"] if is_train else [None] * len(texts)

    # Walk the two columns directly instead of building a Series per row with iterrows()
    for text, condition in zip(texts, conditions):

        # Skip cells that do not contain text
        if not isinstance(text, str):
            continue

        text_length = count_text(text)

        # Every counted text contributes to the overall list
        length_dict["total"].append(text_length)

        # Labelled data also contributes to its condition list
        if is_train:
            length_dict[condition].append(text_length)

    # Return results
    return length_dict if is_train else length_dict["total"]

Define function to plot

In [None]:
def plot_displot(data_dict, data_type, counter_type):
    """
    Draw histogram(s) of text lengths with seaborn's displot.

    Parameters:
        data_dict: dict or list
            For 'train', a dictionary mapping each condition name (and 'total')
            to a list of lengths; one facet is drawn per key.
            For 'test', a plain list of lengths.
        data_type: str
            Either 'train' or 'test'.
        counter_type: str
            Name of the counted unit; it is only interpolated into the figure title.

    Raises:
        ValueError: if data_type is neither 'train' nor 'test'.
    """
    # Reject unsupported dataset types before building anything
    if data_type not in ("train", "test"):
        raise ValueError("Invalid data_type. Must be 'train' or 'test'.")

    if data_type == "test":
        # A flat list of lengths becomes a single-column frame
        lengths_df = pd.DataFrame(data_dict, columns=["Length"])

        # Single grey histogram
        sns.displot(lengths_df, x="Length", color="#808080", kind="hist", fill=True, legend=False)

        plt.xlabel("Length")
        plt.ylabel("Count")
        plt.title(f"Distributions of number of {counter_type} by Condition in Test Data")

        # (width, height) in inches
        plt.gcf().set_size_inches(6, 4)
        plt.tight_layout()

    else:
        # One fixed colour per facet
        condition_colors = {
            "neoplasms": "#1f77b4",
            "digestive system diseases": "#ff7f0e",
            "nervous system diseases": "#2ca02c",
            "cardiovascular diseases": "#d62728",
            "general pathological conditions": "#9467bd",
            "total": "#808080"
        }

        # Flatten {condition: [lengths]} into long-format (condition, length) records
        records = [
            (condition, length)
            for condition, lengths in data_dict.items()
            for length in lengths
        ]
        lengths_df = pd.DataFrame(records, columns=["Condition", "Length"])

        # One histogram per condition, stacked vertically, coloured through hue
        grid = sns.displot(
            data=lengths_df,
            x="Length",
            hue="Condition",
            col="Condition",
            kind="hist",
            col_wrap=1,
            fill=True,
            palette=condition_colors,
            legend=False
        )

        grid.set_axis_labels("Length", "Count")
        grid.set_titles(col_template="{col_name}")
        plt.suptitle(f"Distributions of number of {counter_type} by Condition", y=1.02)

        # (width, height) in inches for the whole grid
        grid.fig.set_size_inches(6, 12)
        plt.tight_layout()

    # Show plot
    plt.show()

### 1.2. Plot
Get character length distribution of datasets

In [None]:
# Raw abstracts: character counts per condition (train) and as a flat list (test)
train_raw_len = text_counter(train_df, counter_type="char", dataset_type="train")
test_raw_len = text_counter(test_df, counter_type="char", dataset_type="test")

# Lemmatized abstracts: character counts per condition (train) and as a flat list (test)
train_lem_len = text_counter(train_lemma_df, counter_type="char", dataset_type="train")
test_lem_len = text_counter(test_lemma_df, counter_type="char", dataset_type="test")

Plot character length distributions

In [None]:
# Character-count histograms of the raw training abstracts, one facet per condition
plot_displot(data_dict=train_raw_len, data_type="train", counter_type="Characters")

In [None]:
# Character-count histograms of the lemmatized training abstracts, one facet per condition
plot_displot(data_dict=train_lem_len, data_type="train", counter_type="Characters")

In [None]:
# Character-count histogram of the raw test abstracts
plot_displot(data_dict=test_raw_len, data_type="test", counter_type="Characters")

In [None]:
# Character-count histogram of the lemmatized test abstracts
plot_displot(data_dict=test_lem_len, data_type="test", counter_type="Characters")

## 2. Distribution of number of words in the text
In this section, we will explore how many words each medical abstract contains, before and after text processing.
### 2.1. Plot
Get number of words distribution

In [None]:
# Raw abstracts: word counts per condition (train) and as a flat list (test)
train_raw_len = text_counter(train_df, counter_type="word", dataset_type="train")
test_raw_len = text_counter(test_df, counter_type="word", dataset_type="test")

# Lemmatized abstracts: word counts per condition (train) and as a flat list (test)
train_lem_len = text_counter(train_lemma_df, counter_type="word", dataset_type="train")
test_lem_len = text_counter(test_lemma_df, counter_type="word", dataset_type="test")

Plot number of words distribution

In [None]:
# Word-count histograms of the raw training abstracts, one facet per condition
plot_displot(data_dict=train_raw_len, data_type="train", counter_type="Words")

In [None]:
# For Lemmatized training data
# The lengths plotted here are word counts, so the title must say "Words"
plot_displot(train_lem_len, "train", "Words")

In [None]:
# For Raw Testing data
# The lengths plotted here are word counts, so the title must say "Words"
plot_displot(test_raw_len, "test", "Words")

In [None]:
# For Lemmatized Testing data
# The lengths plotted here are word counts, so the title must say "Words"
plot_displot(test_lem_len, "test", "Words")

## 3. Dimensionality reduction
### 3.1. Principal Component Analysis (PCA)
Principal Component Analysis (PCA) is a dimensionality reduction method that transforms a large dataset into a smaller one while preserving most of the information in the original data. Here we apply PCA for data visualization, plotting Principal Component 1 (PC1) against Principal Component 2 (PC2).

We will start by standardizing the features and defining a PCA with 2 principal components.

In [None]:
# Scale the TF-IDF matrices without centering (with_mean=False); the train and
# test matrices each get their own freshly fitted scaler
x_train_tfidf, x_test_tfidf = (
    StandardScaler(with_mean=False).fit_transform(features)
    for features in (train_lem_tdidf, test_lem_tdidf)
)

# Same for the BioSentVec vectors, this time with the scaler's default settings
x_train_s2v, x_test_s2v = (
    StandardScaler().fit_transform(features)
    for features in (train_lem_vec, test_lem_vec)
)

# PCA model restricted to two components
pca = PCA(n_components=2)

Define function to plot PCA

In [None]:
def plot_PCA(pca_out, pca_fit, conditions, title, dataset_type):
    """
    Scatter-plot a two-component PCA projection.

    Parameters:
        pca_out: array-like with two columns
            Projected data; the columns are labelled PC1 and PC2.
        pca_fit: fitted PCA object
            Its explained_variance_ratio_ is shown in the axis labels.
        conditions: array-like or None
            Condition name of every row of pca_out, in the same order.
            Required when dataset_type is 'train', ignored for 'test'.
        title: str
            Title of the figure.
        dataset_type: str
            'train' colours the points by condition; 'test' draws them in one colour.

    Raises:
        ValueError: if dataset_type is neither 'train' nor 'test', or if
            conditions is None for 'train' data.
    """
    # Validate the arguments before any figure is created
    if dataset_type not in ("train", "test"):
        raise ValueError("Invalid dataset_type. Must be 'train' or 'test'.")
    if dataset_type == "train" and conditions is None:
        raise ValueError("conditions must be provided when dataset_type is 'train'.")

    # Create a dataframe to store data before plotting
    pca_df = pd.DataFrame(data=pca_out, columns=["PC1", "PC2"])

    # Get explained variance
    exp_var_pca = pca_fit.explained_variance_ratio_

    # Initialize figure
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)

    # Define titles
    ax.set_xlabel(f"Principal Component 1 ({round(exp_var_pca[0]*100, 2)}%)", fontsize=15)
    ax.set_ylabel(f"Principal Component 2 ({round(exp_var_pca[1]*100, 2)}%)", fontsize=15)
    ax.set_title(title, fontsize=20)

    # Plot train
    if dataset_type == "train":
        # Attach labels by position: assigning a pandas Series directly would
        # align on its index and mislabel rows if that index is not 0..n-1
        pca_df["condition_name"] = np.asarray(conditions)

        # Define a color palette for the conditions
        palette = {
            "neoplasms": "#1f77b4",
            "digestive system diseases": "#ff7f0e",
            "nervous system diseases": "#2ca02c",
            "cardiovascular diseases": "#d62728",
            "general pathological conditions": "#9467bd"
        }

        for condition, color in palette.items():
            indicesToKeep = pca_df["condition_name"] == condition
            ax.scatter(pca_df.loc[indicesToKeep, "PC1"],
                       pca_df.loc[indicesToKeep, "PC2"],
                       c=color,
                       s=15,
                       alpha=0.5,
                       label=condition)
        # Legend entries come from the label given to each scatter call
        ax.legend()

    # Plot test
    else:
        ax.scatter(pca_df["PC1"],
                   pca_df["PC2"],
                   s=15,
                   alpha=0.5)
    ax.grid()

Plot PCA

In [None]:
# Principal Components from the standardized TF-IDF features
# (x_train_tfidf is the StandardScaler output; the unscaled matrix was used before)
pca_train_tfidf = pca.fit_transform(x_train_tfidf)
# TF-IDF Train Data
plot_PCA(pca_train_tfidf, pca, train_df["condition_name"], "PCA from Train Dataset TF-IDF transformed", "train")

In [None]:
# Principal Components from the standardized TF-IDF features
# (x_test_tfidf is the StandardScaler output; the unscaled matrix was used before)
pca_test_tfidf = pca.fit_transform(x_test_tfidf)
# TF-IDF Test Data
plot_PCA(pca_test_tfidf, pca, None, "PCA from Test Dataset TF-IDF transformed", "test")

In [None]:
# Principal Components from the standardized BioSentVec features
# (x_train_s2v is the StandardScaler output; the unscaled vectors were used before)
pca_train_s2v = pca.fit_transform(x_train_s2v)
# BioSentVec Train Data
plot_PCA(pca_train_s2v, pca, train_df["condition_name"], "PCA from Train Dataset BioSentVec transformed", "train")

In [None]:
# Principal Components from the standardized BioSentVec features
# (x_test_s2v is the StandardScaler output; the unscaled vectors were used before)
pca_test_s2v = pca.fit_transform(x_test_s2v)
# BioSentVec Test Data
plot_PCA(pca_test_s2v, pca, None, "PCA from Test Dataset BioSentVec transformed", "test")

### 3.2. Uniform Manifold Approximation and Projection for Dimension Reduction (UMAP)
Define the UMAP dimensionality reducer

In [None]:
# Shared UMAP reducer; umap_and_plot reads this module-level object and refits it on every call.
# random_state is fixed, presumably so that repeated runs give the same embedding.
reducer = umap.UMAP(random_state = 777)

Define a function to calculate and plot UMAP

In [None]:
def umap_and_plot(data_vec, condition_name, dataset_type, title):
    """
    Fit the module-level UMAP `reducer` on a feature matrix and plot the 2D embedding.

    Parameters:
        data_vec: feature matrix
            Passed unchanged to reducer.fit_transform.
        condition_name: iterable of str or None
            Condition name of every row of data_vec, in the same order. When given,
            points are coloured by condition and a legend is drawn. When None,
            all points are drawn in a single grey colour.
        dataset_type: str or None
            'test' forces the single-colour plot; any other value (including None,
            which existing callers pass) leaves the choice to condition_name.
        title: str
            Title of the figure.

    Raises:
        KeyError: if condition_name contains a label that has no colour in the palette.
        ValueError: if the number of labels differs from the number of embedded rows.
    """
    # Define a color palette for the conditions
    palette = {
        "neoplasms": "#1f77b4",
        "digestive system diseases": "#ff7f0e",
        "nervous system diseases": "#2ca02c",
        "cardiovascular diseases": "#d62728",
        "general pathological conditions": "#9467bd"
    }

    # Resolve the colours first so an unknown label fails before UMAP is fitted
    use_conditions = condition_name is not None and dataset_type != "test"
    if use_conditions:
        condition_colors = [palette[condition] for condition in condition_name]
    else:
        # Unlabelled data: one neutral colour for every point
        condition_colors = "#808080"

    # Calculate embeddings
    embedding = reducer.fit_transform(data_vec)

    # Show embedding shape
    print(embedding.shape)

    # Explicit check replacing the former debug print of len(condition_colors)
    if use_conditions and len(condition_colors) != embedding.shape[0]:
        raise ValueError(
            f"Got {len(condition_colors)} condition labels for {embedding.shape[0]} embedded rows."
        )

    # Plot
    plt.figure(figsize=(8, 8))
    plt.scatter(
        embedding[:, 0],
        embedding[:, 1],
        c = condition_colors,
        s = 10,  # Point size
        alpha = 0.5  # Transparency
    )

    if use_conditions:
        # Empty proxy scatters give one legend entry per condition without
        # changing how the data points themselves are drawn
        for condition, color in palette.items():
            plt.scatter([], [], c = color, s = 10, label = condition)
        plt.legend()

    plt.gca().set_aspect("equal", "datalim")
    plt.title(title, fontsize=24)
    plt.xlabel('UMAP1', fontsize=16)
    plt.ylabel('UMAP2', fontsize=16)
    plt.show()

Plot UMAP

In [None]:
# Train Data TF-IDF transformed
# Pass the TF-IDF matrix so the data matches the title (train_lem_vec holds the BioSentVec vectors)
umap_and_plot(train_lem_tdidf, train_df["condition_name"], "train", "UMAP from Train Dataset TF-IDF transformed")

In [None]:
# Train Data BioSentVec transformed
# train_lem_vec holds the BioSentVec vectors, so the title must not say TF-IDF
umap_and_plot(train_lem_vec, train_df["condition_name"], "train", "UMAP from Train Dataset BioSentVec transformed")