This Jupyter notebook contains code used to visualize different books using a SentenceTransformer model. The theoretical background which underpins this project as well as the actual plots we produce can be found [here](https://wandb.ai/dmeltzer/gutenberg/reports/Visualizing-Literature-using-Transformers--Vmlldzo0MTIyODEx?accessToken=1ekch7p12170nvwbtqzvy2g3shpyyboajfbalciun3ly913cdv033je1rvkoa5bj).

# Dependencies

This notebook was written to be used on Google Colab, but can be adapted to run locally.

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read and written.
# This cell targets Google Colab; skip or adapt it when running locally.
from google.colab import drive
drive.mount('/content/drive')
# Change into the project directory (IPython `%cd` magic, path is relative to the
# current working directory). Edit the path to point at wherever this notebook is saved.
%cd drive/My Drive/Gutenberg

Mounted at /content/drive
/content/drive/My Drive/Gutenberg


The code block below installs and imports the libraries needed to process and visualize each book using transformer models.

In [None]:
# Install sentence-transformers to encode text.
!pip install -U sentence-transformers
# Install wandb to perform experiment tracking/logging.
!pip install wandb

import os
# umap and TSNE are used to visualize data.
import umap
from sklearn.manifold import TSNE

# Use wandb to log/track data.
import wandb
# use re library to perform data cleaning.
import re
# books will later be saved in a pandas dataframe.
import pandas as pd
# plotly is used to visualize data.
import plotly.io as pio
# Standard import for numpy.
import numpy as np
import plotly.graph_objs as go
from sentence_transformers import SentenceTransformer
# Used to track data processing.
from tqdm import tqdm

# Function Definitions

In this section we define the functions needed to process the dataframe and plot the sentence embedding vectors. The books we study are scraped from Project Gutenberg and are saved into a csv file which contains only one row, with the entirety of the book saved in the "Text" column. The purpose of "process_file" is to process each book so that we have a Pandas dataframe where each row corresponds to a passage (of some fixed maximum length) and each passage comes from a single chapter. Each passage is then encoded as a high-dimensional vector using a sentence transformer model.

The functions "tsne_plot" and "umap_plot" perform dimensionality reduction using t-distributed stochastic neighbor embedding (t-SNE) and Uniform Manifold Approximation and Projection (UMAP), respectively, and plot the resulting two-dimensional vectors. The function "dist_matrix" forms a matrix of cosine similarities between the chapters and "heatmap_plot" forms a heatmap plot from this matrix.

In [None]:
def process_file(df,model,sent_length,title):
    """
    Converts a scraped book into a dataframe of passages, each tagged with its chapter and embedding.

    The book text is lowercased, split on whitespace, and scanned in consecutive
    windows of sent_length words. Every 'chapter_end' token closes the current
    chapter, so a window that contains one or more markers is split into several
    passages and no passage mixes text from two chapters. The marker itself is
    never included in a passage and empty passages are never emitted.

    Inputs:
    -------
    - df (Pandas Dataframe): Dataframe consisting of book scraped from Project Gutenberg. Only the
                             'Text' column of the first row is read; it holds the text of the entire book.
    - model: Object with an encode(list_of_str) method returning one vector per string
             (e.g. a SentenceTransformer model).
    - sent_length (int): Splits book into passages with length of at most sent_length words.
    - title (str): Title of book, copied into the 'book' column of every row.

    Output:
    -------
    - df_processed (Pandas Dataframe): One row per passage with columns 'text', 'chapter', 'book'
                                       and 'embeddings'. Chapters are numbered from 1.

    Raises:
    -------
    - ValueError: If sent_length is smaller than 1 (the scan could never advance).
    """
    if sent_length<1:
        raise ValueError(f'sent_length must be a positive integer, got {sent_length!r}')

    # Token inserted by the scraper to mark the end of a chapter.
    marker='chapter_end'

    # Original dataframe, df, consists of one row where the 'Text' column contains text for entire book.
    # Process data by lowercasing text and splitting on whitespace.
    text=df.iloc[0]['Text'].lower().split()

    # Collect rows in a plain list and build the dataframe once at the end;
    # appending to a dataframe row by row copies data on every insertion.
    rows=[]

    # chapt denotes current chapter. Initialize to 1.
    chapt=1
    # A marker only closes a chapter that already produced text. This keeps the
    # chapter numbers contiguous (1, 2, 3, ...) even if the text starts with a
    # marker or contains two markers in a row.
    chapter_has_text=False

    # Scan over entire book one window at a time. range() stops before len(text),
    # so no empty trailing window is produced when the book length is a multiple of sent_length.
    for begin in range(0,len(text),sent_length):
        segment=[]
        for word in text[begin:begin+sent_length]:
            if word!=marker:
                segment.append(word)
                continue
            # Marker found: flush the words collected so far into the current chapter.
            if segment:
                rows.append((' '.join(segment),chapt,title))
                chapter_has_text=True
                segment=[]
            if chapter_has_text:
                chapt+=1
                chapter_has_text=False
        # Remaining words of the window belong to the chapter that is still open.
        if segment:
            rows.append((' '.join(segment),chapt,title))
            chapter_has_text=True

    # Processed dataframe with columns for the text, chapter number, and title of book.
    df_processed=pd.DataFrame(rows,columns=['text','chapter','book'])

    # Encode every passage; skip the model call entirely when the book had no text.
    passages=df_processed['text'].to_list()
    sent_vecs=model.encode(passages) if passages else []
    # Store one vector per row (object column), aligned with the dataframe index.
    df_processed['embeddings']=pd.Series(list(sent_vecs),index=df_processed.index,dtype=object)

    return df_processed

In [None]:
def tsne_plot(df, title=None, opacity=.5, size=4):
    """
    Draws a two-dimensional t-SNE scatter plot of the passage embeddings of one book.

    Inputs:
    -------
    - df (Pandas Dataframe): Processed book; the 'embeddings' and 'chapter' columns are read.
    - title (str): Title of plot. When given, the figure is also written to
                   './figures/tsne_<title>' in png format.
    - opacity (float): Opacity of the dots in the t-SNE plot.
    - size (float): Size of each dot in the plot.

    Output:
    - go.Figure object of t-SNE plot (it is also displayed via fig.show()).
    """
    # Reduce the stacked embedding vectors to two dimensions.
    embedding_matrix = np.stack(df['embeddings'])
    projected = TSNE(n_components=2).fit_transform(embedding_matrix)

    # The chapter number serves both as the marker color and as the hover label.
    chapters = df.chapter.to_numpy()

    if title is None:
        plot_title = 'TSNE Plot'
    else:
        plot_title = 'TSNE Plot: ' + title.capitalize()

    marker_style = {
        'size': size,
        'color': chapters,
        'opacity': opacity,
        'line': {'width': 0.5, 'color': 'white'},
        'colorbar': {'title': 'Colorbar Title'},  # color bar shown next to the plot
    }

    scatter = go.Scatter(
        x=projected[:, 0],
        y=projected[:, 1],
        text=chapters,
        mode='markers',
        hoverinfo='text',  # hovering over a point shows its chapter
        marker=marker_style,
    )

    fig = go.Figure(
        data=[scatter],
        layout=go.Layout(
            title=plot_title,
            hovermode='closest',
            xaxis={'title': 'X axis'},
            yaxis={'title': 'Y axis'},
        ),
    )

    # Only figures with a title are saved, because the title forms the file name.
    if title is not None:
        pio.write_image(fig, './figures/tsne_' + title, format='png')
    fig.show()
    return fig

def umap_plot(df,
              title,
              n_neighbors=10,
              min_dist=0,
              size=5,
              opacity=.5,
              save_file=True):
    """
    Forms umap plot from the sentence embedding vectors in the processed dataframe.

    Inputs:
    -------
    - df (Pandas Dataframe): Dataframe of processed book split by sentence length and chapter number.
                             The 'embeddings' and 'chapter' columns are read.
    - title (str): Title of plot; may be None, in which case a generic title is used and
                   nothing is saved (the title forms the file name).
    - n_neighbors (int): constrains size of the local neighborhood UMAP will look at when attempting to learn the manifold structure of the data.
    - min_dist (float): Controls how tightly UMAP is allowed to pack points together.
    - size (float): Size of each dot in the umap plot.
    - opacity (float): Opacity of each dot in the umap plot.
    - save_file (bool): whether or not to save the umap figure to './figures/umap_<title>' (png).

    Output:
    - fig (go.Figure): Returns the umap plot object (it is also displayed via fig.show()).
    """
    
    # Extract sentence embedding vectors from the processed dataframe.
    sent_vecs=list(df['embeddings'])
    # Forms UMAP vectors from the sentence vectors.
    umap_emb = umap.UMAP(n_neighbors=n_neighbors, 
                         min_dist=min_dist, 
                         metric='euclidean').fit_transform(sent_vecs)
    
    # Color code each sentence vector by the chapter it appears in.
    colors=df.chapter.to_numpy()
    
    trace = go.Scatter(
        x=umap_emb[:, 0],
        y=umap_emb[:, 1],
        text=colors,  # Specify the label for each point
        mode='markers',
        hoverinfo='text',  # Show label when hovering over a point
        marker=dict(
            size=size,
            color=colors,
            opacity=opacity,
            line=dict(width=0.5, color='white'),
            colorbar=dict(title='Colorbar Title')  # Add a color bar with title
        )
    )

    layout = go.Layout(
        title='UMAP: '+title.capitalize() if title is not None else 'UMAP',
        hovermode='closest', 
        xaxis=dict(title='X axis'),
        yaxis=dict(title='Y axis')
    )

    # Create a Figure object and add the trace and layout
    fig = go.Figure(data=[trace], layout=layout)

    # Show the plot
    fig.show()
    
    # Test the flag's truth value: `save_file is not None` was True even for
    # save_file=False, so the figure was always written. Also skip saving when
    # there is no title, since the file name is built from it.
    if save_file and title is not None:
        pio.write_image(fig, './figures/umap_'+title, format='png')
    return fig

def dist_matrix(df):
    """
    Forms the matrix of cosine similarities between the chapters of a book.

    Each chapter is represented by the sum of the embedding vectors of all its
    passages. Note that the values are cosine *similarities*: 1 means the two
    chapter vectors point in the same direction.

    Input:
    ------
    - df (Pandas Dataframe): Dataframe of processed book split by sentence length and chapter number.
                             The 'chapter' and 'embeddings' columns are read; all embedding
                             vectors must have the same length.

    Output:
    -------
    - cos_similarity (list): List of lists where row i and column j holds the cosine similarity
                             between the i-th and j-th chapters, with chapters taken in sorted
                             order. Empty list for an empty dataframe. A chapter whose summed
                             vector is all zeros yields NaN entries.
    """
    if len(df)==0:
        return []

    # Map every row to the position of its chapter in the sorted list of chapters.
    codes,chapters=pd.factorize(df['chapter'],sort=True)
    # (number of passages) x (embedding dimension) matrix.
    embeddings=np.stack(list(df['embeddings']))

    # Rows with a missing chapter get code -1 and are ignored.
    keep=codes>=0

    # Form chapter embedding vectors by summing all sentence embedding vectors of each chapter.
    # np.add.at accumulates correctly when the same chapter index occurs many times.
    chapter_vecs=np.zeros((len(chapters),embeddings.shape[1]),dtype=float)
    np.add.at(chapter_vecs,codes[keep],embeddings[keep])

    # Compute the norm of each chapter embedding vector.
    norms=np.linalg.norm(chapter_vecs,axis=1)
    # All pairwise dot products divided by the matching products of norms.
    cos_similarity=(chapter_vecs@chapter_vecs.T)/np.outer(norms,norms)
    return cos_similarity.tolist()

def heatmap_plot(df,title=None):
    """
    Makes heatmap plot of the cosine similarity between each pair of chapters.

    Inputs:
    -------
    - df (Pandas Dataframe): Dataframe of processed book split by sentence length and chapter number.
    - title (str): Title of heatmap plot. A generic title is used when None.

    Output:
    -------
    - fig (go.Figure): The heatmap figure. It is returned only; this function neither
                       shows nor saves it.
    """

    # List of lists of cosine similarities between chapters, one row/column per
    # chapter present in df, in sorted chapter order.
    cos_distance= dist_matrix(df)
    
    # Axis labels: the chapters that actually occur, in the same sorted order as
    # the matrix. Using 1..max(chapter) instead would not match the matrix size
    # whenever a chapter number is missing from the dataframe.
    chapters=np.array(sorted(df['chapter'].dropna().unique()))

    # Layout for heatmap plot. Both axes get one tick per chapter.
    layout = go.Layout(
        title_text='Heatmap Plot: '+title.capitalize() if title is not None else 'heatmap Plot',
        title_x=.5,
        hovermode='closest', # Show value when hovering over a cell
        xaxis=dict(title='Chapter', tickmode='linear', tick0=1, dtick=1),
        yaxis=dict(title='Chapter', tickmode='linear', tick0=1, dtick=1)
    )
    
    # Form heatmap plot where value is the cosine similarity and axes are defined by the chapters.
    data=go.Heatmap(
        z=cos_distance,
        x=chapters,
        y=chapters,
        colorscale='magma')

    # Form go.Figure object.
    fig = go.Figure(data=data,
        layout=layout
    )
    return fig

# Process Books

This section contains the code used to actually process each book.

Run the code block below to define the list of books we study, the model used to form the sentence embedding vectors, and then to process each book.

In [None]:
# List of books being studied.
# "Ulysses", "Portrait", and "Dubliners" are the three major works by James Joyce in the public domain.
# Final three entries correspond to three different translations of "The Odyssey" by Homer.
# "butcher" refers to the translation by Butcher and Lang.
# "butler" refers to translation by Samuel Butler.
# "pope" refers to translation by Alexander Pope.
books=['Ulysses','Portrait','Dubliners','butcher','butler','pope']

# Use the all-mpnet-base-v2 sentence-transformer model to encode the passages.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# df_books is a dictionary where the key is the title of the book and the value is the unprocessed dataframe for each book.
df_books={}

# Read each csv file into a pandas dataframe.
# To download each book one needs to scrape the book using the Gutenberg.py file.
df_books['Dubliners']=pd.read_csv('./books/Dubliners_James_Joyce.csv')
df_books['Ulysses']=pd.read_csv('./books/Ulysses_James_Joyce.csv')
df_books['Portrait']=pd.read_csv('./books/Portrait_James_Joyce.csv')

df_books['butcher']=pd.read_csv('./books/Odyssey_Homer_Butcher_Lang.csv')
df_books['butler']=pd.read_csv('./books/Odyssey_Homer_Butler.csv')
df_books['pope']=pd.read_csv('./books/Odyssey_Homer_Pope.csv')

# Clean text for Butler's translation by removing bracketed numbers corresponding to footnotes.
butler=df_books['butler']
text=re.sub(r'\[[0-9]+\]', '', ' '.join(butler.iloc[0]['Text'].split()))
# Assign through a single indexer on the dataframe. The chained form
# `df.iloc[0]['Text']=...` may write to a temporary copy of the row and leave
# the dataframe unchanged.
butler.iloc[0, butler.columns.get_loc('Text')]=text.lower()


def _parse_embedding(cell):
    """Turn the text of a cached 'embeddings' cell back into a 1-d float array.

    Accepts both comma-separated lists ('[0.1, 0.2]') and the whitespace-separated
    form produced when a numpy array is written as text ('[0.1 0.2]'), so cache
    files written by earlier versions of this cell can still be read. The text is
    parsed as numbers only; it is never evaluated as code.
    """
    return np.array(cell.strip().strip('[]').replace(',', ' ').split(), dtype=float)


# df_processed is a dictionary where the key is the title of the book and the value is the processed dataframe for each book.
df_processed={}

# Make sure the cache directory exists before the first dataframe is saved.
os.makedirs('./data', exist_ok=True)

for book in tqdm(books):
    print(f'working on book: {book}') # Used to monitor progress of the processing of each book.

    processed_file = f'./data/df_processed_{book}'

    # If the processed dataset already exists we load it from disk instead of re-encoding the book.
    if os.path.exists(processed_file):
        cached = pd.read_csv(processed_file)
        # read_csv returns the embeddings as strings; convert them back to numpy arrays.
        cached['embeddings']=cached['embeddings'].map(_parse_embedding)
        df_processed[book] = cached
        continue

    df_processed[book]=process_file(df_books[book],model,100,book)
    # Convert the entries in the 'embeddings' column to numpy arrays.
    df_processed[book]['embeddings']=df_processed[book]['embeddings'].map(np.array)
    # Save the processed dataframe. Embeddings are written as plain lists so every
    # value is stored in full (long numpy arrays are abbreviated with '...' when
    # converted to text); the in-memory dataframe keeps its numpy arrays.
    to_save=df_processed[book].assign(
        embeddings=df_processed[book]['embeddings'].map(lambda vec: vec.tolist()))
    # index=False avoids an extra unnamed index column when the file is read back.
    to_save.to_csv(processed_file, index=False)

# Plots

In [None]:
# Build the t-SNE, UMAP, and heatmap figures for every book,
# then log each figure to its own wandb run for that book.
for book in tqdm(books):
    print(f'working on {book}')

    book_df = df_processed[book]

    # The figures are created before the wandb run is opened.
    tsne_plt = tsne_plot(book_df, title=book)
    umap_plt = umap_plot(book_df, title=book)
    heat_matrix_plt = heatmap_plot(book_df, title=book)

    # Log keys in the order in which the figures are sent to wandb.
    figures = {
        f"{book}: TSNE": tsne_plt,
        f"{book}: umap": umap_plt,
        f"{book}: heat_matrix": heat_matrix_plt,
    }

    wandb_run = wandb.init(project='gutenberg',
                           entity=None,
                           job_type='EDA',
                           name='EDA_'+book+'_plots')
    with wandb_run as run:
        # One log call per figure.
        for key, figure in figures.items():
            run.log({key: figure})
    