In [1]:
# Importing libraries
import os
import re

import numpy as np
import pandas as pd
import seaborn as sns
import nltk
from nltk import sent_tokenize
import matplotlib.pyplot as plt
import torch
from datasets import Dataset
import evaluate

import plotly.express as px
import plotly.graph_objs as go
import plotly.subplots as sp
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

from scipy.stats import gaussian_kde

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
# Shared plot settings.
# `template` is the Plotly layout template read by the plotting helpers in this
# notebook; `colormap` is a colour-scale name kept alongside it.
colormap, template = "viridis", "seaborn"

In [3]:
# To obtain the data this notebook needs, run src/data/download_data.py first.

In [4]:
# For reproducibility: seed the NumPy and PyTorch random number generators and
# ask PyTorch / cuDNN for deterministic behaviour.
seed = 42

np.random.seed(seed)

torch.manual_seed(seed)

torch.cuda.manual_seed(seed)

# Force cuDNN to pick deterministic kernels and disable its auto-tuner, which
# may otherwise select different algorithms from run to run.
torch.backends.cudnn.deterministic = True

torch.backends.cudnn.benchmark = False

# `use_deterministic_algorithms` is a function and has to be *called*.
# Assigning `True` to it (as this cell used to do) enabled nothing and
# replaced the function with a bool for the rest of the session.
# warn_only=True makes operations without a deterministic implementation warn
# instead of raising, so the rest of the notebook keeps running.
torch.use_deterministic_algorithms(True, warn_only=True)

In [5]:
# Load data: one CSV per samsum split, read in train / test / validation order.
df_train, df_test, df_val = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/raw/samsum-train.csv",
        "data/raw/samsum-test.csv",
        "data/raw/samsum-validation.csv",
    )
)

In [6]:
# Preview the first five rows of the training split.
df_train.head()

Unnamed: 0,id,dialogue,summary
0,13818513,Amanda: I baked cookies. Do you want some?\r\...,Amanda baked cookies and will bring Jerry some...
1,13728867,Olivia: Who are you voting for in this electio...,Olivia and Olivier are voting for liberals in ...
2,13681000,"Tim: Hi, what's up?\r\nKim: Bad mood tbh, I wa...",Kim may try the pomodoro technique recommended...
3,13730747,"Edward: Rachel, I think I'm in ove with Bella....",Edward thinks he is in love with Bella. Rachel...
4,13728094,Sam: hey overheard rick say something\r\nSam:...,"Sam is confused, because he overheard Rick com..."


Our data contains three columns: ID, the dialogue, and the summary.

In [7]:
# Function to describe data
def describe_data(df):
    """Print a short structural summary of a DataFrame.

    The report contains, in order: the shape, the number of missing values
    per column, the number of fully duplicated rows, and the dtype of each
    column. Nothing is returned.

    Parameters:
    df = DataFrame to summarise
    """
    report_sections = [
        f"Shape of data: {df.shape}",
        f"\nMissing Data: \n{df.isnull().sum()}",
        f"\nDuplicates: {df.duplicated().sum()}",
        f"\nData Types: \n{df.dtypes}",
    ]
    for section in report_sections:
        print(section)

In [8]:
# Print shape, per-column missing counts, duplicate count and dtypes of the training split.
describe_data(df_train)

Shape of data: (14732, 3)

Missing Data: 
id          0
dialogue    1
summary     0
dtype: int64

Duplicates: 0

Data Types: 
id          object
dialogue    object
summary     object
dtype: object


The summary above reports one missing `dialogue` value, so next we look at the rows that contain null values.

In [9]:
# Select every training row in which at least one column is null, then display it.
null_rows = df_train.loc[df_train.isnull().any(axis="columns")]
null_rows

Unnamed: 0,id,dialogue,summary
6054,13828807,,problem with visualization of the content


There is only one row with a null value, so we will just drop it.

In [10]:
# Drop every row that has a null in any column. The result is bound back to
# `df_train`; no reset_index is applied, so the remaining rows keep their
# original index labels.
df_train = df_train.dropna()

In [11]:
# Names of the object-dtype columns of the training split, with the "id"
# identifier column removed; these are the columns analysed below.
categorical_features = [
    name for name, dtype in df_train.dtypes.items() if dtype == "object"
]
categorical_features.remove("id")

The following function computes the following statistics for both the dialogue and the summary: number of words, number of characters, number of sentences, mean number of characters in a word, and mean number of characters in a sentence. We expect the number of words, number of characters, and number of sentences to be smaller for summaries than for dialogues, but the mean number of characters in a word and the mean number of characters in a sentence probably should not change much.

In [12]:
def get_counts(df, columns=None):
    """Compute simple length statistics for text columns.

    For every column `c` in `columns` the result holds five features:
    `c_word_count` (whitespace-separated tokens), `c_char_count`,
    `c_sentence_count` (sentences found by `sent_tokenize`),
    `c_mean_word_length` (mean characters per word) and
    `c_mean_sentence_length` (mean characters per sentence).

    Parameters:
    df = DataFrame holding the text columns; every value is converted with str()
    columns = Column names to analyse; defaults to the notebook-level
              `categorical_features` list when omitted

    Returns:
    A new DataFrame with five feature columns per analysed column, indexed
    like `df`.
    """
    if columns is None:
        columns = categorical_features

    def mean_length(items):
        # np.mean([]) is nan but emits a RuntimeWarning; return nan directly
        # for empty text so the result is the same without the warning.
        return np.mean([len(item) for item in items]) if items else np.nan

    df_eda = pd.DataFrame()
    for col in columns:
        texts = df[col].map(str)
        # Split / tokenise each text once and reuse the result for both the
        # count and the mean-length feature (sentence tokenisation used to run
        # twice per row).
        words = texts.map(str.split)
        sentences = texts.map(sent_tokenize)

        df_eda[col + "_word_count"] = words.map(len)
        df_eda[col + "_char_count"] = texts.map(len)
        df_eda[col + "_sentence_count"] = sentences.map(len)
        df_eda[col + "_mean_word_length"] = words.map(mean_length)
        df_eda[col + "_mean_sentence_length"] = sentences.map(mean_length)
    return df_eda


df_train_eda = get_counts(df_train)

# get_counts emits all features of the first text column followed by all
# features of the second one. Interleave the two halves so that each pair of
# matching features (e.g. dialogue/summary word count) sits side by side.
n = len(df_train_eda.columns) // 2
cols = df_train_eda.columns.tolist()
reordered_cols = [
    name for pair in zip(cols[:n], cols[n : 2 * n]) for name in pair
]
df_train_eda = df_train_eda[reordered_cols]
df_train_eda.head()

Unnamed: 0,dialogue_word_count,summary_word_count,dialogue_char_count,summary_char_count,dialogue_sentence_count,summary_sentence_count,dialogue_mean_word_length,summary_mean_word_length,dialogue_mean_sentence_length,summary_mean_sentence_length
0,16,9,94,56,4,1,4.75,5.333333,22.25,56.0
1,18,10,111,61,4,1,5.0,5.1,26.0,60.0
2,98,14,528,77,3,1,4.306122,4.571429,174.666667,77.0
3,26,18,155,97,3,3,4.884615,4.388889,50.666667,31.333333
4,179,27,909,145,6,3,3.960894,4.407407,149.833333,47.666667


We will use the following code to draw the box plot and KDE plot for each count.

In [13]:
def visualize(df, feat):
    """Build a side-by-side box plot and KDE plot for one column.

    Parameters:
    df = DataFrame holding the column to plot
    feat = Name of the column; the figure title and axis labels are derived from it

    Returns:
    The Plotly figure. It is not displayed here; the caller decides.
    """
    values = df[feat]
    # The last underscore-separated token of the name labels the value axis.
    value_axis_label = f"<b>{feat.split('_')[-1].title()}</b>"

    fig = make_subplots(
        rows=1,
        cols=2,
        subplot_titles=["Box Plot", "KDE Plot"],
        horizontal_spacing=0.2,
    )

    # Left panel: distribution summary as a box plot.
    fig.add_trace(go.Box(y=values, name="Box Plot"), row=1, col=1)

    # Right panel: Gaussian KDE evaluated on 1000 evenly spaced points that
    # span the observed range of the column.
    kde = gaussian_kde(values)
    grid = np.linspace(min(values), max(values), 1000)
    fig.add_trace(go.Scatter(x=grid, y=kde(grid), name="KDE Plot"), row=1, col=2)

    figure_title = {
        "text": f'<b>{feat.replace("_", " ").title()}</b>',
        "y": 0.9,
        "x": 0.5,
        "xanchor": "center",
        "yanchor": "top",
    }
    fig.update_layout(
        title=figure_title,
        showlegend=False,
        template=template,
        height=600,
        width=800,
    )

    # Axes of the box plot panel.
    fig.update_xaxes(title_text="", row=1, col=1, showgrid=False)
    fig.update_yaxes(title_text=value_axis_label, row=1, col=1, showgrid=True)

    # Axes of the KDE panel.
    fig.update_xaxes(title_text=value_axis_label, row=1, col=2, showgrid=True)
    fig.update_yaxes(title_text="<b>Frequency</b>", row=1, col=2, showgrid=False)

    return fig


# Build and display the box/KDE figure for every column of the EDA feature frame.
for feat in df_train_eda.columns:
    fig = visualize(df_train_eda, feat)
    fig.show()

The following code will draw the correlation matrices for unigrams, bigrams, and trigrams.

In [14]:
def plot_correlation(df, title, cmap, height=800, width=800, font_size=12):
    """
    This function plots the correlation map among the numeric features of the
    dataset as an annotated heatmap. Only the lower triangle is shown; the
    upper triangle and the diagonal are left blank.

    Parameters:
    df = DataFrame whose numeric columns are correlated
    title = Figure title (rendered in bold)
    cmap = Colorscale used for the heatmap
    height = Define height
    width = Define width
    font_size = Define the font size for the annotations

    Returns:
    The Plotly figure. It is not displayed here; the caller decides.
    """
    corr = np.round(df.corr(numeric_only=True), 2)

    # np.triu without an offset marks the upper triangle *including* the
    # diagonal, so masking with it already blanks the diagonal as well; no
    # separate pass over the diagonal is needed.
    hidden = np.triu(np.ones_like(corr, dtype=bool))
    z = corr.mask(hidden).values

    fig = ff.create_annotated_heatmap(
        z=z,
        x=corr.columns.tolist(),
        y=corr.index.tolist(),
        colorscale=cmap,
        showscale=True,
        xgap=1,
        ygap=1,
        zmin=-1,
        zmax=1,
    )

    fig.update_xaxes(side="bottom")

    fig.update_layout(
        title={
            "text": f"<b>{title}</b>",
            "y": 0.9,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        margin=dict(t=210, l=110),
        # Reverse the y axis so the first feature is at the top.
        yaxis=dict(autorange="reversed", showgrid=False),
        xaxis=dict(showgrid=False),
        template=template,
        height=height,
        width=width,
    )

    for annotation in fig.layout.annotations:
        annotation.font.size = font_size
        # Masked cells are annotated with the text "nan"; blank them out.
        if annotation.text == "nan":
            annotation.text = ""

    return fig

We are using `TfidfVectorizer` to transform our dataset. Note that `TfidfVectorizer` is just `CountVectorizer` followed by `TfidfTransformer`. Using `stop_words='english'` removes words like 'and' or 'the' that are probably uninformative.

In [15]:
# write_html does not create missing directories, so make sure the export
# folder exists before the first figure is saved.
os.makedirs("output", exist_ok=True)

# One correlation heatmap per (n-gram size, text column) pair.
for n in [1, 2, 3]:
    for feat in categorical_features:
        vectorizer = TfidfVectorizer(
            max_features=15, stop_words="english", ngram_range=(n, n)
        )  # Top 15 terms
        x = vectorizer.fit_transform(df_train[feat])
        # Densify the TF-IDF matrix into a frame with one column per kept term.
        df_tfidfvect = pd.DataFrame(
            x.toarray(), columns=vectorizer.get_feature_names_out()
        )
        fig = plot_correlation(
            df_tfidfvect,
            title=feat.title() + " " + str(n) + "-Gram Correlation",
            cmap="Blues",
        )
        fig.show()
        fig.write_html(
            f"output/tfidfvect_{feat}_{n}-gram.html",
            auto_open=False,
            include_plotlyjs="cdn",
        )

There are some phrases that are slightly correlated. For example, "wants buy" and "buy new" have a correlation of 0.14 in the Summary 2-Gram plot, possibly due to phrases like "...wants to buy a new...". In the Summary 3-Gram plot, "year eve party" and "new year eve" have a relatively high correlation of 0.25.

Also, note that in the Dialogue 3-Gram, there is a phrase "file_photo file_photo file_photo." This is presumably where there was a photo in the original text. Let us look at an example where "file_photo" appears in the dialogue:

In [16]:
# Positions (0-based row positions, not index labels) of the dialogues that
# contain the "file_photo" placeholder.
indices = [i for i, s in enumerate(df_train["dialogue"]) if "file_photo" in s]
# Print the third match as an example; the positions from enumerate pair with .iloc.
print(df_train["dialogue"].iloc[indices[2]])

Ashley: Guys, you have to read this book!  <file_photo>
Marcus: Why, what's so special about it?
Erin: I think I've already heard about it from someone. Is it really that good?
Ashley: It's the best thing I've ever read! Completely life-changing! It's opened my eyes to a lot of things.
Seamus: Sorry, but I don't like books that are written to change my life. I prefer books that are simply fun to read :P
Marcus: I get what you mean. I feel like some authors are so concentrated on making their books full of wisdom that they completely forget that they should also be readable.
Erin: Do you mean Coelho? XD
Marcus: No, while I'm not a fan of his, at least I've never fallen asleep while reading his books. I meant this one for example: <file_other>
Ashley: Erm, I quite like his books.
Seamus: Did they change your life too? :D
Ashley: Wait, I meant Coelho. I've never read the other guy.
Marcus: Trust me, don't. There are lots of better ways of wasting your time.
Ashley: LOL, okay, I trust you.