In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os
project_dir = '/content/drive/My Drive/cnn-dailymail-summarizer'
os.chdir(project_dir)

!pip install -r requirements.txt

In [None]:
import pandas as pd
from cnn_dailymail_news_text_summarizer.dataset import load_datasets
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
import nltk

## Loading Data

In [None]:
# Build the CSV path for each split under the project directory.
# The tuple order fixes which name receives which split.
train_path, test_path, val_path = (
    os.path.join(project_dir, f'data/raw/cnn_dailymail/{split_name}.csv')
    for split_name in ('train', 'test', 'validation')
)


In [None]:
# Load the three splits with the project helper. The result is unpacked in the
# same (train, test, validation) order as the paths passed in.
# NOTE(review): assumes load_datasets returns the splits in argument order — confirm.
(
    train_data,
    test_data,
    val_data,
) = load_datasets(train_path, test_path, val_path)

In [None]:
# Preview the first five rows of the training split.
train_data.head(n=5)

## Exploratory Data Analysis

In [None]:
# Draw a single random training row; `sample` is reused by the next cell to
# show the matching highlights.
sample = train_data.sample(n=1)

# Display the full article text as a one-element list.
sample['article'].tolist()

In [None]:
# Display the highlights of the sampled row as a one-element list.
sample['highlights'].tolist()

In [None]:
# Number of rows in the training split.
train_data.shape[0]

### Counts and Lengths

In [None]:
# Work on a 10% random subsample of the training split for the EDA below.
# A fixed random_state makes the subsample, and every plot derived from it,
# reproducible across Restart & Run All.
eda_data = train_data.sample(frac=0.1, random_state=42)

In [None]:
# Histogram of article length measured in characters.
article_char_counts = eda_data['article'].str.len()

fig, ax = plt.subplots()
ax.hist(article_char_counts, bins=50, edgecolor='white')
ax.set_xlabel("Number of Characters in Article")
ax.set_ylabel("Number of Articles")
ax.set_title("Distribution of Characters per Article")
plt.show()

In [None]:
# Histogram of article length measured in whitespace-separated words.
article_word_counts = eda_data['article'].str.split().map(len)

fig, ax = plt.subplots()
ax.hist(article_word_counts, bins=50, edgecolor='white')
ax.set_xlabel("Number of Words in Article")
ax.set_ylabel("Number of Articles")
ax.set_title("Distribution of Words per Article")
plt.show()

In [None]:
# Fetch the sentence-tokenizer data used by nltk.sent_tokenize in later cells.
# Recent NLTK releases load 'punkt_tab' instead of the pickled 'punkt' models,
# so both are downloaded to keep the notebook working across NLTK versions.
for tokenizer_resource in ('punkt', 'punkt_tab'):
    nltk.download(tokenizer_resource)

In [None]:
# Histogram of article length measured in sentences (NLTK sentence tokenizer).
article_sentence_counts = [
    len(nltk.sent_tokenize(article_text))
    for article_text in eda_data['article']
]

fig, ax = plt.subplots()
ax.hist(article_sentence_counts, bins=50, edgecolor='white')
ax.set_xlabel("Number of Sentences in Article")
ax.set_ylabel("Number of Articles")
ax.set_title("Distribution of Sentences per Article")
plt.show()

In [None]:
def _mean_word_length(text):
    """Return the average length of the whitespace-separated words in `text`.

    A text with no words yields NaN, because np.mean of an empty list is NaN.
    """
    word_lengths = [len(token) for token in text.split()]
    return np.mean(word_lengths)


# Add the per-article average word length as a new column, then preview it.
eda_data['mean_word_length'] = eda_data['article'].map(_mean_word_length)
eda_data.head(10)

In [None]:
# Boxplot of the per-article mean word length computed in the previous cell.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=eda_data, y='mean_word_length', ax=ax)
ax.set_ylabel("Mean Word Length")
ax.set_title("Boxplot of Mean Word Length per Article")
plt.show()

In [None]:
# Histogram of summary (highlights) length measured in characters.
summary_char_counts = eda_data['highlights'].str.len()

fig, ax = plt.subplots()
ax.hist(summary_char_counts, bins=50, edgecolor='white')
ax.set_xlabel("Number of Characters in Article Summary")
ax.set_ylabel("Number of Articles")
ax.set_title("Distribution of Characters per Article Summary")
plt.show()

In [None]:
# Histogram of summary (highlights) length in whitespace-separated words.
summary_word_counts = eda_data['highlights'].str.split().map(len)

fig, ax = plt.subplots()
ax.hist(summary_word_counts, bins=50, edgecolor='white')
ax.set_xlabel("Number of Words in Article Summary")
ax.set_ylabel("Number of Articles")
ax.set_title("Distribution of Words per Article Summary")
plt.show()

In [None]:
# Histogram of summary (highlights) length in sentences (NLTK tokenizer).
summary_sentence_counts = [
    len(nltk.sent_tokenize(summary_text))
    for summary_text in eda_data['highlights']
]

fig, ax = plt.subplots()
ax.hist(summary_sentence_counts, bins=50, edgecolor='white')
ax.set_xlabel("Number of Sentences in Article Summary")
ax.set_ylabel("Number of Articles")
ax.set_title("Distribution of Sentences per Article Summary")
plt.show()

## Term Frequency

In [None]:
# Remove the helper column added for the word-length analysis.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it after the column is gone no longer raises
# KeyError.
eda_data = eda_data.drop(columns='mean_word_length', errors='ignore')