In [None]:
from google.colab import drive
drive.mount('/content/drive')
import os
project_dir = '/content/drive/My Drive/cnn-dailymail-summarizer'
os.chdir(project_dir)

!pip install -r requirements.txt

In [None]:

import pandas as pd
from cnn_dailymail_news_text_summarizer.dataset import load_datasets, remove_punctuation, preprocess_text, tokenize, save_tokenized_datasets, load_tokenized_datasets
from cnn_dailymail_news_text_summarizer.plots import plot_num_characters, plot_num_words, plot_num_sentences, plot_mean_word_length, create_corpus, plot_most_frequent_stopwords, plot_most_frequent_words, get_top_ngram
from cnn_dailymail_news_text_summarizer.training import train_model
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
import nltk
from collections import Counter
from collections import defaultdict
import re
from sklearn.feature_extraction.text import CountVectorizer
from transformers import BartForConditionalGeneration, BartTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import Dataset, concatenate_datasets
import torch
import evaluate

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device_name = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
print("Using {}.".format(device_name))

## Loading Data

In [None]:

# Locations of the raw CSV splits inside the project folder.
train_path, test_path, val_path = (
    os.path.join(project_dir, f'data/raw/cnn_dailymail/{split}.csv')
    for split in ('train', 'test', 'validation')
)


In [None]:
# Read the three raw splits; later cells use the results as pandas DataFrames.
train_data, test_data, val_data = load_datasets(
    train_path,
    test_path,
    val_path,
)

In [None]:
# Peek at the first five training rows.
train_data.head(n=5)

## Exploratory Data Analysis



In [None]:
# Draw one random training example and show its article text.
sample = train_data.sample(n=1)
sample['article'].tolist()

In [None]:
# Reference summary ("highlights") for the same sampled row.
sample['highlights'].tolist()

In [None]:
# Number of rows in the training split.
train_data.shape[0]

### Counts and Lengths

In [None]:
# Explore a 10% subsample of the training split. The seed is fixed so the EDA
# figures below are reproducible across kernel restarts.
eda_data = train_data.sample(frac=0.1, random_state=42)

In [None]:
# Project plotting helper; presumably shows the character-count distribution of
# the 'article' column (implementation lives in the plots module — confirm there).
plot_num_characters(eda_data, 'article')

In [None]:
# Project plotting helper; presumably shows the word-count distribution of 'article'.
plot_num_words(eda_data, 'article')

In [None]:
# Fetch NLTK's 'punkt' resource before the sentence-count plots are run.
# NOTE(review): presumably required by plot_num_sentences for sentence splitting;
# recent NLTK releases may need 'punkt_tab' instead — confirm against the pinned version.
nltk.download('punkt')

In [None]:
# Project plotting helper; presumably shows the sentence-count distribution of 'article'.
plot_num_sentences(eda_data, 'article')

In [None]:
# Project plotting helper for mean word length in 'article'.
# NOTE(review): appears to add a 'mean_word_length' column to eda_data as a side
# effect (a later cell drops that column) — confirm in the plots module.
plot_mean_word_length(eda_data, 'article')

In [None]:
# Same character-count helper, applied to the reference summaries ('highlights').
plot_num_characters(eda_data, 'highlights')

In [None]:
# Same word-count helper, applied to the reference summaries ('highlights').
plot_num_words(eda_data, 'highlights')

In [None]:
# Same sentence-count helper, applied to the reference summaries ('highlights').
plot_num_sentences(eda_data, 'highlights')

### Term Frequency

In [None]:
# Remove the helper column left on eda_data by the mean-word-length plot
# (presumably added by plot_mean_word_length — confirm in the plots module).
# errors='ignore' keeps the cell idempotent: re-running it, or running it before
# the plot cell, no longer raises KeyError.
eda_data = eda_data.drop(columns='mean_word_length', errors='ignore')

In [None]:
# Fetch NLTK's stopword lists; the next cell reads the English list from them.
nltk.download('stopwords')

In [None]:
# English stopwords as a set, so membership checks are cheap.
stop = {word for word in nltk.corpus.stopwords.words('english')}

In [None]:
# Project plotting helper; presumably charts which stopwords occur most often in
# the sampled text (column choice is decided inside the helper — confirm there).
plot_most_frequent_stopwords(eda_data, stop)

In [None]:
# Project plotting helper; presumably charts the most frequent words once the
# stopwords in `stop` are excluded — confirm in the plots module.
plot_most_frequent_words(eda_data, stop)

### N-gram Frequency

In [None]:
# Shrink the EDA sample again (10% of the current subsample) before the n-gram
# cells below. Seeded so the n-gram results are reproducible.
# NOTE: this rebinds eda_data from itself, so every re-run of this cell shrinks
# the sample further; re-run the cell that creates eda_data first to reset it.
eda_data = eda_data.sample(frac=0.1, random_state=42)

In [None]:
# Most frequent 2-grams in the sample; the stopword set is passed as a list.
# NOTE(review): the meaning of the last argument as the n-gram size is inferred
# from the helper's name — confirm in the plots module.
get_top_ngram(eda_data, list(stop), 2)

In [None]:
# Same helper with 3 as the last argument (presumably trigrams — confirm in the plots module).
get_top_ngram(eda_data, list(stop), 3)

In [None]:
# Same helper with 5 as the last argument (presumably 5-grams — confirm in the plots module).
get_top_ngram(eda_data, list(stop), 5)

## Data Preprocessing

In [None]:
# Hugging Face Hub id of the pretrained model; the tokenizer and the baseline
# model below are both loaded from it.
checkpoint = 'facebook/bart-base'

In [None]:
# Tokenizer matching the pretrained checkpoint.
tokenizer = BartTokenizer.from_pretrained(
    pretrained_model_name_or_path=checkpoint,
)

In [None]:
# Show the tokenizer output for the first training article.
# .iloc[0] selects by position, so this keeps working even if train_data has
# been shuffled or filtered and no longer carries the index label 0.
tokenizer(train_data['article'].iloc[0])

In [None]:
# Pick one random test article to summarize with the pretrained model.
sample = test_data.sample(n=1)
sample_text = sample.iloc[0]['article']
print(sample_text)

In [None]:
# Baseline: summarize the sampled article with the pretrained checkpoint,
# before any fine-tuning, and print it next to the reference summary.
model = BartForConditionalGeneration.from_pretrained(checkpoint)
model.to(device_name)

# Truncate to 1024 tokens and move every tensor to the model's device.
inputs = tokenizer(sample_text, max_length=1024, return_tensors='pt', truncation=True)
inputs = {key: value.to(device_name) for key, value in inputs.items()}

# Pass the full encoding so the tokenizer's attention mask reaches generate()
# instead of being dropped.
summary_ids = model.generate(**inputs, num_beams=4, max_length=128, early_stopping=True)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Predicted summary: ", summary)
print("")
print("Actual summary: ", sample['highlights'].iloc[0])

In [None]:
#train_data = train_data.sample(frac=0.2, random_state=42)
#val_data = val_data.sample(frac=0.2, random_state=42)
#test_data = test_data.sample(frac=0.2, random_state=42)

In [None]:
#train_data = tokenize(train_data, tokenizer)

In [None]:
#print(train_data)

In [None]:
#val_data = tokenize(val_data, tokenizer)

In [None]:
#test_data = tokenize(test_data, tokenizer)

In [None]:
# Folder inside the project where the tokenized datasets are stored.
processed_path = os.path.join(project_dir, 'data/processed/')


In [None]:
# The save call is kept commented out; this cell only loads previously saved
# tokenized datasets from processed_path.
#save_tokenized_datasets(train_data, test_data, val_data, processed_path)
# train_data and val_data are rebound to the tokenized versions. The middle
# return value is discarded, so test_data remains the raw DataFrame that the
# evaluation cells sample articles from.
train_data, _, val_data = load_tokenized_datasets(processed_path)

## Fine-tuning

In [None]:
# Dynamic-padding collator for seq2seq batches.
# `model` must be a model object, not the checkpoint string: the collator only
# uses it to call `prepare_decoder_input_ids_from_labels`, which a str lacks,
# so the previous `model=checkpoint` was silently ignored. `model` here is the
# BART instance loaded in the baseline cell above.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)

In [None]:
# Load the ROUGE metric from the `evaluate` library; it is passed to train_model below.
rouge = evaluate.load("rouge")

In [None]:
# Fine-tuning configuration.
training_args = Seq2SeqTrainingArguments(
    output_dir='./model',  # relative to the working directory set at the top of the notebook
    num_train_epochs=11,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,  # 16 * 4 = 64 sequences per optimizer step per device
    warmup_steps=100,
    weight_decay=0.01,
    logging_steps=10,
    # Evaluate and checkpoint on the same schedule, as load_best_model_at_end requires.
    evaluation_strategy="epoch",
    save_total_limit=2,
    save_strategy="epoch",
    load_best_model_at_end=True,
    predict_with_generate=True,  # run generate() during evaluation so text metrics can be computed
    # Mixed precision needs a GPU; follow the same availability check used for
    # device selection so the CPU fallback does not fail at construction time.
    fp16=torch.cuda.is_available(),
    remove_unused_columns=True,
    save_safetensors=False
)


In [None]:
# Launch fine-tuning through the project's training helper.
# NOTE(review): the keyword `checkpoint=True` is a flag of train_model and is
# unrelated to the global `checkpoint` string; presumably it resumes from a saved
# checkpoint — confirm in the training module.
train_model(train_data, val_data, tokenizer, data_collator, training_args, device_name, rouge, checkpoint=True)

In [None]:
# Release cached GPU memory held by PyTorch's allocator before evaluation.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

## Evaluation

In [None]:
# Load the fine-tuned weights from a training checkpoint.
# The path is built from project_dir (training wrote to ./model relative to it)
# rather than repeating the Drive mount path with a different spelling.
finetuned_checkpoint = os.path.join(project_dir, 'model', 'checkpoint-9867')
model = BartForConditionalGeneration.from_pretrained(finetuned_checkpoint)
# Use the device selected at the top of the notebook instead of hard-coding
# 'cuda', so this cell also runs on a CPU-only runtime.
model.to(device_name)

In [None]:
# Pick one random test article to summarize with the fine-tuned model.
sample = test_data.sample(n=1)
sample_text = sample.iloc[0]['article']
print(sample_text)

In [None]:
# Summarize the sampled article with the fine-tuned model and print it next to
# the reference summary.
inputs = tokenizer(sample_text, max_length=1024, return_tensors='pt', truncation=True)
# Send the tensors to whichever device the model actually lives on, so inputs
# and weights can never end up on different devices.
inputs = {key: value.to(model.device) for key, value in inputs.items()}
# Pass the full encoding so the tokenizer's attention mask reaches generate()
# instead of being dropped.
summary_ids = model.generate(**inputs, num_beams=4, max_length=128, early_stopping=True)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Predicted summary: ", summary)
print("")
print("Actual summary: ", sample['highlights'].iloc[0])