In [1]:
import os
import numpy as np
import pandas as pd
import nltk.tokenize
import re
import random
from nltk.util import ngrams
import tqdm
from nltk.tokenize import RegexpTokenizer
import torch

In [2]:
from transformers import LEDTokenizer, LongformerTokenizer, LEDForConditionalGeneration
import torch
from transformers import TrainingArguments, Trainer

In [3]:
def read_text(path):
    files= os.listdir(path) 
    results = {'text':[], 'highlight': [], 'highlight_1':[], 'highlight_2':[], 'highlight_3':[], 'highlight_4':[]}
    for file in tqdm.tqdm(files):
        if not os.path.isdir(file):
            file_name = path + '/'+file
            with open(file_name, encoding="utf-8") as f:
                text = (f.read()).replace('\n', " ").replace("(CNN)", "").replace("--", "")
                if len(text)<1000:
                    continue
                text_highlights = text.split("@highlight")
                final_text = text_highlights[0]
                results['text'].append(final_text.strip())
                all_highlight = ""
                for i in range(1, 5):
                    key = 'highlight_'+str(i)
                    if i<len(text_highlights):
                        results[key].append(text_highlights[i])
                        all_highlight += text_highlights[i] + '.'
                    else:
                        results[key].append("")
                results['highlight'].append(all_highlight.strip())
    return pd.DataFrame(results)

In [4]:
train_dir = 'train_data'
test_dir = 'test_data'
test_data = read_text(test_dir)
train_data = read_text(train_dir)

100%|████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 6154.55it/s]
100%|██████████████████████████████████████████████████████████████████████████| 40000/40000 [00:06<00:00, 6660.43it/s]


In [5]:
test_data

Unnamed: 0,text,highlight,highlight_1,highlight_2,highlight_3,highlight_4
0,It's official: U.S. President Barack Obama wan...,Syrian official: Obama climbed to the top of t...,Syrian official: Obama climbed to the top of...,Obama sends a letter to the heads of the Hou...,Obama to seek congressional approval on mili...,"Aim is to determine whether CW were used, no..."
1,This week the Supreme Court heard two historic...,Ken Klukowski: Cases heard by Supreme Court co...,Ken Klukowski: Cases heard by Supreme Court ...,He says there are questions of whether cases...,"If court issues sweeping ruling, it could de...",Klukowski: Gay marriage is such a new phenom...
2,"Zango Town, Liberia At the gravesite in a no...",Liberia is one of the countries worst-hit by t...,Liberia is one of the countries worst-hit by...,Entire towns and villages have been placed i...,Health workers must ensure those who die of ...,"""Running away from Ebola is not a solution ..."
3,The big winners of this Formula One season cou...,The first race of the 2014 Formula One season ...,The first race of the 2014 Formula One seaso...,"Turbo engines are back in the sport, with ea...",Former F1 winner Jody Scheckter expects F1 t...,"For the first time in the sport's history, d..."
4,If that car parked in Harvard Yard is a rockin...,Harvard bans all romantic relationships betwee...,Harvard bans all romantic relationships betw...,Policy comes on heels of investigation into ...,,
...,...,...,...,...,...,...
1977,WASHINGTON The Obama administration is givin...,Departure of General Motors' CEO part of gover...,Departure of General Motors' CEO part of gov...,"GM official: White House signaled that ""new ...",Officials: GM to get 60 days of financing; C...,"GM, Chrysler were told to prove viability to..."
1978,NFL star Adrian Peterson pleaded no contest Tu...,Adrian Peterson says he loves his son and regr...,Adrian Peterson says he loves his son and re...,DA says the NFL star received no special tre...,"Peterson is on probation for 2 years, will m...","He is still on the Vikings roster, but has b..."
1979,(EW.com) Moms and Dads: Get your kids to take...,"Movie critics have crowned ""The Lego Movie"" as...","Movie critics have crowned ""The Lego Movie"" ...",Reviews are pegging it as a cross between Pi...,,
1980,The man who made Formula One's bravest comebac...,"Niki Lauda says Ferrari has made a ""very good""...","Niki Lauda says Ferrari has made a ""very goo...",The three-time world champion says it will a...,He warns managing Raikkonen and Alonso in 20...,Lauda says bringing Lewis Hamilton to Merced...


In [6]:
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")

In [7]:
x_train = list(train_data["text"])
y_train = list(train_data["highlight"])

x_test= list(test_data["text"])
y_test = list(test_data["highlight"])

x_train_tokenized = tokenizer(x_train, padding=True, truncation=True, return_tensors="pt", max_length = 1536)
y_train_tokenized = tokenizer(y_train, padding=True, truncation=True, return_tensors="pt", max_length = 256)
x_test_tokenized = tokenizer(x_test, padding=True, truncation=True, return_tensors="pt", max_length = 1536)
y_test_tokenized = tokenizer(y_test, padding=True, truncation=True, return_tensors="pt", max_length = 256)

In [8]:
tokenizer.vocab_size

50265

In [9]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.input_ids = encodings.input_ids
        self.attention_mask = encodings.attention_mask
        result = []
        for label in labels.input_ids:
            tmp =list(label)
            result.append([-100 if token_id == tokenizer.pad_token_id else token_id for token_id in label])
        self.labels = result

    def __getitem__(self, idx):
        item = {}
        item["input_ids"] = self.input_ids[idx]
        item["attention_mask"] = self.attention_mask[idx]
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.input_ids)

In [10]:
train_dataset = Dataset(x_train_tokenized, y_train_tokenized)
test_dataset = Dataset(x_test_tokenized, y_test_tokenized)

In [11]:
torch.save(train_dataset, "LED_dataset/train_dataset.pth")
torch.save(test_dataset, "LED_dataset/test_dataset.pth")

FileNotFoundError: [Errno 2] No such file or directory: 'LED_dataset/train_dataset.pth'