In [1]:
!pip install transformers




In [None]:
import pandas as pd
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv

In [None]:
news = pd.read_csv('/content/drive/MyDrive/Textgen/news_uk.csv')
news.head()

Unnamed: 0.1,Unnamed: 0,title,description
0,0,Manchester Arena bombing: Saffie Roussos's par...,The parents of the Manchester Arena bombing's ...
1,1,Ukraine war: PM to hold talks with world leade...,Boris Johnson is to meet the Canadian and Dutc...
2,2,Ukraine war: UK grants 50 Ukrainian refugee vi...,"The home secretary says she is ""surging capaci..."
3,3,"Covid: Fourth jab for Scotland's vulnerable, a...",Five things you need to know about the coronav...
4,4,'They thought he was too young for heart disease',A mother whose son died from heart disease urg...


In [None]:
data = news[news['description'].apply(lambda x: len(x.split(' ')) < 250)]

In [None]:
len(data)

6024

In [None]:

test_set = data.sample(n = 100)
data = data.loc[~data.index.isin(test_set.index)]

test_set = test_set.reset_index()
data = data.reset_index()
data[:1000]

Unnamed: 0.1,index,Unnamed: 0,title,description
0,0,0,Manchester Arena bombing: Saffie Roussos's par...,The parents of the Manchester Arena bombing's ...
1,1,1,Ukraine war: PM to hold talks with world leade...,Boris Johnson is to meet the Canadian and Dutc...
2,2,2,Ukraine war: UK grants 50 Ukrainian refugee vi...,"The home secretary says she is ""surging capaci..."
3,3,3,"Covid: Fourth jab for Scotland's vulnerable, a...",Five things you need to know about the coronav...
4,4,4,'They thought he was too young for heart disease',A mother whose son died from heart disease urg...
...,...,...,...,...
995,1009,1009,The Irish artist who charmed East German children,Belfast-born illustrator Elizabeth Shaw lived ...
996,1010,1010,Platinum Jubilee: Love and respect for the Que...,The Platinum Jubilee saw a huge outpouring of ...
997,1011,1011,Boris Johnson: The letterati and Conservative ...,"After a long weekend of pomp and pageantry, th..."
998,1012,1012,In pictures: Platinum Jubilee pageant,The Royal Family were among tens of thousands ...


In [None]:
#For the test set only, keep last 10 words in a new column, then remove them from original column
test_set['real_end'] = test_set['description'].str.split().str[4:].apply(' '.join)
test_set['description'] = test_set['description'].str.split().str[:4].apply(' '.join)
test_set['description'].head()

0          Detectives still do not
1    "Several people were freaking
2       Residents are described as
3            There is pressure for
4             The BBC's Ione Wells
Name: description, dtype: object

In [None]:
test_set.head()

Unnamed: 0.1,index,Unnamed: 0,title,description,real_end
0,3183,3183,Woman's body found in Bolton cellar mystery re...,Detectives still do not,know the identity of the woman whose remains w...
1,1874,1874,Eurotunnel Le Shuttle: Passengers stuck for ho...,"""Several people were freaking","out,"" said one traveller, after a Calais to Fo..."
2,3834,3834,Aberhosan: Farmer dies and son seriously injur...,Residents are described as,"being in shock, with a concert cancelled as a ..."
3,1711,1711,Tory leadership: Cost of living crisis focus f...,There is pressure for,candidates to explain what they will do to hel...
4,2684,2684,Rishi Sunak: Your questions answered,The BBC's Ione Wells,answers your questions on the UK's next prime ...


In [None]:
%cd "/content/drive/MyDrive/Textgen"
#test_set.to_csv()
path_file = "/content/drive/MyDrive/Textgen/test_newsuk.csv"
test_set.to_csv(path_file)

/content/drive/MyDrive/Textgen


In [None]:
path_file ="/content/drive/MyDrive/Textgen/news_data.csv"
data.to_csv(path_file)

In [None]:
class News(Dataset):

    def __init__(self, end_text, truncate=False, gpt2_type="gpt2", max_length=1024):

        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.news = []

        for item in data['description']:
          encoded_text = self.tokenizer.encode(f"<|{end_text}|>{item[:max_length]}")
          self.news.append(torch.tensor(encoded_text))

        if truncate:
            self.news = self.news[:30000]
        self.news_count = len(self.news)

    def __len__(self):
        return self.news_count

    def __getitem__(self, item):
        return self.news[item]

In [None]:
dataset = News(data['description'], truncate=True, gpt2_type="gpt2")


In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [None]:
#Accumulated batch size (since GPT2 is so big)
def load_tensor(input_ten, remain_ten, max_seqlen):
    if remain_ten is None:
        return input_ten, True, None
    if input_ten.size()[1] + remain_ten.size()[1] > max_seqlen:
        return remain_ten, False, input_ten
    else:
        remain_ten = torch.cat([input_ten, remain_ten[:, 1:]], dim=1)
        return remain_ten, True, None


In [1]:
def gpt_train(
    data, model, tokenizer,
    batch_size=16, epochs=6, lr=1e-5,
    max_seq_len=500, warmup_steps=100,
    gpt2_type="gpt2", output_dir=".", output_prefix="news",
    test_mode=False,save_model_on_epoch=False,
):

    device=torch.device("cuda")
    model = model.cuda()
    model.train()

    optimizer = AdamW(model.parameters(), lr=1e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1
    )
    loss = 0
    news_loader = DataLoader(data, batch_size=batch_size, shuffle=True)
    accumulating_batch_count = 0
    in_tensor = None

    for i in range(epochs):

        print(f" Epoch Training {i}")
        print(loss)
        for idx, x in tqdm(enumerate(news_loader)):
            (input_tensor, carry_tensor, remainder) = load_tensor(x, in_tensor, 768)

            if carry_tensor and idx != len(news_loader) - 1:
                continue

            input_tensor = input_tensor.to(device)
            output = model(input_tensor, labels=input_tensor)
            loss = output[0]
            loss.backward()

            if (accumulating_batch_count % batch_size) == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()

            accumulating_batch_count += 1
            input_tensor = None
        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{i}.pt"),
            )
    return model

In [None]:
model = gpt_train(dataset, model, tokenizer)



Training epoch 0
0


5924it [05:42, 17.32it/s]


Training epoch 1
tensor(0.4862, device='cuda:0', grad_fn=<NllLossBackward0>)


5924it [05:40, 17.37it/s]


Training epoch 2
tensor(0.3885, device='cuda:0', grad_fn=<NllLossBackward0>)


5924it [05:40, 17.39it/s]


Training epoch 3
tensor(0.3338, device='cuda:0', grad_fn=<NllLossBackward0>)


5924it [05:40, 17.38it/s]


Training epoch 4
tensor(0.4080, device='cuda:0', grad_fn=<NllLossBackward0>)


5924it [05:40, 17.40it/s]


In [None]:
import torch
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [None]:
model = torch.load('/content/drive/MyDrive/Textgen/news_uk.pt')

In [None]:
def generate_gpt(
    model,
    tokenizer,
    prompt,
    prompt_count=1,
    word_length=200, #maximum number of words
    top_p=0.8,
    temp=0.8,
):
    model.eval()
    num_generated = 0
    gen_list = []

    filter_value = -float("Inf")

    with torch.no_grad():

        for idx in trange(prompt_count):

            prompt_finished = False
            input_ten = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)

            for i in range(word_length):
                output_ten = model(input_ten, labels=input_ten)
                loss, logits = output_ten[:2]
                logits = logits[:, -1, :] / (temp if temp > 0 else 1.0)

                sortlogit, sortind = torch.sort(logits, descending=True)
                cum_prob = torch.cumsum(F.softmax(sortlogit, dim=-1), dim=-1)

                sortind_remove = cum_prob > top_p
                sortind_remove[..., 1:] = sortind_remove[
                    ..., :-1
                ].clone()
                sortind_remove[..., 0] = 0

                ind_remove = sortind[sortind_remove]
                logits[:, ind_remove] = filter_value

                next_pred = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((input_ten, next_pred), dim=1)

                if next_pred in tokenizer.encode("<|endoftext|>"):
                    prompt_finished = True

                if prompt_finished:

                    num_generated = num_generated + 1

                    output_list = list(generated.squeeze().numpy())
                    output_text = tokenizer.decode(output_list)
                    gen_list.append(output_text)
                    break

            if not prompt_finished:
              output_list = list(generated.squeeze().numpy())
              output_text = f"{tokenizer.decode(output_list)}<|endoftext|>"
              gen_list.append(output_text)

    return gen_list





In [None]:
def text_generation(test_data):
     x = generate_gpt(model.to('cpu'), tokenizer, test_data, entry_count=1)
     return x

In [None]:
generated_new4 = generate(model.to('cpu'), tokenizer, "UK government are")
generated_new4

100%|██████████| 1/1 [01:27<00:00, 87.12s/it]


['UK government are also pressing ahead with plans to build an Olympic stadium on the site of the former site of the World Cup.\n\nIt is understood that the new stadium will not be built until 2019.\n\nMr Cameron said: "We are going to have to find out how we can best put forward a vision for the future of the UK and what we want to achieve in the long term.\n\n"We will have to decide on what the future of the UK looks like in the coming years, but we will also have to look at the impact on the economy and what impact it will have on the wider economy.\n\n"We will need to look at what\'s needed to make sure we have a safe and secure future for our people and the future of the UK.\n\n"We need to have a government that is committed to delivering on the promises we made.\n\n"I\'m sure the government is prepared to say no to any proposals to build a new stadium or a new<|endoftext|>']

In [None]:
file_loc = './gen_4.txt'
with open(file_loc, "w") as file:
    for item in generated_new4:
        file.write(item
        )

In [None]:
generated_temp_1 = generate(model.to('cpu'), tokenizer, "Five things")
generated_temp_1

100%|██████████| 1/1 [01:30<00:00, 90.64s/it]


['Five things you need to know about the new version of the game:\n\n1. The game is now available for PC and Mac.\n\n2. The game is now available for PlayStation 4 and Xbox One.\n\n3. The game is now available for Xbox One and PlayStation 4.\n\n4. The game is now available for PC and Mac.\n\n5. The game is now available for PlayStation 4 and Xbox One.\n\n6. The game is now available for PC and Mac.\n\n7. The game is now available for PlayStation 4 and Xbox One.\n\n8. The game is now available for PC and Mac.\n\n9. The game is now available for PlayStation 4 and Xbox One.\n\n10. The game is now available for PC and Mac.\n\n11. The game is now available for PlayStation 4 and Xbox One.\n\n12. The game is now available for Contains content from the original game.\n\n13. The game is now available']

In [None]:
generated_old_model

['Rules introduced after the financial crisis in 2009. Chancellor George Osborne has for this year said public spending on welfare last week has been set too high and a further reduction is likely on target.<|endoftext|>']

In [None]:
generated_eig_twh

['Rules introduced after the financial crisis.\n\nIn the new report, the Treasury says the UK\'s spending is "expected to increase by nearly £4.2 billion over the next three years and the UK is expected to pay a net £1.7bn to the EU over the next five years".\n\nIt says the UK will also have to "extend the programme for a third of its workforce by 2020".\n\nThe report also warns the UK is expected to lose up to 25,000 jobs by 2020.\n\nThe UK is also expected to lose £5.6bn over the next five years, the Treasury says.\n\nThe Treasury says the UK is "engaging in an ambitious new policy to secure a new European financial system and a more secure and stable financial system in which the UK can focus its economic energy on the 21st Century".\n\nThe report says: "The UK is working hard to deliver on its commitment to deliver a balanced and cost-effective fiscal policy. The UK']

In [None]:
file_loc = './gen_fiv.txt'
with open(file_loc, "w") as file:
    for item in generated_fiv:
        file.write(item
        )

In [None]:
generated_eig

['Rules introduced after the financial crisis.\n\nThe Government\'s Brexit strategy is to push through the changes to the law before the next general election, which is planned for 2019.\n\nThe first phase of the plan is to bring in new laws to help businesses and individuals negotiate a new deal.\n\nThe government says it is now ready to work with the government on new regulations and a "no-deal" policy for Brexit.\n\nImage copyright PA Image caption Theresa May says she is "working on a new plan"']

In [None]:
generated_fiv

['Rules introduced after the financial crisis, which has left many of the country\'s banks struggling, are being used to help the struggling.\n\nThe government says it is "working closely with the UK government and the Bank of England to ensure that the UK\'s financial services sector is able to meet the needs of the UK economy".\n\nThe government says it is "working closely with the UK government and the Bank of England to ensure that the UK\'s financial services sector is able to meet the needs of the UK economy".\n\n']

In [None]:
#Function to generate multiple sentences. Test data should be a dataframe
def text_generation(test_data):
  generated_news = []
  for i in range(0, 10):#(len(test_data)):
    x = generate(model.to('cpu'), tokenizer, test_data['description'][i], entry_count=1)
    generated_news.append(x)
  return generated_news

In [None]:
test_set =pd.read_csv('./test_set.csv')
test_set.head()

Unnamed: 0.2,Unnamed: 0.1,index,Unnamed: 0,title,description,real_end
0,0,2259,11398,UK banking rules face biggest shake-up in more...,Rules introduced after the financial crisis in...,limiting risks will be eased.
1,1,2356,12485,Prince Harry and Meghan: Where do they get the...,The Duke of Sussex's memoir Spare is published...,key claims have been leaked.
2,2,4214,2952,Buffalo shooting: Gunman deliberately sought b...,Questions are being asked about how the suspec...,he was known to authorities.
3,3,1819,21194,Subway agrees sale to Dunkin' and Baskin-Robbi...,The sandwich maker has been bought by Roark Ca...,six decades of family ownership.
4,4,3226,8804,Dancers' moves help to power Glasgow music venue,Heat energy is being captured from people danc...,venue reduce its carbon emissions.


In [None]:
generated_news = text_generation(test_set)

100%|██████████| 1/1 [00:02<00:00,  2.79s/it]
100%|██████████| 1/1 [00:02<00:00,  3.00s/it]
100%|██████████| 1/1 [00:03<00:00,  3.40s/it]
100%|██████████| 1/1 [00:01<00:00,  1.32s/it]
100%|██████████| 1/1 [00:03<00:00,  3.68s/it]
100%|██████████| 1/1 [00:01<00:00,  1.35s/it]
100%|██████████| 1/1 [00:03<00:00,  3.48s/it]
100%|██████████| 1/1 [00:02<00:00,  2.90s/it]
100%|██████████| 1/1 [00:02<00:00,  2.83s/it]
100%|██████████| 1/1 [00:01<00:00,  1.79s/it]


In [None]:
generated_news

[['Rules introduced after the financial crisis in 2008 aimed at eliminating the income tax, but the legislation has faced fierce opposition in other EU countries.<|endoftext|>'],
 ["The Duke of Sussex's memoir Spare is published on 10 January, but many have predicted it will be a cautionary tale, particularly after the risk of obesity being ascribed to"],
 ['Questions are being asked about how the suspect carried out the attack when he was seen in one of the tunnel entrance corridors.\n\nThe victims were taken to hospital but'],
 ['The sandwich maker has been bought by Roark Capital, ending several of its old lives.<|endoftext|>'],
 ['Heat energy is being captured from people dancing to help a Glasgow homeless man.\n\nJoel McAteer, 24, who was driving from York, Scotland'],
 ['Fans in Saudi Arabia have been unable to watch all of the World Cup after a streaming service was disrupted in the capital, Saudi Arabia.<|endoftext|>'],
 ['Tiger Woods says he is "playing to win" when he compete

In [None]:
len(generated_news)

In [None]:
path_file ='/content/drive/MyDrive/Textgen/GPT_gen2.txt'
with open(path_file, "w") as file:
    for item in generated_news:
        for sublist in item:
          file.write(sublist +"\n")

In [None]:
#Run the functions to generate the news
generated_news = text_generation(test_set)

100%|██████████| 1/1 [00:02<00:00,  2.35s/it]
100%|██████████| 1/1 [00:03<00:00,  3.15s/it]
100%|██████████| 1/1 [00:03<00:00,  3.32s/it]
100%|██████████| 1/1 [00:00<00:00,  1.22it/s]
100%|██████████| 1/1 [00:03<00:00,  3.09s/it]
100%|██████████| 1/1 [00:01<00:00,  1.43s/it]
100%|██████████| 1/1 [00:01<00:00,  1.20s/it]
100%|██████████| 1/1 [00:03<00:00,  3.06s/it]
100%|██████████| 1/1 [00:02<00:00,  2.80s/it]
100%|██████████| 1/1 [00:03<00:00,  3.31s/it]


In [None]:
generated_news

[['Watch as Matt Burton performs a one-handed through-the-legs pass to set up Josh Addo-Carr for a fourth try in an impressive 1-0 win for the Falcons.<|endoftext|>'],
 ['His mother says she felt like "transporting with joy" when Sir Mo Farah appeared on the show.\n\nHer father was never able to find out.<|endoftext|>'],
 ['The Britain\'s Got Talent judge apologises for making comments that "didn\'t fall under the principle of informed consent".\n\nBarry Davis is now<|endoftext|>'],
 ['Some may quit NHS contracts over threats of fines, costs and legal action.<|endoftext|>'],
 ['When ambulance crews bring patients to hospital they are meant to be able to handover their patients to those without a hospital licence, says Gillian Triggs, the NHS minister.\n\nThe crisis<|endoftext|>'],
 ["France breeze into the Euro 2022 quarter-finals with a game to spare by beating Belgium at Rotherham's New York Stadium to take a comfortable 5-0 lead.<|endoftext|>"],
 ['Tonga put one foot in the Rugby Le

In [None]:
import statistics
from nltk.translate.bleu_score import sentence_bleu

scores=[]

for i in range(len(test_set)):
  reference = test_set['True_end_lyrics'][i]
  candidate = test_set['Generated_lyrics'][i]
  scores.append(sentence_bleu(reference, candidate))

statistics.mean(scores)