In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import io


from transformers import GPT2Tokenizer, T5ForConditionalGeneration
import wandb

In [2]:
class Config:
    def __init__(self):
        self.actuals = []
        self.actuals_f = False

In [3]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.text
        self.ctext = self.data.ctext

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        ctext = str(self.ctext[index])
        ctext = ' '.join(ctext.split())

        text = str(self.text[index])
        text = ' '.join(text.split())

        source = self.tokenizer.batch_encode_plus([ctext], max_length= self.source_len, pad_to_max_length=True,return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([text], max_length= self.summ_len, pad_to_max_length=True,return_tensors='pt')

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long), 
            'source_mask': source_mask.to(dtype=torch.long), 
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [4]:
from time import sleep
def train(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    for _,data in enumerate(loader, 0):
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        labels = y[:, 1:].clone().detach()
        labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)
        outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, labels=labels)
        loss = outputs[0]
        
        if _%500==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [5]:
import evaluate
rouge = evaluate.load("rouge")
 
def compute_metrics(decoded_preds,decoded_labels):
    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        rouge_types=[
            'rouge1',
            'rouge2',
            'rougeL'
        ]
    )
 
    return {k: round(v, 4) for k, v in result.items()}

In [6]:
def validate(epoch, tokenizer, model, device, loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask, 
                max_length = config.SUMMARY_LEN
                )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            predictions.extend(preds)
            if not config.actuals_f:
                target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]  
                actuals.extend(target)
        if not config.actuals_f:
            config.actuals_f = True
            config.actuals = actuals
        scores = compute_metrics(predictions, config.actuals)
        print(predictions[0])
    return predictions, config.actuals, scores

In [7]:
from tqdm.notebook import tqdm

device='cuda:0'
config = Config()         # Initialize config
config.TRAIN_BATCH_SIZE = 1    # input batch size for training (default: 64)
config.VALID_BATCH_SIZE = 1    # input batch size for testing (default: 1000)
config.TRAIN_EPOCHS = 10       # number of epochs to train (default: 10)
config.VAL_EPOCHS = 1 
config.LEARNING_RATE = 5e-5    # learning rate (default: 0.01)
config.SEED = 42               # random seed (default: 42)
config.MAX_LEN = 1024
config.SUMMARY_LEN = 256 

best = 0

# Set random seeds and deterministic pytorch for reproducibility
torch.manual_seed(config.SEED) # pytorch random seed
np.random.seed(config.SEED) # numpy random seed
torch.backends.cudnn.deterministic = True

# tokenzier for encoding the text
tokenizer = GPT2Tokenizer.from_pretrained("RussianNLP/FRED-T5-Summarizer")

df = pd.read_excel('data.xlsx')
df.columns = ['ctext', 'text']
df = df[['text','ctext']]
df.ctext = '<LM> Сократи текст.\n ' + df.ctext
print(df.head())
train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state = config.SEED)
val_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(val_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)
val_set = CustomDataset(val_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)

train_params = {
    'batch_size': config.TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0
    }

val_params = {
    'batch_size': config.VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0
    }
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)

model = T5ForConditionalGeneration.from_pretrained("RussianNLP/FRED-T5-Summarizer", low_cpu_mem_usage=True)
model = model.to(device)
for name, param in model.named_parameters():
    param.requires_grad = False
    for template in ['lm_head', 'block.22.', 'block.23.', 'final_layer_norm']:
        if template in name:
            param.requires_grad = True
        
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

optimizer = torch.optim.Adam(params =  model.parameters(), lr=config.LEARNING_RATE)

print('Initiating Fine-Tuning for the model on our dataset')
try:
    for epoch in tqdm(range(config.TRAIN_EPOCHS)):
        train(epoch, tokenizer, model, device, training_loader, optimizer)
        #if epoch % 10 == 0:
        _, __, scores = validate(epoch, tokenizer, model, device, val_loader)
        print(scores)
        if scores['rougeL'] > best:
            best = scores['rougeL']
            model.save_pretrained("checkpoints//model_rougeL_"+str(best).split('.')[-1])
            
except KeyboardInterrupt:
    pass
except Exception as e:
    print(e)
print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
for epoch in range(config.VAL_EPOCHS):
    predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
    final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
    print('Output Files generated for review')
    

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


                                                text  \
0  Компания имеет положительный тренд по 2 показа...   
1  Компания имеет положительный тренд по 0 показа...   
2  Компания имеет положительный тренд по 1 показа...   
3  Компания имеет положительный тренд по 0 показа...   
4  Компания имеет положительный тренд по 3 показа...   

                                               ctext  
0  <LM> Сократи текст.\n Материальные внеоборотны...  
1  <LM> Сократи текст.\n Материальные внеоборотны...  
2  <LM> Сократи текст.\n Материальные внеоборотны...  
3  <LM> Сократи текст.\n Материальные внеоборотны...  
4  <LM> Сократи текст.\n Материальные внеоборотны...  
FULL Dataset: (317, 2)
TRAIN Dataset: (254, 2)
TEST Dataset: (63, 2)
1,740,354,048 total parameters.
209,498,112 training parameters.
Initiating Fine-Tuning for the model on our dataset


  0%|          | 0/100 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Loss:  0.2639848589897156
Актуальность компании по 1, в целом положение компании ухудшается. В целом положение компании ухудшается по 1, в целом положение компании ухудшается. В целом положение компании ухудшается по 1, в целом положение компании ухудшается. В целом положение компании ухудшается по 1, в целом положение компании ухудшается.
{'rouge1': 0.1063, 'rouge2': 0.0079, 'rougeL': 0.1007}




Epoch: 1, Loss:  0.2828477919101715
Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe


KeyboardInterrupt: 