# Procesar

In [3]:
from glob import glob

# Collect the scraped .txt dumps. glob() returns matches in arbitrary,
# filesystem-dependent order, so the list is sorted to keep the order of the
# resulting corpus reproducible between runs and machines.
# NOTE: recursive=True only has an effect when the pattern contains '**';
# with this pattern only files directly inside ../scraper/data are matched.
files = sorted(glob('../scraper/data/*.txt', recursive=True))

In [17]:
# Flatten every input file into one list with one entry per line.
# The line terminator is removed and surrounding whitespace is stripped.
data = []
for path in files:
    with open(path, 'r') as handle:
        for raw_line in handle:
            data.append(raw_line.replace('\n', '').strip())

In [35]:
import re

# Patterns used by cleaning(), compiled once instead of on every call.
_TAG_RE = re.compile(r'<.*?>')
_DOUBLE_SEMICOLON_RE = re.compile(r'.*;;.*')
# Matches up to the next whitespace, so URL parts such as '#section' or
# '~user' are removed together with the rest of the URL.
_URL_RE = re.compile(r'https?://\S+')
_BANG_WORD_RE = re.compile(r'.*![a-zA-Z]+.*')


def cleaning(st: str) -> str:
    """Clean one message line.

    Steps, applied in this order:
      1. remove every ``<...>`` fragment;
      2. blank the line if it contains ``;;``;
      3. remove ``http://`` / ``https://`` URLs;
      4. blank the line if it contains ``!`` immediately followed by a letter;
      5. strip leading/trailing whitespace left behind by the removals.

    Args:
        st: The raw message text.

    Returns:
        The cleaned text. It is an empty string when the line was discarded
        or nothing but whitespace remained.
    """
    # remove any fragment with <...> tags
    st = _TAG_RE.sub('', st)

    # remove string if it has a ;; in it
    st = _DOUBLE_SEMICOLON_RE.sub('', st)

    # remove urls
    st = _URL_RE.sub('', st)

    # remove string if it has a word that starts with ! in it
    st = _BANG_WORD_RE.sub('', st)

    # Removals can leave dangling spaces (e.g. "text <url>" -> "text ");
    # stripping lets callers filter out empty results with a truthiness test.
    return st.strip()

In [43]:
# pandas is needed for the CSV export below; it is imported here because no
# other cell brings `pd` into scope.
import pandas as pd

# Clean every raw line and keep only the ones that are non-empty afterwards.
processed_data: list[str] = [cleaned for cleaned in map(cleaning, data) if cleaned]

# CSV copy: one message per row, without index column or header row.
pd.DataFrame(processed_data).to_csv('cleaned_messages.csv', index=False, header=False)

# Plain-text copy: one message per line. The encoding is set explicitly so the
# output does not depend on the platform default, and the context manager
# guarantees the file is flushed and closed even if a write fails.
with open('messages.txt', 'w', encoding='utf-8') as text_data:
    text_data.writelines(f'{message}\n' for message in processed_data)

# Entrenar

In [1]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [36]:
def load_dataset(file_path, tokenizer, block_size = 128):
    """Build a ``TextDataset`` for ``file_path``.

    Args:
        file_path: Path of the text file handed to ``TextDataset``.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Value forwarded as ``TextDataset(block_size=...)``.

    Returns:
        The constructed ``TextDataset`` instance.
    """
    return TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=block_size)


def load_data_collator(tokenizer, mlm = False):
    """Build a ``DataCollatorForLanguageModeling`` for ``tokenizer``.

    Args:
        tokenizer: Tokenizer handed to the collator.
        mlm: Value forwarded as the collator's ``mlm`` flag (``False`` by default).

    Returns:
        The constructed ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)


def train(train_file_path, model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps,
          push_to_hub=True):
    """Fine-tune a GPT-2 checkpoint on a text file and save the result.

    Args:
        train_file_path: Text file used to build the training dataset.
        model_name: Name or path passed to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory where the tokenizer, the model and the trainer
            output are written.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments`` (previously accepted
            but silently ignored).
        push_to_hub: Forwarded to ``TrainingArguments``. Defaults to ``True``
            to keep the earlier hard-coded behaviour; pass ``False`` to train
            without uploading.

    Returns:
        None. The trained model is written to ``output_dir``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # The Trainer below is not given the tokenizer, so it is stored explicitly
    # next to the model; both can then be loaded back from output_dir.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Writes the not-yet-fine-tuned weights; trainer.save_model() at the end
    # saves to the same output_dir again after training.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,
        push_to_hub=push_to_hub,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [3]:
# Training configuration: these names are passed one-to-one as keyword
# arguments to the train(...) call in a later cell. Adjust them before running.
train_file_path = "messages.txt"  # same file name the preprocessing cell writes
# Alternative checkpoint left by the author: 'gpt2'
model_name = 'DeepESP/gpt2-spanish'
output_dir = '../pex-gpt2'  # generate_text() loads the model from this same path
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 5.0
save_steps = 500  # handed to train() as its save_steps argument

In [4]:
import torch

In [37]:
# Run the fine-tuning with the configuration variables defined above.
# Author's note: it takes about 30 minutes to train in Colab.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)



Step,Training Loss


# Inferir

In [7]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [31]:
def load_model(model_path):
    """Return the ``GPT2LMHeadModel`` loaded from ``model_path`` via ``from_pretrained``."""
    return GPT2LMHeadModel.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Return the ``GPT2Tokenizer`` loaded from ``tokenizer_path`` via ``from_pretrained``."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)


def generate_text(sequence, max_length, model_path="../pex-gpt2", top_k=50, top_p=0.95):
    """Sample a continuation of ``sequence`` from a saved model and print it.

    Args:
        sequence: Prompt text to continue.
        max_length: Forwarded to ``model.generate`` as ``max_length``. This
            limit covers the prompt tokens as well as the generated ones.
        model_path: Directory holding both the model and the tokenizer.
            Defaults to the path that was previously hard-coded.
        top_k: Forwarded to ``model.generate`` (default 50, as before).
        top_p: Forwarded to ``model.generate`` (default 0.95, as before).

    Returns:
        None. The generated continuation, without the prompt, is printed.
    """
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)

    # Calling the tokenizer (rather than .encode) also yields the attention
    # mask, which is passed to generate() explicitly.
    encoded = tokenizer(str(sequence), return_tensors='pt')
    prompt_ids = encoded['input_ids']

    final_outputs = model.generate(
        prompt_ids,
        attention_mask=encoded['attention_mask'],
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=top_k,
        top_p=top_p,
    )

    # Drop the prompt by slicing token ids instead of slicing the decoded
    # string by len(sequence): the decoded text is not guaranteed to reproduce
    # the prompt character-for-character, which could cut the output at the
    # wrong position.
    new_token_ids = final_outputs[0][prompt_ids.shape[-1]:]
    output_text = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()
    print(output_text)

In [38]:
# Prompt to continue, and the value handed to generate_text as max_length.
sequence = 'Buenos dias p como estas'
max_len = 15
generate_text(sequence, max_len)

Alo que eres el único retrasado
