# Download e tratamento de dados

In [None]:
import os
import random
import re

import requests

In [None]:
# Mount Google Drive at /content/drive; later cells read and write files
# under /content/drive/MyDrive/IA/.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def obter_diretorios(site_url):
    """Download one listing page and return the .txt book links found in it.

    Args:
        site_url: URL of the listing page to fetch.

    Returns:
        A list with one string per regex match, each cut at the first '>'.
        The list is empty when the server answers with a status other than
        200, so callers can always iterate over the result.

    Raises:
        requests.RequestException: on connection errors or when the server
            does not answer within the timeout.
    """
    # A bounded wait keeps one stalled connection from hanging the whole crawl.
    response = requests.get(site_url, timeout=30)

    if response.status_code != 200:
        print("Falha ao obter a página:", response.status_code)
        return []

    # `.+` is greedy, so a match may run past the end of the URL up to the
    # last ".txt" on the same line; cutting at the first '>' drops that tail
    # (presumably the end of the enclosing HTML tag).
    padrao = r"https://visionvox.net/biblioteca/[a-z]/.+\.txt"
    matches = re.findall(padrao, response.text)
    return [match.split('>')[0] for match in matches]

In [None]:
# Crawl the paginated TXT listing (600 pages, offset advancing by 50) and
# store every book link, one per line.
# NOTE(review): this writes to the working directory, while the download cell
# reads /content/drive/MyDrive/IA/diretorios.txt - confirm how the file is
# meant to get there.
with open('diretorios.txt', 'w') as f:
  for i in range(600):
    url = f'https://visionvox.com.br/biblioteca/estante.php?estante=formato&formato=txt&pagina=sim&num_page={i*50}&total_pagina=2097.58'
    try:
      # `or []` keeps the loop safe if the helper returns None for a bad page.
      dirs = obter_diretorios(url) or []
    except requests.RequestException as erro:
      # Best effort: report the failing page and keep crawling, instead of
      # silently swallowing every exception (including KeyboardInterrupt).
      print(f'Falha ao processar a página {i}:', erro)
      continue
    # `link` instead of `dir`, so the builtin dir() is not shadowed.
    for link in dirs:
      f.write(link + '\n')

In [None]:
# Draw a reproducible sample of the crawled links and download each book
# into books/bookNNNNN.txt on Drive.
with open('/content/drive/MyDrive/IA/diretorios.txt', 'r') as f:
  links = [x.strip() for x in f.readlines()]

n = 1000
# Fixed seed: the same links map to the same book numbers on every run.
random.seed(47)
links = random.sample(links, n)

for i, link in enumerate(links):
  book_path = f'/content/drive/MyDrive/IA/books/book{i:05}.txt'
  # Resume support: skip books already on disk, so an interrupted run can be
  # restarted without a hard-coded start index. Empty files are treated as
  # missing, since they can be left behind by a download that failed midway.
  if os.path.exists(book_path) and os.path.getsize(book_path) > 0:
    continue
  # Fetch before opening the output file, so a failed request never leaves
  # an empty book behind.
  response = requests.get(link, timeout=60)
  # Fail loudly on HTTP errors instead of saving an error page as a book.
  response.raise_for_status()
  with open(book_path, 'w') as f:
    f.write(response.text + '\n')


In [2]:
# Concatenate the 1000 downloaded books, in index order, into data.txt.
with open('/content/drive/MyDrive/IA/data.txt', 'w') as out:
  for i in range(1000):
    book_path = f'/content/drive/MyDrive/IA/books/book{i:05}.txt'
    with open(book_path, 'r') as f:
      out.write(f.read())


In [4]:
from sklearn.model_selection import train_test_split

with open('/content/drive/MyDrive/IA/data.txt', 'r') as f:
  data = f.readlines()

# shuffle=False keeps the lines in their original order: the first 85% become
# the training text and the last 15% the test text. The default (shuffle=True)
# permutes individual lines, so consecutive lines of the training file no
# longer come from the same passage; it is also unseeded, so every run would
# produce a different split.
train, test = train_test_split(data, test_size=0.15, shuffle=False)

print("Data length: " + str(len(data)))
print("Train length: " + str(len(train)))
print("Test length: " + str(len(test)))

Data length: 4651204
Train length: 3953523
Test length: 697681


In [5]:
# Persist each split to its own file; the lines already carry their own
# line endings, so they are written back unchanged.
for split_path, split_lines in (
    ('/content/drive/MyDrive/IA/train.txt', train),
    ('/content/drive/MyDrive/IA/test.txt', test),
):
  with open(split_path, 'w') as f:
    f.writelines(split_lines)

# Preparando modelo, tokenizer e dataset

In [2]:
!pip install transformers accelerate xformers evaluate

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m52.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xformers
  Downloading xformers-0.0.20-cp310-cp310-manylinux2014_x86_64.whl (109.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.1/109.1 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Print the NVIDIA driver / GPU status of the current runtime.
!nvidia-smi

In [None]:
from transformers import AutoTokenizer

# Text files written by the train/test split step.
train_path = '/content/drive/MyDrive/IA/train.txt'
test_path = '/content/drive/MyDrive/IA/test.txt'

# Tokenizer that belongs to the pretrained checkpoint being fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(
    'pierreguillou/gpt2-small-portuguese'
)

In [None]:
from transformers import TextDataset, DataCollatorForLanguageModeling

def load_dataset(train_path, test_path, tokenizer, block_size=128):
    """Build the train/test datasets and the collator for causal LM training.

    Args:
        train_path: Path of the training text file.
        test_path: Path of the evaluation text file.
        tokenizer: Tokenizer used to encode both files.
        block_size: Block size passed to both TextDataset instances.
            Defaults to 128, the value previously hard-coded for training.

    Returns:
        A tuple (train_dataset, test_dataset, data_collator).
    """
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=block_size)

    # Use the same block size as training. The previous value of 1 made every
    # evaluation example a single token, which gives a causal LM no next token
    # to score.
    test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=block_size)

    # mlm=False: plain language modeling, no masked-token objective.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,test_dataset,data_collator

train_dataset,test_dataset,data_collator = load_dataset(train_path,test_path,tokenizer)

In [None]:
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM

# AutoModelForCausalLM replaces the deprecated AutoModelWithLMHead; it is the
# auto class for causal language models such as this GPT-2 checkpoint.
model = AutoModelForCausalLM.from_pretrained("pierreguillou/gpt2-small-portuguese")

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/IA/gpt2-contaAI-small-1/", # The output directory
    overwrite_output_dir=True, # overwrite the content of the output directory
    num_train_epochs=2, # number of training epochs
    per_device_train_batch_size=96, # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    # NOTE(review): eval_steps only takes effect when an evaluation strategy
    # of "steps" is configured; none is set here, so no periodic evaluation
    # is requested - confirm whether that is intended.
    eval_steps = 500,
    save_steps=1000, # after # steps model is saved
    warmup_steps=500, # number of warmup steps for learning rate scheduler
    )

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Treinamento

In [None]:
# Run the fine-tuning loop configured on `trainer`.
trainer.train()
# Save the fine-tuned model (no path given, so the trainer's default is used).
trainer.save_model()

# Prompt manual

In [None]:
from transformers import pipeline

# NOTE(review): this checkpoint directory differs from the output_dir used by
# the training cell of this notebook - confirm it points at the intended run.
model_path = '/content/drive/MyDrive/IA/gpt2-contaAI-5/checkpoint-5000'

# Text-generation pipeline over the fine-tuned weights, with the tokenizer of
# the base checkpoint; generations are limited to max_length=100.
conta_ai = pipeline(
    'text-generation',
    model=model_path,
    tokenizer='pierreguillou/gpt2-small-portuguese',
    max_length=100,
)

In [None]:
# Seed text for the story; the model continues from here.
prompt = 'Era uma casa velha e feia, com uma porta cheia de buracos e janelas remendadas. Emilia então decidiu entrar, ao girar'
print('PROMPT:\n', prompt, '\n\n\n\n\n')

# Each round feeds the text generated so far back in as the next prompt
# (a single round here).
for i in range(1):
  out = conta_ai(prompt)
  generated = out[0]['generated_text']
  prompt = generated + '\n'

print('SAIDA:\n', prompt)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


PROMPT:
 Era uma casa velha e feia, com uma porta cheia de buracos e janelas remendadas. Emilia então decidiu entrar, ao girar 





SAIDA:
 Era uma casa velha e feia, com uma porta cheia de buracos e janelas remendadas. Emilia então decidiu entrar, ao girar em torno das janelas.
     - Bem-vindo a casa.

Eu nunca poderia ter imaginado que não houvesse um mundo com
por parte de seus
    -  É claro que Pantera?

Os olhos de Matthew foram arregalados.
O dia estava quente em seu corpo, e os olhos de Lucy brilharam ao redor da fogueira.

     - Está vendo - disse Meredith, sorrindo. - O que eu acho que

         Não sabia quando isso seria. O dia seria para ele, ou para Rosemary Rosemary.
      Os olhos de Dana se fechavam quando ela puxou um para a frente e a observou atentamente. Em suas pálpebras e suas costas estavam se dobrando, o cabelo castanho escuro como as da noite.
- Não me chamo papai - sussurrou.

O que eu ia dizer? Se é que eu não ia, como ia? Eu sei lá, não era nada... A ide