In [None]:
!pip install transformers
!pip install torch
!pip install sklearn
!pip install tqdm

Collecting transformers
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 5.3 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 31.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 48.2 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 43.8 MB/s 
[?25hCollecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: Py

In [None]:
from transformers import AutoTokenizer, TextDataset,DataCollatorForLanguageModeling, Trainer, TrainingArguments,AutoModelWithLMHead
import torch
from transformers import AdamW
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def preprocessing(path):
    """Read a plain-text corpus and split it into one training sample per stanza.

    Stanzas are the chunks separated by a blank line. Quote characters are
    removed, the line breaks inside a stanza are turned into spaces, and every
    stanza is prefixed with the ``<BOS>`` marker.

    Args:
        path: Path of the UTF-8 encoded text file to read.

    Returns:
        A list of strings, one ``"<BOS>..."`` entry per non-empty stanza.
    """
    # Explicit encoding: the corpus contains non-ASCII characters, so relying on
    # the platform default could mis-decode it.
    with open(path, 'r', encoding='utf-8') as f:
        raw = f.read()

    # Strip straight quotes, both guillemets and the opening single quote.
    # (The closing guillemet used to survive because the original chain tried to
    # replace an already-removed opening guillemet.)
    for mark in ('"', '«', '»', '‘'):
        raw = raw.replace(mark, '')

    stanzas = (chunk.replace('\n', ' ').strip() for chunk in raw.split('\n\n'))
    # Skip empty chunks (trailing newlines, runs of blank lines) so no sample
    # consists of the bare "<BOS>" marker.
    return ['<BOS>' + stanza for stanza in stanzas if stanza]

# --- Tokenizer and model -----------------------------------------------------
# `model_max_length` is the tokenizer option for the maximum sequence length;
# the previous spelling (`max_lenght`) was not a recognised option.
tokenizer = AutoTokenizer.from_pretrained("GroNLP/gpt2-small-italian-embeddings", model_max_length=256)
model = AutoModelWithLMHead.from_pretrained("GroNLP/gpt2-small-italian-embeddings")

# Register the markers used in the training files as special tokens, then grow
# the embedding matrix so the new token ids have rows.
special_tokens_dict = {'bos_token': '<BOS>', 'eos_token': '<EOS>', 'pad_token': '<PAD>'}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

# Use the GPU when there is one, otherwise fall back to the CPU instead of failing.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# --- Corpus ------------------------------------------------------------------
input_data = "divina_commedia.txt"

text = preprocessing(input_data)

# 90/10 split with a fixed seed so the split is reproducible.
# (Named *_texts so the builtin `eval` is not shadowed.)
train_texts, eval_texts = train_test_split(text, train_size=.9, random_state=2020)

# Each sample already starts with "<BOS>"; joining with "<EOS>" closes every
# sample but the last. Files are written as UTF-8 explicitly.
with open('train_tmp.txt', 'w', encoding='utf-8') as file_handle:
    file_handle.write("<EOS>".join(train_texts))

with open('eval_tmp.txt', 'w', encoding='utf-8') as file_handle:
    file_handle.write("<EOS>".join(eval_texts))

def load_dataset(train_path, test_path, tokenizer, block_size=128, overwrite_cache=True):
    """Build the train/eval datasets and the collator for causal LM fine-tuning.

    Args:
        train_path: Path of the text file holding the training corpus.
        test_path: Path of the text file holding the evaluation corpus.
        tokenizer: Tokenizer used to encode both files and to build the collator.
        block_size: Block size handed to ``TextDataset`` (defaults to the
            previously hard-coded 128).
        overwrite_cache: Passed to ``TextDataset``. It keeps a feature cache next
            to the input file whose name depends on the tokenizer class, block
            size and file name, not on the file content. The default ``True``
            re-tokenizes, so a regenerated corpus file is never paired with
            stale cached features.

    Returns:
        A ``(train_dataset, test_dataset, data_collator)`` tuple.
    """
    def build_split(file_path):
        # Same settings for both splits; only the source file differs.
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=block_size,
            overwrite_cache=overwrite_cache,
        )

    train_dataset = build_split(train_path)
    test_dataset = build_split(test_path)

    # mlm=False: collate for causal (next-token) language modeling, not masked LM.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, test_dataset, data_collator

# Tokenize the two temporary corpus files written earlier in this cell.
train_dataset, test_dataset, data_collator = load_dataset('train_tmp.txt', 'eval_tmp.txt', tokenizer)

training_args = TrainingArguments(
    output_dir="./dantebert",        # where checkpoints and the final model go
    overwrite_output_dir=True,       # overwrite the content of the output directory
    num_train_epochs=200,            # number of training epochs
    per_device_train_batch_size=32,  # batch size for training
    per_device_eval_batch_size=32,   # batch size for evaluation
    # `eval_steps` only takes effect with a step-based evaluation strategy;
    # without this line no evaluation on `eval_dataset` is ever run.
    evaluation_strategy="steps",
    eval_steps=400,                  # number of update steps between two evaluations
    save_steps=1500,                 # a checkpoint is written every this many steps
    warmup_steps=500,                # warmup steps for the learning-rate scheduler
    prediction_loss_only=True,       # evaluation reports the loss only
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()
trainer.save_model()
# Store the tokenizer (with its added <BOS>/<EOS>/<PAD> tokens) next to the
# model so the saved directory is self-contained and its vocabulary matches
# the resized embeddings.
tokenizer.save_pretrained(training_args.output_dir)

Downloading:   0%|          | 0.00/135 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/970 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/475k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/280k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.0 [00:00<?, ?B/s]



Downloading:   0%|          | 0.00/427M [00:00<?, ?B/s]

***** Running training *****
  Num examples = 1143
  Num Epochs = 200
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 7200


Step,Training Loss
500,10.2907
1000,3.9298
1500,3.2486
2000,2.6224
2500,2.0669
3000,1.6116
3500,1.2617
4000,1.0062
4500,0.8248
5000,0.6975


Saving model checkpoint to ./dantebert/checkpoint-1500
Configuration saved in ./dantebert/checkpoint-1500/config.json
Model weights saved in ./dantebert/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./dantebert/checkpoint-3000
Configuration saved in ./dantebert/checkpoint-3000/config.json
Model weights saved in ./dantebert/checkpoint-3000/pytorch_model.bin
Saving model checkpoint to ./dantebert/checkpoint-4500
Configuration saved in ./dantebert/checkpoint-4500/config.json
Model weights saved in ./dantebert/checkpoint-4500/pytorch_model.bin
Saving model checkpoint to ./dantebert/checkpoint-6000
Configuration saved in ./dantebert/checkpoint-6000/config.json
Model weights saved in ./dantebert/checkpoint-6000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./dantebert
Configuration saved in ./dantebert/config.json
Model weights saved in ./dantebert/pytorch_model.bin


In [None]:
from transformers import AutoTokenizer, pipeline

# The fine-tuned model's embedding matrix was resized for three extra special
# tokens. Rebuild a tokenizer with the same additions (same base vocabulary,
# same insertion order) so that every id the model can emit is decodable,
# instead of pairing the model with the unmodified base tokenizer.
dante_tokenizer = AutoTokenizer.from_pretrained('GroNLP/gpt2-small-italian-embeddings')
dante_tokenizer.add_special_tokens({'bos_token': '<BOS>', 'eos_token': '<EOS>', 'pad_token': '<PAD>'})

dante = pipeline('text-generation', model='./dantebert', tokenizer=dante_tokenizer)
dante("E colei che amai ")[0]['generated_text']

loading configuration file ./dantebert/config.json
Model config GPT2Config {
  "_name_or_path": "GroNLP/gpt2-small-italian-embeddings",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 0,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 0,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 100,
      "no_repeat_ngram_size": 4,
      "num_beams": 10,
      "repetition_penalty": 10.0,
      "temperature": 2.0,
      "top_k": 20,
 

'E colei che amai uopo più si vuoli, d’altrui lume già bianche e fioche, quasi come piante novelle rinognate per l’acqua fosser pronte.e disse: Piglia quel seme a li occhi; volgi ’l viso, e fammi nota la larghezza di questa nutrice».S’el s’aunasse ancor tutta la gente che già, in su la fortunata terra di Puglia, fu del suo sangue'

In [None]:
# Recursively archive the whole ./dantebert output directory into a single zip
# file; this includes the intermediate checkpoint-* folders, not just the final model.
# NOTE(review): the archive name says "8epoc" while training is configured for
# 200 epochs — confirm the intended name.
!zip -r ./dantebert8epoc.zip ./dantebert

  adding: dantebert/ (stored 0%)
  adding: dantebert/checkpoint-3000/ (stored 0%)
  adding: dantebert/checkpoint-3000/rng_state.pth (deflated 27%)
  adding: dantebert/checkpoint-3000/optimizer.pt (deflated 8%)
  adding: dantebert/checkpoint-3000/pytorch_model.bin (deflated 10%)
  adding: dantebert/checkpoint-3000/trainer_state.json (deflated 65%)
  adding: dantebert/checkpoint-3000/config.json (deflated 52%)
  adding: dantebert/checkpoint-3000/training_args.bin (deflated 48%)
  adding: dantebert/checkpoint-3000/scheduler.pt (deflated 49%)
  adding: dantebert/checkpoint-6000/ (stored 0%)
  adding: dantebert/checkpoint-6000/rng_state.pth (deflated 27%)
  adding: dantebert/checkpoint-6000/optimizer.pt (deflated 9%)
  adding: dantebert/checkpoint-6000/pytorch_model.bin (deflated 10%)
  adding: dantebert/checkpoint-6000/trainer_state.json (deflated 72%)
  adding: dantebert/checkpoint-6000/config.json (deflated 52%)
  adding: dantebert/checkpoint-6000/training_args.bin (deflated 48%)
  addin