imports

In [1]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_scheduler, GPT2LMHeadModel, GPT2Tokenizer
import torch

  from .autonotebook import tqdm as notebook_tqdm


model config

In [2]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
#model.resize_token_embeddings(130000)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

data import & stats

In [3]:
csv_files = ['Resume.csv', 'Resume2.csv']

selected_column = 'Resume_str'
combined_column = pd.concat([pd.read_csv(file)[selected_column] for file in csv_files], ignore_index=True)

print(combined_column)
train_texts = combined_column.to_list()#

tokens = combined_column.apply(lambda x: len(tokenizer.tokenize(x)))

avg_tokens_per_row = tokens.mean()

print(f"Średnia liczba tokenów na wiersz: {avg_tokens_per_row}")

0                HR ADMINISTRATOR/MARKETING ASSOCIATE\...
1                HR SPECIALIST, US HR OPERATIONS      ...
2                HR DIRECTOR       Summary      Over 2...
3                HR SPECIALIST       Summary    Dedica...
4                HR MANAGER         Skill Highlights  ...
                              ...                        
3441    Computer Skills: â¢ Proficient in MS office (...
3442    â Willingness to accept the challenges. â ...
3443    PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne...
3444    COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...
3445    Skill Set OS Windows XP/7/8/8.1/10 Database MY...
Name: Resume_str, Length: 3446, dtype: object
Średnia liczba tokenów na wiersz: 1281.010737086477


Data prep

In [4]:
class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encodings = self.tokenizer(self.texts[idx], truncation=True, max_length=self.max_length, return_tensors="pt")
        input_ids = encodings['input_ids'].squeeze()
        attention_mask = encodings['attention_mask'].squeeze()
        return input_ids, attention_mask

?

In [5]:
def collate_fn(batch):
    input_ids = [item[0] for item in batch]
    attention_masks = [item[1] for item in batch]

    # Znajdujemy najdłuższą sekwencję w batchu
    max_len = max(len(ids) for ids in input_ids)

    # Padding sekwencji do tej samej długości
    padded_input_ids = [torch.cat([ids, torch.zeros(max_len - len(ids), dtype=torch.long)]) for ids in input_ids]
    padded_attention_masks = [torch.cat([mask, torch.zeros(max_len - len(mask), dtype=torch.long)]) for mask in attention_masks]

    # Zwracamy spakowane tensory
    return torch.stack(padded_input_ids), torch.stack(padded_attention_masks)

Training Config

In [6]:
train_dataset = TextDataset(train_texts, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_loader)

scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)



Training

In [7]:
model.train()

for epoch in range(num_epochs):
# Podczas treningu, dane (input_ids, attention_mask) również muszą być przeniesione na GPU
    for batch in train_loader:
        input_ids, attention_masks = batch
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
    
        # Forward pass
        outputs = model(input_ids, attention_mask=attention_masks, labels=input_ids)
        loss = outputs.loss
        loss.backward()
    
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    print(f"Epoch {epoch + 1} finished with loss {loss.item()}")



  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch 1 finished with loss 2.6944870948791504
Epoch 2 finished with loss 1.754962682723999
Epoch 3 finished with loss 2.1572158336639404


Model Saving

In [8]:
model.save_pretrained('./fine_tuned_gpt2')
tokenizer.save_pretrained('./fine_tuned_gpt2')

('./fine_tuned_gpt2\\tokenizer_config.json',
 './fine_tuned_gpt2\\special_tokens_map.json',
 './fine_tuned_gpt2\\vocab.json',
 './fine_tuned_gpt2\\merges.txt',
 './fine_tuned_gpt2\\added_tokens.json')

Test prompt properties & model loading

In [9]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained('./fine_tuned_gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('./fine_tuned_gpt2')

prompt = "generate CV for Samantha, the IT Recruiter"
total_new_tokens = 6000 

Test

In [14]:


model.eval()
generated_text = ""
input_ids = tokenizer.encode(prompt, return_tensors="pt")
max_new_tokens_per_generation = 1000 
current_length = 0


#Debug Data
vocab_size = tokenizer.vocab_size
print(input_ids)
print(f"Vocab size: {vocab_size}")
print(f"Max index in input_ids: {input_ids.max()}")


while current_length < total_new_tokens:
    output = model.generate(
        input_ids,
        attention_mask=torch.ones_like(input_ids),
        max_new_tokens=max_new_tokens_per_generation,
        num_return_sequences=1,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.2,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    
    generated_part = tokenizer.decode(output[0], skip_special_tokens=True)
    generated_text += generated_part

    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    current_length += max_new_tokens_per_generation

print(generated_text)

tensor([[ 8612,   378, 26196,   329, 34778,    11,   262,  7283,  3311,   622,
          2676]])
Vocab size: 50257
Max index in input_ids: 34778
generate CV for Samantha, the IT Recruiter (LTD) and CPA. She has over 15 years of experience in HR with a strong focus on building trust between client and team members while meeting their needs. Ms Kranti is well-versed at handling complex projects which requires detailed planning to achieve desired results. Strengths: Motivated Team Player Ability To Apply Leadership Skills In Organizing A Project As an Individual Having achieved several awards including Best Lead Of A Team award from the USAFRCA I have always been looking for another career as an individual who can contribute towards my professional goals by working hard toward them. But now that I am able obtain this opportunity there is no option but to give back what I gained during my time here under the guidance & supervision of these outstanding people. Enthusiastic Customer Service 