In [6]:
import pandas as pd

# Wczytanie pliku CSV
file_path = './Resume.csv'  # Zastąp to ścieżką do swojego pliku CSV
df = pd.read_csv(file_path)

# Wybór tylko jednej kolumny (na przykład kolumny 'text') i konwersja do listy
train_texts = df['Resume_html'].tolist()  # Zamień 'text' na nazwę swojej kolumny

In [10]:
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_scheduler, GPT2LMHeadModel, GPT2Tokenizer

# Inicjalizacja modelu i tokenizatora
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Ustawienie tokena PAD (jeśli nie jest zdefiniowany)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Przygotowanie danych do trenowania
class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encodings = self.tokenizer(self.texts[idx], truncation=True, max_length=self.max_length, return_tensors="pt")
        input_ids = encodings['input_ids'].squeeze()
        attention_mask = encodings['attention_mask'].squeeze()
        return input_ids, attention_mask

train_dataset = TextDataset(train_texts, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

# Definiowanie optymalizatora i schematu nauki
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_loader)

scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Trenowanie modelu
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
model.train()

for epoch in range(num_epochs):
# Podczas treningu, dane (input_ids, attention_mask) również muszą być przeniesione na GPU
    for batch in train_loader:
        input_ids, attention_masks = batch
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
    
        # Forward pass
        outputs = model(input_ids, attention_mask=attention_masks, labels=input_ids)
        loss = outputs.loss
        loss.backward()
    
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    print(f"Epoch {epoch + 1} finished with loss {loss.item()}")

# Zapisanie wytrenowanego modelu
model.save_pretrained('./fine_tuned_gpt2')
tokenizer.save_pretrained('./fine_tuned_gpt2')


Epoch 1 finished with loss 0.8306434154510498
Epoch 2 finished with loss 0.570225179195404
Epoch 3 finished with loss 1.0754492282867432


('./fine_tuned_gpt2\\tokenizer_config.json',
 './fine_tuned_gpt2\\special_tokens_map.json',
 './fine_tuned_gpt2\\vocab.json',
 './fine_tuned_gpt2\\merges.txt',
 './fine_tuned_gpt2\\added_tokens.json')

In [40]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Załaduj wytrenowany model i tokenizer
model = GPT2LMHeadModel.from_pretrained('./fine_tuned_gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('./fine_tuned_gpt2')

# Wygeneruj tekst na podstawie podanego kontekstu
input_text = "generate CV for Samantha, the IT Recruiter, do it in html format"
encodings = tokenizer.encode_plus(input_text, return_tensors='pt')

input_ids = encodings['input_ids']
attention_mask = encodings['attention_mask']
vocab_size = tokenizer.vocab_size
print(input_ids)
print(f"Vocab size: {vocab_size}")
print(f"Max index in input_ids: {input_ids.max()}")
# Model generuje kontynuację
model.eval()
generated_text = ""
input_ids = tokenizer.encode(input_text, return_tensors="pt")
max_new_tokens_per_generation = 300  # Liczba tokenów generowanych w jednej iteracji
total_new_tokens = 3000  # Całkowita liczba nowych tokenów
current_length = 0  # Długość wygenerowanego tekstu

# Pętla do generowania tekstu w partiach
while current_length < total_new_tokens:
    # Generowanie nowych tokenów
    output = model.generate(
        input_ids,
        attention_mask=torch.ones_like(input_ids),
        max_new_tokens=max_new_tokens_per_generation,
        num_return_sequences=1,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.2,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    
    # Dekodowanie nowo wygenerowanego tekstu
    generated_part = tokenizer.decode(output[0], skip_special_tokens=True)
    generated_text += generated_part  # Dodanie do istniejącego tekstu

    # Zresetuj `input_ids` do początkowego promptu, aby uniknąć wzrostu wartości
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # Zwiększenie licznika długości
    current_length += max_new_tokens_per_generation

print(generated_text)

tensor([[ 8612,   378, 26196,   329, 34778,    11,   262,  7283,  3311,   622,
          2676,    11,   466,   340,   287, 27711,  5794]])
Vocab size: 50257
Max index in input_ids: 34778
generate CV for Samantha, the IT Recruiter, do it in html format
and then print out a new file with all of my information. It would take around 15-20 minutes to complete each project and get me started on some projects that I love working under. Once you have completed your first task (1st one), go through and read this: https://www;p&ampd0t4u2a3i5</div> </td class="section" id="" style=\"paddingLeft:-45px"> <tr/> <table ClassifiedTags":["PICT_NAME", "WRITTENGRAPHIC DESIGNER/PROCESSOR"]  <br /> Experienced Graphic Designer offering over 10 years of experience designing graphic design solutions which include digital art, sound design or photography.</font></ul> </li><ltnks singlecolumn txtCenterBlockBorder 5ptFx25 fgCoreLogic 2 hddmlk6 fontsize dynamicresize 4 tdmargins paddingTop 0 20px 50px 40% 25%, D