In [None]:
!pip install transformers datasets peft accelerate bitsandbytes


Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.47.0


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from datasets import Dataset
import torch
import gc

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel

# Hugging Face Hub id of the checkpoint that the rest of the notebook fine-tunes.
model_name = "ibm-granite/granite-embedding-107m-multilingual"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use the GPU when one is available and fall back to the CPU otherwise, so
# this cell does not crash on a CPU-only runtime.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained(model_name).to(device)


In [None]:
import pandas as pd
from google.colab import drive

# Make the user's Google Drive visible inside the Colab VM.
drive.mount('/content/drive')

# Parquet file on Drive that holds the corpus used for training.
file_path = "/content/drive/MyDrive/wiki-tr.parquet"
df = pd.read_parquet(file_path)

# Sanity check: preview the first rows, then list the column names.
print(df.head())
print(df.columns)  # Show the column names


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
   id                                                url                title  \
0  10         https://tr.wikipedia.org/wiki/Cengiz%20Han           Cengiz Han   
1  16  https://tr.wikipedia.org/wiki/Film%20%28anlam%...  Film (anlam ayrımı)   
2  22      https://tr.wikipedia.org/wiki/Mustafa%20Suphi        Mustafa Suphi   
3  24                https://tr.wikipedia.org/wiki/Linux                Linux   
4  30       https://tr.wikipedia.org/wiki/Bol%C5%9Fevizm            Bolşevizm   

                                                text  
0  Cengiz Han (doğum adıyla Temuçin,  – 18 Ağusto...  
1  Film şu anlamlara gelebilir:\n\n Camlara yapış...  
2  Mehmed Mustafa Subhi (Osmanlıca: ), kısaca Mus...  
3  Linux (telaffuz: Lin-uks); Linux çekirdeğine d...  
4  Bolşevik, çoğunluktan yana anlamına gelen Rusç...  
Index(['id', 'url', 'title', 'text'], dtype='object')


In [None]:
from torch.utils.data import Dataset, DataLoader

# ====== Dataset definition ======
class TextDataset(Dataset):
    """Map-style dataset that serves the values of one DataFrame column.

    Rows whose value in ``text_column`` is missing (None/NaN) are dropped at
    construction time, so every item handed to the DataLoader/tokenizer is a
    real value instead of a null that would fail later, mid-epoch.
    """

    def __init__(self, dataframe, text_column):
        """Copy the non-null values of ``dataframe[text_column]`` into a list.

        Args:
            dataframe: pandas DataFrame containing the text column.
            text_column: name of the column to read.

        Raises:
            KeyError: if ``text_column`` is not a column of ``dataframe``.
        """
        # dropna() removes None/NaN entries; tolist() detaches the data from
        # the DataFrame so indexing is a plain list lookup.
        self.data = dataframe[text_column].dropna().tolist()

    def __len__(self):
        """Return the number of (non-null) texts."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``idx``-th text wrapped as ``{"text": value}``."""
        return {"text": self.data[idx]}

text_column = 'text'  # Name of the DataFrame column that is fed to the model
dataset = TextDataset(df, text_column=text_column)

# Seed the shuffling so the batch order (and therefore the training run) is
# reproducible after a kernel restart.
shuffle_generator = torch.Generator().manual_seed(42)
train_loader = DataLoader(dataset, batch_size=16, shuffle=True, generator=shuffle_generator)

In [None]:
from torch.optim import AdamW
import torch

# Choose the device and move the model onto it before building the optimizer,
# so the optimizer is created over the parameters in their final location.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-4)

model.train()
for epoch in range(1):  # a single epoch, as an example
    for batch in train_loader:
        # batch["text"] is the list of raw strings for this step; pad to the
        # longest one and truncate anything over the tokenizer's limit.
        inputs = tokenizer(batch["text"], return_tensors="pt", truncation=True, padding=True).to(device)

        outputs = model(**inputs)

        # Mean-pool over real tokens only. A plain .mean(dim=1) would also
        # average the padding positions, making each embedding depend on how
        # long the other texts in the batch happen to be.
        token_states = outputs.last_hidden_state
        pad_mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
        embeddings = (token_states * pad_mask).sum(dim=1) / pad_mask.sum(dim=1).clamp(min=1e-9)

        # Dummy loss (for illustration only, because the embedding model has
        # no supervised target here).
        # NOTE(review): minimising the mean L2 norm just shrinks the
        # embeddings; it is not a meaningful fine-tuning objective. Replace it
        # with a real objective before relying on the saved weights.
        loss = embeddings.norm(p=2, dim=1).mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print("Loss:", loss.item())

Loss: 42.55369567871094
Loss: 41.15060806274414
Loss: 38.2431640625
Loss: 37.532020568847656
Loss: 35.91316604614258
Loss: 33.96466827392578
Loss: 32.538475036621094
Loss: 30.95329475402832
Loss: 29.129230499267578
Loss: 27.786441802978516
Loss: 27.068574905395508
Loss: 27.109676361083984
Loss: 25.05181884765625
Loss: 24.484432220458984
Loss: 23.782793045043945


KeyboardInterrupt: 

In [None]:
# Target directory on the mounted Google Drive for the fine-tuned checkpoint.
save_path = "/content/drive/MyDrive/granite_tr_finetuned"

# Save the model and the tokenizer side by side so the directory can be
# loaded again with from_pretrained(save_path).
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)


('/content/drive/MyDrive/granite_tr_finetuned/tokenizer_config.json',
 '/content/drive/MyDrive/granite_tr_finetuned/special_tokens_map.json',
 '/content/drive/MyDrive/granite_tr_finetuned/sentencepiece.bpe.model',
 '/content/drive/MyDrive/granite_tr_finetuned/added_tokens.json',
 '/content/drive/MyDrive/granite_tr_finetuned/tokenizer.json')

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load the saved model
model_path = "/content/drive/MyDrive/granite_tr_finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path)

# Device used for the test; eval() switches off dropout for inference.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Example Turkish sentences
test_sentences = [
    "Türkiye'nin başkenti Ankara'dır.",
    "Mete Gazoz olimpiyatlarda altın madalya kazandı."
]

# Tokenize (the shorter sentence is padded up to the longer one)
inputs = tokenizer(test_sentences, return_tensors="pt", padding=True, truncation=True).to(device)

# Extract embeddings
with torch.no_grad():
    outputs = model(**inputs)
    # Mean-pool over real tokens only: weighting by the attention mask keeps
    # the padding positions of the shorter sentence out of its average.
    token_states = outputs.last_hidden_state
    pad_mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
    embeddings = (token_states * pad_mask).sum(dim=1) / pad_mask.sum(dim=1).clamp(min=1e-9)

print("Embeddings shape:", embeddings.shape)
print("1. cümle embedding:", embeddings[0][:10])  # first 10 values

Embeddings shape: torch.Size([2, 384])
1. cümle embedding: tensor([ 2.3284,  1.5212, -0.3189, -0.1029,  0.4771,  1.7066,  1.4510,  1.0675,
         1.7525,  1.5524], device='cuda:0')


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Bring both sentence embeddings to host memory as one NumPy array, then take
# single-row slices: each slice keeps the 2-D (1, dim) shape that the original
# reshape(1, -1) calls produced.
sentence_vectors = embeddings.cpu().numpy()
sim = cosine_similarity(sentence_vectors[0:1], sentence_vectors[1:2])
print("Benzerlik:", sim)


Benzerlik: [[0.7693831]]
