In [None]:
import pandas as pd

# Load the raw dataset from its parquet file.
DATA_PATH = '/content/train-00000-of-00001.parquet'
df = pd.read_parquet(DATA_PATH)

# Inspect the data: first rows, per-column dtypes / non-null counts, and
# summary statistics.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print(df.describe())


    bolum                 konu  \
0  Adalet  Adalet Meslek Etiği   
1  Adalet  Adalet Meslek Etiği   
2  Adalet  Adalet Meslek Etiği   
3  Adalet  Adalet Meslek Etiği   
4  Adalet  Adalet Meslek Etiği   

                                                soru  cevap aciklama  \
0  Avrupa (Ekonomik) Topluluğu’nu kuran antlaşma ...      3     None   
1  Katma Protokol’de düzenlenen temel konular ara...      4     None   
2  Türkiye Avrupa Topluluğu’na tam üyelik başvuru...      0     None   
3  Ankara Anlaşması’nın temel ve nihai amacı aşağ...      1     None   
4  Türkiye-AB Ortaklığı’nda kömür ve çelik ürünle...      0     None   

                                          secenekler  
0                     [1953, 1955, 1957, 1958, 1960]  
1  [İş gücünün serbest dolaşımı, Yabancı sermaye,...  
2                     [1987, 1988, 1997, 1999, 2007]  
3  [Türkiye’yi siyasi olarak güçlendirmek, Türkiy...  
4  [Serbest ticaret anlaşması, Entegrasyon rejimi...  
<class 'pandas.core.frame.DataFr

In [None]:
# Count the missing values in every column.
print(df.isnull().sum())

# Drop only the rows that are missing a value in a column the modelling cells
# actually read ('soru', 'aciklama', 'cevap'). A bare dropna() would also throw
# away rows whose only gap is in an unused column such as 'konu'.
# (Alternatively, gaps could be filled with df.fillna(value).)
# NOTE(review): if a later cell outside this view needs 'konu', add it here.
REQUIRED_COLUMNS = ['soru', 'aciklama', 'cevap']
df = df.dropna(subset=REQUIRED_COLUMNS)


bolum              0
konu           11505
soru               1
cevap              0
aciklama      263729
secenekler         0
dtype: int64


In [None]:
import re

# str.lower() is locale-independent: it maps 'I' to 'i' (Turkish needs the
# dotless 'ı') and 'İ' to 'i' followed by a combining dot. Map both capitals
# explicitly before lower-casing so Turkish words keep their spelling.
_TURKISH_UPPER_TO_LOWER = str.maketrans({'I': 'ı', 'İ': 'i'})


def clean_text(text):
    """Normalise a piece of (Turkish) text for tokenisation.

    Steps: Turkish-aware lower-casing, removal of every character that is
    neither a word character nor whitespace, collapsing of whitespace runs
    into a single space, and trimming of leading/trailing whitespace.

    Args:
        text: The string to clean. Non-string values (e.g. None or NaN)
            are treated as missing text.

    Returns:
        The cleaned string; an empty string when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ''
    text = text.translate(_TURKISH_UPPER_TO_LOWER).lower()
    # Strip punctuation first: removing it after collapsing whitespace would
    # leave double spaces behind (e.g. "a - b" -> "a  b").
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Run the cleaner over both free-text columns, overwriting them in place.
for text_column in ('soru', 'aciklama'):
    df[text_column] = df[text_column].apply(clean_text)


In [None]:
from sklearn.model_selection import train_test_split

# Features (question and explanation text) and labels (answer column).
X = df.loc[:, ['soru', 'aciklama']]
y = df.loc[:, 'cevap']

# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from datasets import Dataset, DatasetDict

def _to_hf_dataset(features, labels):
    """Join features and labels column-wise and wrap them in a `Dataset`.

    Both inputs get a fresh 0..n-1 index first so that the concatenation
    pairs them up by position.
    """
    combined = pd.concat(
        [features.reset_index(drop=True), labels.reset_index(drop=True)],
        axis=1,
    )
    return Dataset.from_pandas(combined)


# Convert both splits to the Hugging Face dataset format.
train_dataset = _to_hf_dataset(X_train, y_train)
test_dataset = _to_hf_dataset(X_test, y_test)

datasets = DatasetDict({'train': train_dataset, 'test': test_dataset})


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the tokenizer and the seq2seq model from the same pretrained checkpoint.
# NOTE(review): the data is Turkish; confirm that this checkpoint's vocabulary
# covers Turkish characters, or consider a multilingual checkpoint.
MODEL_CHECKPOINT = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(MODEL_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples, max_length=128):
    """Tokenize a batch of questions (inputs) and explanations (targets).

    Args:
        examples: A batch mapping with the keys 'soru' (list of input
            strings) and 'aciklama' (list of target strings).
        max_length: Length that both inputs and targets are truncated and
            padded to.

    Returns:
        The tokenizer output for the inputs with an added 'labels' entry
        holding the target token ids, where padding positions are replaced
        by -100.
    """
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager; the target token ids come back under the 'labels' key.
    model_inputs = tokenizer(
        examples['soru'],
        text_target=examples['aciklama'],
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )
    # -100 is the index the loss ignores. Without this, the model is trained
    # (and evaluated) on predicting the padding tokens of every target.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in model_inputs['labels']
    ]
    return model_inputs

# Tokenize every split; batched=True hands the function lists of examples.
tokenized_datasets = datasets.map(
    function=preprocess_function,
    batched=True,
)


Map:   0%|          | 0/20747 [00:00<?, ? examples/s]

Map:   0%|          | 0/5187 [00:00<?, ? examples/s]

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    # Training schedule: three passes over the data, evaluating after each.
    num_train_epochs=3,
    eval_strategy="epoch",
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)


In [None]:
# Bundle the model, its hyper-parameters, and the tokenized splits.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_datasets['test'],
    train_dataset=tokenized_datasets['train'],
)

# Fine-tune; as the cell's last expression, the returned TrainOutput is shown.
trainer.train()


Epoch,Training Loss,Validation Loss
1,2.0584,1.90124
2,1.9377,1.787992
3,1.8773,1.761202


TrainOutput(global_step=7782, training_loss=2.0505685681274697, metrics={'train_runtime': 1444.82, 'train_samples_per_second': 43.079, 'train_steps_per_second': 5.386, 'total_flos': 2105952266354688.0, 'train_loss': 2.0505685681274697, 'epoch': 3.0})

In [None]:
# Evaluate the trained model on the eval dataset given to the Trainer and
# print the returned metrics.
results = trainer.evaluate()
print(results)


{'eval_loss': 1.7612017393112183, 'eval_runtime': 30.5863, 'eval_samples_per_second': 169.586, 'eval_steps_per_second': 21.219, 'epoch': 3.0}
