In [3]:
!pip install --upgrade transformers



In [6]:
print(transformers.__version__)

4.51.3


In [5]:
import pandas as pd
import numpy as np
import transformers
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

In [7]:
train_df = pd.read_csv("/content/processed_egitim_verisi.csv")
label_encoder = LabelEncoder()
train_df['label_id'] = label_encoder.fit_transform(train_df['label'])

In [8]:
model_name = "dbmdz/bert-base-turkish-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [14]:
def tokenize(batch):
    tokens = tokenizer(batch['comment'], padding=True, truncation=True, max_length=256)
    tokens["labels"] = batch["label_id"]
    return tokens

dataset = Dataset.from_pandas(train_df[['comment', 'label_id']])
dataset = dataset.train_test_split(test_size=0.2, seed=42)
dataset = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/1906 [00:00<?, ? examples/s]

Map:   0%|          | 0/477 [00:00<?, ? examples/s]

In [15]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir="./logs",
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
)

In [17]:
trainer.train()

Step,Training Loss
500,0.3699


TrainOutput(global_step=600, training_loss=0.31157629648844404, metrics={'train_runtime': 102.6178, 'train_samples_per_second': 92.869, 'train_steps_per_second': 5.847, 'total_flos': 1253757948840960.0, 'train_loss': 0.31157629648844404, 'epoch': 5.0})

In [18]:
preds = trainer.predict(dataset['test'])
y_pred = np.argmax(preds.predictions, axis=1)
print(classification_report(dataset['test']['label_id'], y_pred, target_names=label_encoder.classes_))

                        precision    recall  f1-score   support

              basyapit       0.87      0.84      0.86        81
beklentiyi_karsilamadi       0.82      0.73      0.77       104
        izlemeye_deger       0.76      0.78      0.77        87
                 vasat       0.72      0.78      0.75        93
           zaman_kaybi       0.90      0.93      0.91       112

              accuracy                           0.82       477
             macro avg       0.81      0.81      0.81       477
          weighted avg       0.82      0.82      0.82       477



In [19]:
unlabeled_df = pd.read_csv("/content/processed_turkish_movie_review_dataset.csv")

In [20]:
unlabeled_df.head()

Unnamed: 0,film_name,point,comment
0,Sevginin Gücü,50,jean reno denince zaten leon filmi gelir akla ...
1,Sevginin Gücü,50,ekşın falan izlemek istiyorsanız eğer bunu izl...
2,Sevginin Gücü,50,bu yapım hakkında oyle çok şey yazabilirim ki ...
3,Sevginin Gücü,50,finali yeter... sting shape of my heartbazılar...
4,Sevginin Gücü,50,jean reno..bu adam kusursuz biri..ve oyunculug...


In [23]:
unlabeled_df = unlabeled_df.dropna(subset=["comment"])
unlabeled_df = unlabeled_df[unlabeled_df["comment"].str.strip() != ""]

In [24]:
def tokenize_predict(batch):
    return tokenizer(batch["comment"], padding=True, truncation=True, max_length=256)

In [25]:
to_predict_dataset = Dataset.from_pandas(unlabeled_df)
to_predict_dataset = to_predict_dataset.map(tokenize_predict, batched=True)

Map:   0%|          | 0/82637 [00:00<?, ? examples/s]

In [26]:
raw_preds = trainer.predict(to_predict_dataset)
predicted_ids = np.argmax(raw_preds.predictions, axis=1)
predicted_labels = label_encoder.inverse_transform(predicted_ids)

In [27]:
unlabeled_df['predicted_label'] = predicted_labels
unlabeled_df.to_csv("etiketlenmis_83k.csv", index=False)
print("83K yorum etiketlendi ve kaydedildi → etiketlenmis_83k.csv")

83K yorum etiketlendi ve kaydedildi → etiketlenmis_83k.csv


In [28]:
duygu_sayilari = unlabeled_df['predicted_label'].value_counts()

print(duygu_sayilari)

predicted_label
izlemeye_deger            27021
basyapit                  21271
vasat                     15256
zaman_kaybi               11885
beklentiyi_karsilamadi     7204
Name: count, dtype: int64


In [29]:
from google.colab import files
files.download('etiketlenmis_83k.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>