In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("../data/preprocessed_review.csv")

In [3]:
def label_review(rate):
    """Map a numeric star rating to a coarse sentiment label.

    Args:
        rate: Star rating of a single review.

    Returns:
        ``'positive'`` for ratings of 4 and above, ``'neutral'`` for ratings
        from 3 up to (but not including) 4, ``'negative'`` for anything lower,
        and ``None`` when the rating is missing (``None`` / ``NaN`` / ``pd.NA``).
    """
    # Every comparison against NaN is False, so without this guard a missing
    # rating would fall through to the last branch and be counted as negative.
    if pd.isna(rate):
        return None
    if rate >= 4:
        return 'positive'
    # `>= 3` rather than `== 3` so that a fractional rating such as 3.5 is not
    # misfiled as negative.
    if rate >= 3:
        return 'neutral'
    return 'negative'

# Label every review from its star rating ("Puan" column), then show how many
# reviews fall under each label.
df['sentiment'] = df['Puan'].map(label_review)
df['sentiment'].value_counts()

sentiment
positive    45656
negative    18419
neutral      2219
Name: count, dtype: int64

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.nn.functional import softmax
from tqdm import tqdm

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
# Use the GPU when one is present; otherwise fall back to CPU so that loading
# the model does not fail on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
# Inference only: put the module in evaluation mode.
model.eval()

def predict_star_scores(texts, batch_size=64):
    """Predict a star rating for each text with the loaded sentiment model.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        texts: Iterable of strings to score.
        batch_size: Number of texts tokenized and scored per forward pass;
            must be a positive integer.

    Returns:
        A list with one dict per input text, in input order. ``"score"`` is
        the most probable class index plus one (the 1–5 star rating) and
        ``"distribution"`` is the list of softmax probabilities over the
        model's output classes.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    # A negative step would make range() empty and silently return no results.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Materialize once so slicing is positional and each batch is a plain list.
    texts = list(texts)
    # Send inputs to wherever the model actually lives instead of assuming CUDA.
    device = model.device

    results = []
    with torch.no_grad():
        for start in tqdm(range(0, len(texts), batch_size)):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
            probs = softmax(model(**inputs).logits, dim=1)
            stars = torch.argmax(probs, dim=1) + 1  # 0-based class index -> 1–5 stars
            # Convert once per batch rather than once per row: each .item() /
            # .tolist() on a GPU tensor is a separate device-to-host transfer.
            results.extend(
                {"score": score, "distribution": distribution}
                for score, distribution in zip(stars.tolist(), probs.tolist())
            )
    return results


In [6]:
import ast

# Each non-null "lemmas" cell is parsed as a Python literal (presumably a list
# of token strings) and its items are joined with spaces into one text per row.
texts = [" ".join(ast.literal_eval(raw)) for raw in df["lemmas"].dropna()]
results = predict_star_scores(texts)

100%|██████████████████████████████████████████████████████████████████████████████| 1036/1036 [02:28<00:00,  6.96it/s]


In [8]:
# Keep only the rows that were scored (non-null "lemmas") and attach the
# predicted star rating; `results` is in the same row order.
df_filtered = df[df["lemmas"].notna()].copy()
df_filtered["sentiment_star"] = [prediction["score"] for prediction in results]

In [10]:
# Frequency of each predicted star rating, most common first.
print(df_filtered["sentiment_star"].value_counts(ascending=False))

sentiment_star
5    31759
1    19107
3    11272
4     3092
2     1064
Name: count, dtype: int64


In [12]:

# Persist the lemmas together with their predicted star rating (no index column).
df_filtered.to_csv("sentiment_results.csv", columns=["lemmas", "sentiment_star"], index=False)


In [18]:
from sklearn.metrics import accuracy_score
# `results` holds one prediction per row whose "lemmas" is non-null, in
# DataFrame order. Attach the predictions to exactly those rows first, and only
# then drop rows without a true rating; filtering on "Puan" before the
# assignment would cause a length mismatch as soon as any rating is missing.
df_filtered = df.loc[df["lemmas"].notna()].copy()
df_filtered["pred_star"] = [int(r["score"]) for r in results]
df_filtered = df_filtered.loc[df_filtered["Puan"].notna()].copy()

# Exact-match accuracy of the predicted star against the true rating.
acc = accuracy_score(df_filtered["Puan"], df_filtered["pred_star"])
print(f"Doğruluk: %{acc*100:.2f}")


Doğruluk: %57.29


In [19]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 of the predicted stars against the true ratings.
print(classification_report(y_true=df_filtered["Puan"], y_pred=df_filtered["pred_star"]))


              precision    recall  f1-score   support

           1       0.57      0.65      0.61     16756
           2       0.03      0.02      0.02      1663
           3       0.04      0.20      0.06      2219
           4       0.07      0.07      0.07      3311
           5       0.83      0.62      0.71     42345

    accuracy                           0.57     66294
   macro avg       0.31      0.31      0.30     66294
weighted avg       0.68      0.57      0.61     66294

