# Turkish Game Review Sentiment/Topic Classification: Model Training Pipeline

This notebook demonstrates the full workflow for training and evaluating two NLP models (Logistic Regression and BERTurk) on Turkish game review data.

## 1. Install & Import Required Libraries

Install dependencies (if running in a new environment) and import all necessary libraries.

In [1]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

# For BERT
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, load_metric
# ZEMBEREK PYTHON FOR LEMMATIZATION
from zemberek import (
    TurkishMorphology,
    TurkishSentenceNormalizer,
    TurkishSpellChecker,
    TurkishTokenizer
)


# Additional imports for constants
import json

## 2. Data Processing

In [None]:
def read_excel_file(file_path):
    """
    Load an Excel workbook into a pandas DataFrame.

    Args:
        file_path: Location of the workbook, passed straight to ``pd.read_excel``.

    Returns:
        The loaded DataFrame, or ``None`` when reading raised any exception.
        A status line is printed in both cases.
    """
    try:
        frame = pd.read_excel(file_path)
        print(f"Successfully read the file: {file_path}")
    except Exception as err:
        # Best-effort loader: report the problem and hand back None instead of raising.
        print(f"Error reading the file: {file_path}. Error: {err}")
        frame = None
    return frame

In [None]:
# Location of the Excel workbook with the review data, given relative to the
# directory the notebook is run from.

FILE_PATH = "dataset/cleaned_reviews_zemberek.xlsx"

In [None]:
# Load the workbook into a DataFrame. read_excel_file returns None when the read
# fails, in which case the preview below raises AttributeError.
df = read_excel_file(FILE_PATH)

# Preview the first five rows of the DataFrame
df.head(5)

Successfully read the file: dataset/cleaned_reviews_zemberek.xlsx


Unnamed: 0,review_id,app_id,review_text,Grafik,AI,Oynanis,Ses ve Muzik,Oyun Dunyasi,Topluluk ve Sosyal,Hikaye ve Senaryo,Performans ve Teknik,Genel Duygu
0,1,1245620,İlk 40 saatimde nereye gitmem gerektiğini ne y...,Nötr,Nötr,Olumsuz,Nötr,Nötr,Nötr,Nötr,Nötr,Nötr
1,2,1245620,Bu oyunda Malenia bossunu tasarlıyan arkadaşa ...,Nötr,Nötr,Olumlu,Nötr,Nötr,Nötr,Nötr,Nötr,Nötr
2,3,1245620,Güzel oyun atmosfer kontroller vs. güzel ama ...,Olumlu,Nötr,Olumlu,Nötr,Olumlu,Nötr,Nötr,Nötr,Nötr
3,4,1245620,Oyunun devasa bir haritası var açık dünya olma...,Nötr,Olumlu,Olumlu,Nötr,Olumlu,Nötr,Nötr,Nötr,Nötr
4,5,1245620,oynu bitirdiğinizde huzurluca ölebilirisiniz,Nötr,Nötr,Nötr,Nötr,Nötr,Nötr,Nötr,Nötr,Nötr


In [None]:
def normalize_text(text):
    """
    Normalize text: lowercase it, replace ASCII punctuation with spaces, and
    collapse runs of whitespace.

    Args:
        text (str): Raw text to normalize.

    Returns:
        str: Lowercased text in which every character of ``string.punctuation``
        has been replaced by a space, consecutive whitespace has been collapsed
        to a single space, and leading/trailing whitespace has been stripped.
    """
    # str.lower() maps the Turkish dotted capital 'İ' (U+0130) to 'i' followed by
    # U+0307 (combining dot above), which would leave a stray combining mark in
    # the output. Map it to a plain 'i' before lowercasing. The ASCII 'I' is
    # still lowercased to 'i' exactly as before.
    text = text.replace("\u0130", "i").lower()
    # Punctuation becomes a space (not an empty string) so that words joined only
    # by punctuation, e.g. "a,b", stay separate tokens.
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def replace_turkish_characters(text):
    """
    Transliterate Turkish-specific letters to their closest ASCII letters.

    Both lowercase and uppercase forms are handled; every other character is
    returned unchanged.
    """
    ascii_by_turkish = {
        'ç': 'c', 'ğ': 'g', 'ı': 'i', 'ö': 'o', 'ş': 's', 'ü': 'u',
        'Ç': 'C', 'Ğ': 'G', 'İ': 'I', 'Ö': 'O', 'Ş': 'S', 'Ü': 'U'
    }
    # Every key is a single character, so one translate pass is equivalent to
    # replacing the letters one after another.
    return text.translate(str.maketrans(ascii_by_turkish))

# Lemmatize the text using Zemberek
from zemberek.morphology import TurkishMorphology
from zemberek.tokenization import TurkishTokenizer
from zemberek.normalization import TurkishSentenceNormalizer

# Shared Zemberek objects. They do not depend on the input text, so they are
# built on first use and reused instead of being rebuilt for every call.
_ZEMBEREK_RESOURCES = {}


def _get_zemberek_resources():
    """Return the shared (morphology, normalizer) pair, creating it on first use."""
    if "morphology" not in _ZEMBEREK_RESOURCES:
        morphology = TurkishMorphology.create_with_defaults()
        # Store the normalizer first: if its construction fails, the "morphology"
        # key stays absent and the next call retries the whole setup.
        _ZEMBEREK_RESOURCES["normalizer"] = TurkishSentenceNormalizer(morphology)
        _ZEMBEREK_RESOURCES["morphology"] = morphology
    return _ZEMBEREK_RESOURCES["morphology"], _ZEMBEREK_RESOURCES["normalizer"]


def lemmatize_text(text):
    """
    Lemmatize the text using Zemberek.

    The text is first passed through ``TurkishSentenceNormalizer.normalize``,
    then tokenized with ``TurkishTokenizer.DEFAULT``, and each token is replaced
    by the lemma of its best morphological analysis.

    Args:
        text: Sentence or review text to lemmatize.

    Returns:
        str: The lemmas joined by single spaces.
    """
    morphology, normalizer = _get_zemberek_resources()

    # Normalize the text
    normalized_text = normalizer.normalize(text)

    # Tokenize and lemmatize
    tokens = TurkishTokenizer.DEFAULT.tokenize(normalized_text)

    # NOTE(review): tokens are passed to morphology.analyze exactly as the
    # tokenizer returns them, and the result is expected to expose
    # best_analysis().lemma - confirm both against the installed zemberek version.
    lemmatized_tokens = [morphology.analyze(token).best_analysis().lemma for token in tokens]

    return ' '.join(lemmatized_tokens)

# Example usage: lemmatize one sample sentence and print the result.
print(lemmatize_text("Bu bir örnek cümledir."))


def preprocess_text(text):
    """
    Run the basic cleaning pipeline on a piece of text.

    The text goes through ``normalize_text`` first and the result is then passed
    to ``replace_turkish_characters``; the output of that second step is returned.
    """
    return replace_turkish_characters(normalize_text(text))