In [10]:
# --- Standard library ---
import os
import pickle
import re
import warnings

# Logging configuration has to be in place BEFORE TensorFlow is imported:
# TensorFlow's native (C++) logger picks up TF_CPP_MIN_LOG_LEVEL when the
# library starts, so assigning it after `import tensorflow` is too late to
# quiet the import-time messages. '2' filters out INFO and WARNING messages.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Silence FutureWarning noise, including any raised while importing below.
warnings.filterwarnings("ignore", category=FutureWarning)

# --- Third-party ---
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# 1. Load the dataset
# Both CSVs are read from the working directory; later cells use their
# 'text' and 'label' columns.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# 2. Ön İşleme (PREPROCESSING)
def preprocess(text):
    """Normalise one raw text sample before tokenisation.

    Args:
        text: The raw sample. Anything that is not a ``str`` is treated as
            missing.

    Returns:
        ``""`` for non-string input. Otherwise the lower-cased text with links,
        @mentions, digits and punctuation removed (in that order) and the
        leading/trailing whitespace stripped. Removed spans are deleted, not
        replaced by a space, so inner whitespace runs are left as they are.
    """
    if not isinstance(text, str):
        return ""

    # Handle the Turkish capitals by hand before lower(): plain str.lower()
    # maps 'I' to dotted 'i' and 'İ' to 'i' plus a combining dot.
    lowered = text.replace('İ', 'i').replace('I', 'ı').lower()

    removal_patterns = (
        r'http\S+|www\S+|https\S+',  # links
        r'@\w+',                     # @user mentions
        r'\d+',                      # numbers
        r'[^\w\s]',                  # punctuation
    )
    cleaned = lowered
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.strip()

print("Veriler temizleniyor...")
# Add a cleaned copy of the raw 'text' column to each split.
for frame in (train_df, test_df):
    frame['clean_text'] = frame['text'].apply(preprocess)

# 3. Encode the string labels as integers.
# The encoder is fitted on the training labels only; the test labels are
# transformed with that same fitted encoder.
le = LabelEncoder().fit(train_df['label'])
train_df['label_num'] = le.transform(train_df['label'])
test_df['label_num'] = le.transform(test_df['label'])

# Class name -> integer id, printed for reference.
class_ids = le.transform(le.classes_)
mapping = {name: idx for name, idx in zip(le.classes_, class_ids)}
print("Etiket eşleşmeleri:", mapping)

# 4. Tokenization
max_words = 15000  # vocabulary cap handed to Tokenizer(num_words=...)
max_len = 100      # every sequence is padded/truncated to this many tokens

# lower=False: the text has already been lower-cased by preprocess().
tokenizer = Tokenizer(num_words=max_words, lower=False)
# The vocabulary is built from the training text only.
tokenizer.fit_on_texts(train_df['clean_text'])

X_train = pad_sequences(tokenizer.texts_to_sequences(train_df['clean_text']), maxlen=max_len)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_df['clean_text']), maxlen=max_len)

# One-hot targets. The width comes from the fitted encoder instead of a
# hard-coded 3, so it stays correct if the label set changes.
num_classes = len(le.classes_)
y_train = tf.keras.utils.to_categorical(train_df['label_num'], num_classes)
y_test = tf.keras.utils.to_categorical(test_df['label_num'], num_classes)


Veriler temizleniyor...
Etiket eşleşmeleri: {'Negative': 0, 'Notr': 1, 'Positive': 2}


In [11]:
# LSTM model
# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling repeat when this cell is re-run.
tf.keras.utils.set_random_seed(42)

# Size the output layer from the fitted label encoder rather than a literal 3.
n_labels = len(le.classes_)

model = Sequential([
    # `input_length` is deprecated in Keras 3 and no longer needed; the
    # sequence length is taken from the data passed to fit().
    Embedding(max_words, 128),
    SpatialDropout1D(0.3),
    LSTM(128, dropout=0.2),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(n_labels, activation='softmax')
])

# categorical_crossentropy matches the one-hot targets in y_train.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

## Model training
# NOTE(review): validation_split holds out the LAST 10% of the rows, taken
# before shuffling - assumes train.csv is not ordered by label; confirm.
# The History object is kept so the curves can be inspected later and its
# repr does not leak into the cell output.
history = model.fit(
    X_train, y_train,
    epochs=3,
    batch_size=256,
    validation_split=0.1,
    verbose=1
)

Epoch 1/3




[1m1550/1550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m356s[0m 228ms/step - accuracy: 0.9172 - loss: 0.2218 - val_accuracy: 0.9364 - val_loss: 0.1691
Epoch 2/3
[1m1550/1550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m244s[0m 157ms/step - accuracy: 0.9423 - loss: 0.1572 - val_accuracy: 0.9400 - val_loss: 0.1626
Epoch 3/3
[1m1550/1550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m244s[0m 157ms/step - accuracy: 0.9488 - loss: 0.1393 - val_accuracy: 0.9394 - val_loss: 0.1615


<keras.src.callbacks.history.History at 0x18a5b4cdca0>

In [12]:
# Persist the model and the objects needed to use it later.
model.save('turkce_sentiment_model.h5')

# The tokenizer and label encoder are pickled next to the model so that new
# text can be encoded, and predictions decoded, the same way as here.
for pickle_path, artifact in (
    ('tokenizer.pickle', tokenizer),
    ('label_encoder.pickle', le),
):
    with open(pickle_path, 'wb') as out_file:
        pickle.dump(artifact, out_file)

print("Model ve sözlük kaydedildi.")



Model ve sözlük kaydedildi.


In [13]:
# Evaluate the model on the test split.
print("\nFinal Test Sonucu:")
# evaluate() yields the loss followed by the compiled metric (accuracy).
test_scores = model.evaluate(X_test, y_test)
loss, acc = test_scores
print(f"Test Seti Doğruluğu: %{acc*100:.2f}")

# Tahmin Denemesi
def tahmin_yap(cumle):
    """Predict the sentiment label of a single sentence.

    The sentence goes through the same steps as the training text:
    ``preprocess`` -> ``tokenizer`` -> padding to ``max_len``.

    Args:
        cumle: The raw sentence to classify.

    Returns:
        The entry of ``le.classes_`` at the index of the highest model output.
    """
    cleaned = preprocess(cumle)
    padded = pad_sequences(tokenizer.texts_to_sequences([cleaned]), maxlen=max_len)
    scores = model.predict(padded, verbose=0)
    # Only one sentence is scored, so the flat argmax is its class index.
    return le.classes_[np.argmax(scores)]

# Smoke-test tahmin_yap on one hand-written sentence (roughly: "The product
# reached me very quickly, the packaging was great!").
ornek = "Ürün elime çok hızlı ulaştı, paketleme harikaydı!"
print(f"\nÖrnek: {ornek} -> Tahmin: {tahmin_yap(ornek)}")


Final Test Sonucu:
[1m1531/1531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 9ms/step - accuracy: 0.9388 - loss: 0.1647
Test Seti Doğruluğu: %93.88





Örnek: Ürün elime çok hızlı ulaştı, paketleme harikaydı! -> Tahmin: Positive
