In [1]:
import pandas as pd

# Path of the Excel workbook that holds the labelled tweets
file_path = 'Türkçe Nefret Söylemi Veriseti.xlsx'

# Open the workbook inside a context manager so the file handle is released as
# soon as the sheets are read (an open handle keeps the file locked on Windows).
# `xlsx` stays bound after the block but is closed; reopen the file if more
# sheets have to be parsed later.
with pd.ExcelFile(file_path) as xlsx:
    # All sheet names, in workbook order
    sheet_names = xlsx.sheet_names

    # Keep every sheet except the first one and the one named "TOPLAM"
    sheets_to_merge = [sheet for sheet in sheet_names if sheet != "TOPLAM" and sheet != sheet_names[0]]

    # Stack the selected sheets into one frame; header=1 takes the second
    # spreadsheet row as the column header of each sheet
    merged_data = pd.concat([xlsx.parse(sheet, header=1) for sheet in sheets_to_merge], ignore_index=True)

# Optionally persist the merged data as CSV
#merged_data.to_csv('merged_sheets.csv', index=False)

print(merged_data.head())  # Display the first few rows
merged_data.info()  # info() prints its report itself and returns None, so it is not wrapped in print()
print(merged_data.columns)

import nltk

nltk.download('stopwords')

from nltk.corpus import stopwords
from snowballstemmer import TurkishStemmer

# Report missing values on the raw columns, before any text processing:
# the token-level step below calls str.split() on every row and could not
# run on a missing value, so counting afterwards would never report one.
print("Boş Tweet sayısı:", merged_data['Tweet'].isnull().sum())
print("Boş Etiket sayısı:", merged_data['Etiket'].isnull().sum())

# Normalise the tweet text:
#  1. Turkish-aware lower-casing. Python's str.lower() is locale-independent
#     and turns 'I' into 'i', whereas Turkish pairs 'I' with dotless 'ı' and
#     'İ' with 'i'; map those two letters explicitly before lower-casing.
#  2. Drop every character that is neither a word character nor whitespace.
#  3. Turn missing / non-text entries into empty strings so the per-row
#     tokenisation below always receives a str.
merged_data['Tweet'] = (
    merged_data['Tweet']
    .str.replace('I', 'ı', regex=False)
    .str.replace('İ', 'i', regex=False)
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    .fillna('')
)

stop_words = set(stopwords.words('turkish'))
stemmer = TurkishStemmer()

# Single pass per tweet: drop Turkish stop words, then stem the remaining tokens.
merged_data['Tweet'] = merged_data['Tweet'].apply(
    lambda x: ' '.join(stemmer.stemWord(word) for word in x.split() if word not in stop_words)
)


  from pandas.core import (


   row ID                                              Tweet     Etiket  \
0  Row589  ya orospu cocuklari hepiniz niye ayni anda yaz...     nefret   
1  Row593  Ciddiye alan tüm dünyanın beynini sileyim.. \n...  saldırgan   
2  Row600  Kayıtlı İstihdama geçiş programına göre (?)\nŞ...    hiçbiri   
3  Row604  Hastaneye git Suriyeli. PTT ye git Suriyeli. P...     nefret   
4  Row607               Cölesi bitmiş suriyeli gibiyim bugün    hiçbiri   

  Alt Etiket   Etiket.1 Alt Etiket.1   Etiket.2 Alt Etiket.2 Etiket.3  \
0      etnik     nefret        etnik     nefret        etnik      NaN   
1        NaN  saldırgan          NaN  saldırgan          NaN      NaN   
2        NaN    hiçbiri          NaN    hiçbiri          NaN      NaN   
3      etnik     nefret        etnik     nefret        etnik      NaN   
4        NaN     nefret        etnik    hiçbiri          NaN  hiçbiri   

  Alt Etiket.3  ...    User - Location User - Time Zone User - Statuses  \
0          NaN  ...            göky

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Boş Tweet sayısı: 0
Boş Etiket sayısı: 0


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
# Make sure the NLTK stop-word corpus is available and load the Turkish list.
# NOTE(review): `nltk` and `stopwords` are not imported in this cell; they are
# expected to still be in scope from an earlier cell.
nltk.download('stopwords')
turkish_stop_words = stopwords.words('turkish')
# Separate the tweet texts from their labels
texts = merged_data['Tweet']
labels = merged_data['Etiket']

# Encode the string labels as integer class ids.
# `labels` is rebound here from the label column to the encoded array.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

# 80/20 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Define the TF-IDF vectorizer
max_features = 5000  # upper bound on the vocabulary size (number of TF-IDF columns)
vectorizer = TfidfVectorizer(max_features=max_features, stop_words=turkish_stop_words)

# Fit the vocabulary on the training split only, then reuse it for the test
# split; .toarray() converts the sparse TF-IDF matrices to dense arrays.
X_train_tfidf = vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = vectorizer.transform(X_test).toarray()

# Sanity check: matrix shapes and the first training vector
print("Eğitim verisi şekli:", X_train_tfidf.shape)
print("Test verisi şekli:", X_test_tfidf.shape)
print("TF-IDF örnek değerler:", X_train_tfidf[0])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Eğitim verisi şekli: (8179, 5000)
Test verisi şekli: (2045, 5000)
TF-IDF örnek değerler: [0. 0. 0. ... 0. 0. 0.]


In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam, RMSprop, SGD

def create_ann_model(hyperparameters, input_dim, num_classes=None):
    """
    Build and compile a fully connected Keras classifier from a hyperparameter dict.

    Parameters
    ----------
    hyperparameters : dict
        Must contain 'learning_rate', 'num_layers', 'neurons_per_layer',
        'activation' and 'optimizer' ('Adam', 'RMSprop' or 'SGD'). Any other
        keys are ignored by this function.
    input_dim : int
        Number of input features.
    num_classes : int, optional
        Number of target classes. With the default (None) or a value of at
        most 2 the model ends in a single sigmoid unit trained with binary
        cross-entropy, exactly as before. With more than two classes it ends
        in a softmax layer of that width trained with sparse categorical
        cross-entropy, which expects integer class ids as targets.

    Returns
    -------
    A compiled ``Sequential`` model that reports the 'accuracy' metric.

    Raises
    ------
    ValueError
        If the optimizer name is not one of the supported names.
    """
    learning_rate = hyperparameters['learning_rate']
    num_layers = hyperparameters['num_layers']
    neurons_per_layer = hyperparameters['neurons_per_layer']
    activation = hyperparameters['activation']
    optimizer_name = hyperparameters['optimizer']

    # Optimizer selection; fail loudly on an unknown name instead of leaving
    # `optimizer` unbound and crashing later at compile time.
    if optimizer_name == 'Adam':
        optimizer = Adam(learning_rate=learning_rate)
    elif optimizer_name == 'RMSprop':
        optimizer = RMSprop(learning_rate=learning_rate)
    elif optimizer_name == 'SGD':
        optimizer = SGD(learning_rate=learning_rate)
    else:
        raise ValueError(
            f"Unsupported optimizer {optimizer_name!r}; expected 'Adam', 'RMSprop' or 'SGD'"
        )

    # Output head: binary (default) or multi-class
    if num_classes is None or num_classes <= 2:
        output_units, output_activation, loss = 1, 'sigmoid', 'binary_crossentropy'
    else:
        output_units, output_activation, loss = num_classes, 'softmax', 'sparse_categorical_crossentropy'

    model = Sequential()

    # First hidden layer, which also fixes the input width
    model.add(Dense(neurons_per_layer, activation=activation, input_dim=input_dim))
    model.add(Dropout(0.2))

    # Remaining hidden layers (num_layers counts the first one as well),
    # each followed by 20% dropout
    for _ in range(1, num_layers):
        model.add(Dense(neurons_per_layer, activation=activation))
        model.add(Dropout(0.2))

    # Output layer
    model.add(Dense(output_units, activation=output_activation))

    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    return model



In [4]:
import random

# Hyperparameter search space.
# Two-element numeric lists are used as [low, high] bounds by the sampling code
# for 'learning_rate', 'num_layers' and 'filters_per_layer'; every other list
# is a set of discrete options to pick from.
# 'filters_per_layer', 'kernel_size' and 'pooling_type' are sampled and mutated
# but are not read by create_ann_model or evaluate_fitness as defined in this
# notebook.
hyperparameter_space = dict(
    learning_rate=[0.0001, 0.1],
    batch_size=[16, 32, 64, 128, 256],
    num_layers=[1, 5],
    filters_per_layer=[8, 128],
    kernel_size=[(3, 3), (5, 5)],
    neurons_per_layer=[32, 128],
    activation=['relu', 'tanh', 'sigmoid'],
    pooling_type=['MaxPooling', 'AveragePooling'],
    optimizer=['SGD', 'Adam', 'RMSprop'],
)

# Population initialisation
def initialize_population(pop_size, hyperparameter_space):
    """
    Draw `pop_size` random individuals from the search space.

    Each individual is a dict holding one sampled value per gene. The gene
    order inside the dict is fixed, because crossover() cuts individuals by
    position. Returns the individuals as a list.
    """
    space = hyperparameter_space

    def sample_individual():
        # Genes are sampled in dict order; keep this order stable.
        genes = {}
        genes['learning_rate'] = random.uniform(*space['learning_rate'])
        genes['batch_size'] = random.choice(space['batch_size'])
        # [low, high] bounds, both inclusive
        genes['num_layers'] = random.randint(space['num_layers'][0], space['num_layers'][1])
        # Multiples of 8 between the two bounds, upper bound included
        filter_options = range(space['filters_per_layer'][0],
                               space['filters_per_layer'][1] + 1, 8)
        genes['filters_per_layer'] = random.choice(filter_options)
        genes['kernel_size'] = random.choice(space['kernel_size'])
        genes['activation'] = random.choice(space['activation'])
        genes['pooling_type'] = random.choice(space['pooling_type'])
        genes['optimizer'] = random.choice(space['optimizer'])
        genes['neurons_per_layer'] = random.choice(space['neurons_per_layer'])
        return genes

    return [sample_individual() for _ in range(pop_size)]


In [5]:
def evaluate_fitness(hyperparameters, X_train, y_train, X_test, y_test, epochs=1, verbose=1):
    """
    Train one model for the given hyperparameters and return its accuracy.

    Parameters
    ----------
    hyperparameters : dict
        Individual to evaluate; passed on to create_ann_model, and its
        'batch_size' entry is used for training.
    X_train, y_train : training features and targets.
    X_test, y_test : data on which the returned accuracy is measured.
    epochs : int, default 1
        Number of training epochs (the previously hard-coded value).
    verbose : int, default 1
        Keras verbosity for the training call.

    Returns
    -------
    The accuracy reported by ``model.evaluate`` on (X_test, y_test).

    NOTE(review): create_ann_model builds a single-unit sigmoid head by
    default, while the label column appears to hold more than two classes;
    confirm that the targets passed here are binary.
    """
    # Imported locally so this function does not depend on another cell's imports.
    from tensorflow.keras import backend as keras_backend

    # The genetic search builds a large number of short-lived models; reset
    # Keras' global state so memory does not keep growing across evaluations.
    keras_backend.clear_session()

    # The input width is the number of feature columns
    input_dim = X_train.shape[1]
    model = create_ann_model(hyperparameters, input_dim=input_dim)

    # Never request a batch larger than the training set
    batch_size = min(hyperparameters['batch_size'], len(X_train))

    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=verbose)

    # model.evaluate returns [loss, accuracy]; only the accuracy is the fitness
    _, accuracy = model.evaluate(X_test, y_test, verbose=0)
    return accuracy


In [6]:
def mutate(individual, hyperparameter_space):
    """
    Resample one randomly chosen gene of `individual`, in place.

    A gene is picked uniformly among the individual's keys and replaced by a
    fresh draw from the search space (the new value may equal the old one).
    Keys without a sampling rule are left untouched. Returns None.
    """
    space = hyperparameter_space
    gene = random.choice(list(individual.keys()))

    def choice_of(name):
        # Sampler for genes that are a plain pick from a list of options
        return lambda: random.choice(space[name])

    samplers = {
        'learning_rate': lambda: random.uniform(*space['learning_rate']),
        'neurons_per_layer': choice_of('neurons_per_layer'),
        'batch_size': choice_of('batch_size'),
        # [low, high] bounds, both inclusive
        'num_layers': lambda: random.randint(space['num_layers'][0], space['num_layers'][1]),
        # Multiples of 8 between the two bounds, upper bound included
        'filters_per_layer': lambda: random.choice(
            range(space['filters_per_layer'][0], space['filters_per_layer'][1] + 1, 8)
        ),
        'activation': choice_of('activation'),
        'optimizer': choice_of('optimizer'),
        'pooling_type': choice_of('pooling_type'),
        'kernel_size': choice_of('kernel_size'),
    }

    # Only the sampler of the selected gene is ever invoked
    sampler = samplers.get(gene)
    if sampler is not None:
        individual[gene] = sampler()

# Crossover operator
def crossover(parent1, parent2):
    """
    Single-point crossover of two individuals.

    A cut position between 1 and len(parent1) - 1 is drawn; the first child
    takes parent1's genes before the cut and parent2's genes from the cut on,
    the second child the opposite. Genes are cut by position, so both parents
    are expected to list their keys in the same order. The parents are not
    modified. random.randint raises ValueError when parent1 has fewer than
    two genes.
    """
    cut_point = random.randint(1, len(parent1) - 1)
    genes1 = list(parent1.items())
    genes2 = list(parent2.items())
    head1, tail1 = genes1[:cut_point], genes1[cut_point:]
    head2, tail2 = genes2[:cut_point], genes2[cut_point:]
    child1 = dict(head1 + tail2)
    child2 = dict(head2 + tail1)
    return child1, child2


In [7]:
def tournament_selection(population, fitness_scores, tournament_size=5):
    """
    Tournament selection: pick len(population) parents, each the fittest of a
    random draw of individuals.

    Parameters
    ----------
    population : list
        Candidate individuals.
    fitness_scores : list
        Fitness of each individual, aligned with `population`.
    tournament_size : int, default 5
        Number of distinct individuals drawn per tournament. It is capped at
        the number of available candidates, so a small population no longer
        makes random.sample fail.

    Returns
    -------
    A list of len(population) winners. Winners are the original objects (not
    copies) and the same individual may appear several times.

    Raises
    ------
    ValueError
        If a tournament would have no participants (tournament_size < 1, or
        no fitness scores for a non-empty population).
    """
    if not population:
        return []

    # Pair each individual with its score once, outside the loop
    contestants = list(zip(population, fitness_scores))
    draw_size = min(tournament_size, len(contestants))
    if draw_size < 1:
        raise ValueError("a tournament needs at least one participant")

    selected = []
    for _ in range(len(population)):
        # Draw the participants without replacement
        tournament = random.sample(contestants, draw_size)
        # The participant with the highest fitness wins
        winner = max(tournament, key=lambda x: x[1])
        selected.append(winner[0])
    return selected

In [8]:
def survivor_selection(population, fitness_scores, pop_size):
    """
    Fitness-based survivor selection: keep the `pop_size` fittest individuals.

    The individuals are ranked by their score, highest first (individuals with
    equal scores keep their relative order), and the first `pop_size` of them
    are returned. Only the population passed in is considered; nothing is
    merged with an earlier generation here.
    """
    # Pair every score with its individual, then rank by score only
    ranked = list(zip(fitness_scores, population))
    ranked.sort(key=lambda pair: pair[0], reverse=True)

    # Strip the scores and keep the leading pop_size individuals
    survivors = [individual for _, individual in ranked[:pop_size]]
    return survivors

In [9]:
def genetic_algorithm(X_train, y_train, X_test, y_test, hyperparameter_space,
                      pop_size=10, generations=10, mutation_prob=0.05, crossover_prob=0.3, tournament_size=5):
    """
    Run a genetic hyperparameter search and return the best individual found.

    Per generation: the current population is scored, parents are chosen by
    tournament selection, offspring are produced by crossover and/or mutation,
    and the fittest offspring become the next population.

    Every individual is scored exactly once (one training run each): the
    scores computed for the offspring are carried over to the next generation
    instead of training the survivors a second time, and the last generation's
    survivors are taken into account for the overall best as well.

    Parameters
    ----------
    X_train, y_train, X_test, y_test
        Passed unchanged to evaluate_fitness for every individual.
        NOTE(review): fitness is the accuracy measured on X_test / y_test, so
        that data also drives the selection; consider a separate validation
        split if an unbiased final test score is needed.
    hyperparameter_space : dict
        Search space used for initialisation and mutation.
    pop_size : int
        Number of individuals kept per generation.
    generations : int
        Number of generations to run.
    mutation_prob : float
        Probability that both children of a crossover are also mutated.
    crossover_prob : float
        Probability that an offspring step uses crossover; otherwise a single
        parent is copied and mutated.
    tournament_size : int
        Individuals drawn per tournament (capped at the population size).

    Returns
    -------
    (best_chromosome, best_accuracy, generation_accuracies)
        best_chromosome is a copy of the best individual (None if nothing was
        evaluated), best_accuracy its score, and generation_accuracies holds
        one value per generation: the best score of the population that
        entered that generation.
    """
    def score(individuals):
        # One model is trained per individual, so this is the expensive step
        return [evaluate_fitness(ind, X_train, y_train, X_test, y_test) for ind in individuals]

    population = initialize_population(pop_size, hyperparameter_space)
    fitness_scores = None  # scored lazily so that generations=0 trains nothing
    generation_accuracies = []  # best score of each generation's population

    # Best score and individual seen so far
    best_accuracy = 0
    best_chromosome = None

    for generation in range(generations):
        # Only the initial population still needs scoring here; later
        # populations arrive with the scores computed for them as offspring.
        if fitness_scores is None:
            fitness_scores = score(population)

        generation_best_accuracy = max(fitness_scores)
        generation_accuracies.append(generation_best_accuracy)

        if generation_best_accuracy > best_accuracy:
            best_accuracy = generation_best_accuracy
            # Store a copy so later in-place changes cannot alter the result
            best_chromosome = dict(population[fitness_scores.index(generation_best_accuracy)])

        # Parent selection: tournament
        selected_population = tournament_selection(
            population, fitness_scores, min(tournament_size, len(population))
        )

        # Crossover needs two parents to sample from
        can_crossover = len(selected_population) >= 2

        # Build the offspring; a crossover adds two children at once, so the
        # list may end up one longer than pop_size before survivor selection.
        next_population = []
        while len(next_population) < pop_size:
            if can_crossover and random.random() < crossover_prob:
                parent1, parent2 = random.sample(selected_population, 2)
                child1, child2 = crossover(parent1, parent2)
                if random.random() < mutation_prob:
                    mutate(child1, hyperparameter_space)
                    mutate(child2, hyperparameter_space)
                next_population += [child1, child2]
            else:
                # Mutation only: copy first, mutate() works in place
                child = random.choice(selected_population).copy()
                mutate(child, hyperparameter_space)
                next_population.append(child)

        # Survivor selection on the offspring, scored once
        offspring_scores = score(next_population)
        population = survivor_selection(next_population, offspring_scores, pop_size)

        # Re-attach each survivor's score. Every offspring is a distinct dict
        # object, so identity is a safe lookup key regardless of the order in
        # which the survivors are returned.
        score_by_identity = {id(ind): value for ind, value in zip(next_population, offspring_scores)}
        fitness_scores = [score_by_identity[id(ind)] for ind in population]

        print(f"Generation {generation + 1}, Best Accuracy (Current): {generation_best_accuracy:.4f}")
        print(f"Overall Best Accuracy: {best_accuracy:.4f}")

    # The survivors of the last generation have been scored but not yet
    # compared against the overall best.
    if fitness_scores:
        final_best_accuracy = max(fitness_scores)
        if final_best_accuracy > best_accuracy:
            best_accuracy = final_best_accuracy
            best_chromosome = dict(population[fitness_scores.index(final_best_accuracy)])

    print("\nFinal Best Chromosome and Accuracy:")
    print(f"Best Chromosome: {best_chromosome}")
    print(f"Best Accuracy: {best_accuracy:.4f}")

    return best_chromosome, best_accuracy, generation_accuracies

In [10]:
# Settings shared by both experiments; only the number of generations differs.
_ga_fixed_settings = dict(
    hyperparameter_space=hyperparameter_space,
    pop_size=10, mutation_prob=0.05, crossover_prob=0.3,
)

# Experiment 1: population of 10, 10 generations
print("\nRunning GA with 10 population and 10 generations...\n")
(best_chromosome_10_10,
 best_accuracy_10_10,
 generation_accuracies_10_10) = genetic_algorithm(
    X_train_tfidf, y_train, X_test_tfidf, y_test,
    generations=10, **_ga_fixed_settings
)

# Report the outcome
print("\nResults for 10 Population and 10 Generations:")
print(f"Best Chromosome: {best_chromosome_10_10}")
print(f"Best Accuracy: {best_accuracy_10_10:.4f}")

# Experiment 2: population of 10, 50 generations
print("\nRunning GA with 10 population and 50 generations...\n")
(best_chromosome_10_50,
 best_accuracy_10_50,
 generation_accuracies_10_50) = genetic_algorithm(
    X_train_tfidf, y_train, X_test_tfidf, y_test,
    generations=50, **_ga_fixed_settings
)

# Report the outcome
print("\nResults for 10 Population and 50 Generations:")
print(f"Best Chromosome: {best_chromosome_10_50}")
print(f"Best Accuracy: {best_accuracy_10_50:.4f}")

# Difference between the best accuracies of the two runs
print("\nComparison:")
print(f"Accuracy Improvement: {best_accuracy_10_50 - best_accuracy_10_10:.4f}")


Running GA with 10 population and 10 generations...

Generation 1, Best Accuracy (Current): 0.8411
Overall Best Accuracy: 0.8411
Generation 2, Best Accuracy (Current): 0.8416
Overall Best Accuracy: 0.8416
Generation 3, Best Accuracy (Current): 0.8504
Overall Best Accuracy: 0.8504
Generation 4, Best Accuracy (Current): 0.8460
Overall Best Accuracy: 0.8504
Generation 5, Best Accuracy (Current): 0.8484
Overall Best Accuracy: 0.8504
Generation 6, Best Accuracy (Current): 0.8469
Overall Best Accuracy: 0.8504
Generation 7, Best Accuracy (Current): 0.8494
Overall Best Accuracy: 0.8504
Generation 8, Best Accuracy (Current): 0.8465
Overall Best Accuracy: 0.8504
Generation 9, Best Accuracy (Current): 0.8474
Overall Best Accuracy: 0.8504
Generation 10, Best Accuracy (Current): 0.8455
Overall Best Accuracy: 0.8504

Final Best Chromosome and Accuracy:
Best Chromosome: {'learning_rate': 0.02933297374961518, 'batch_size': 64, 'num_layers': 2, 'filters_per_layer': 72, 'kernel_size': (3, 3), 'activati