<a href="https://colab.research.google.com/github/efemehmetkarabulut/AYRIK-SISTEMLER-ILERI-OLASILIK/blob/main/classification_Naive_Bias_Random_Forest_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

def download_dataset():
    """Read the dataset spreadsheet from its hard-coded Drive path.

    Returns:
        The DataFrame produced by ``pd.read_excel``, or ``None`` when the
        read fails for any reason (the error is printed, not raised).
    """
    file_path = '/content/drive/MyDrive/Text Classification Tez Çalışması/inputs for final task/bbc_data_Format_151K.xlsx'
    try:
        loaded = pd.read_excel(file_path)
    except Exception as e:
        # Best-effort load: report the problem and let the caller bail out.
        print(f"Veri seti yüklenirken hata oluştu: {e}")
        return None
    else:
        return loaded

def clean_text(text):
    """Normalize one document: strip punctuation, squeeze whitespace, lowercase.

    Args:
        text: The raw document. Non-string values (for example the float
            NaN pandas uses for an empty cell) are returned unchanged.

    Returns:
        The cleaned string, or ``text`` itself when it is not a ``str``.
    """
    # Missing cells arrive as non-strings when this is applied over a
    # DataFrame column; pass them through so a later dropna can remove
    # them instead of re.sub raising TypeError.
    if not isinstance(text, str):
        return text
    text = re.sub(r'[^\w\s]', '', text)  # drop everything but word chars and whitespace
    text = re.sub(r'\s+', ' ', text)     # collapse whitespace runs to one space
    return text.lower()

def fill_missing_values(data):
    """Remove every row of ``data`` that contains a missing value.

    Despite the name, nothing is imputed. The frame is modified in place
    and the same object is returned.
    """
    data.dropna(axis=0, how='any', inplace=True)
    return data

def split_data(data):
    """Split the 'text' and 'category' columns into train and test parts.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.
    """
    features, targets = data['text'], data['category']
    # NOTE(review): test_size=0.8 holds out 80% of the rows for testing and
    # trains on the remaining 20% - confirm this is intended.
    return tuple(train_test_split(features, targets, test_size=0.8, random_state=42))

def train_and_evaluate(X_train, X_test, y_train, y_test):
    """Fit three TF-IDF text classifiers and print a report for each.

    Every classifier is wrapped in its own pipeline with a fresh
    TfidfVectorizer (unigrams and bigrams, max_df=0.75), fitted on the
    training split and scored on the test split.
    """
    candidates = (
        ("Naive Bayes", MultinomialNB()),
        ("Random Forest", RandomForestClassifier(random_state=42)),
        ("SVM", SVC(kernel='linear', random_state=42)),
    )

    for label, estimator in candidates:
        print(f"\nModel: {label}")
        vectorizer = TfidfVectorizer(max_df=0.75, ngram_range=(1, 2))
        text_model = Pipeline([('tfidf', vectorizer), ('clf', estimator)])

        text_model.fit(X_train, y_train)
        print(classification_report(y_test, text_model.predict(X_test)))

def main():
    """Load the dataset, prepare it, and run the TF-IDF experiments.

    Returns early (doing nothing) when the dataset cannot be loaded.
    """
    data = download_dataset()
    if data is None:
        return

    # Drop incomplete rows before cleaning so clean_text only ever sees
    # real strings; cleaning first fails on empty (NaN) text cells.
    data = fill_missing_values(data)
    data['text'] = data['text'].apply(clean_text)
    X_train, X_test, y_train, y_test = split_data(data)
    train_and_evaluate(X_train, X_test, y_train, y_test)

if __name__ == "__main__":
    main()



Model: Naive Bayes
                                            precision    recall  f1-score   support

                  Astronomy & Astrophysics       0.84      0.91      0.87      8581
                                  Biology        0.60      0.44      0.51      7154
                                Chemistry        0.86      0.74      0.80      8241
Computer Science, Artificial Intelligence        0.68      0.90      0.77      8662
              Cybersecurity & Cryptography       0.46      0.50      0.48      8601
                      Economics & Business       0.88      0.86      0.87      7400
                     Environmental Science       0.63      0.61      0.62      8431
                                 Geography       0.48      0.89      0.62      9290
                                   History       0.76      0.26      0.39      7724
                   International Relations       0.71      0.27      0.39      8506
       Materials Science, Coatings & Films       0.69  

In [None]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

def download_dataset():
    """Load the dataset spreadsheet, returning ``None`` on any failure.

    The path is hard-coded. A failed read is reported with a printed
    message rather than an exception.
    """
    file_path = '/content/drive/MyDrive/Text Classification Tez Çalışması/inputs for final task/bbc_data_Format_151K.xlsx'
    frame = None
    try:
        frame = pd.read_excel(file_path)
    except Exception as e:
        print(f"Veri seti yüklenirken hata oluştu: {e}")
    return frame

def clean_text(text):
    """Strip punctuation, collapse whitespace runs, and lowercase ``text``.

    Args:
        text: The raw document. Anything that is not a ``str`` (such as
            the NaN pandas stores for an empty cell) is returned as-is.

    Returns:
        The cleaned string, or the untouched input for non-strings.
    """
    # Empty cells are not strings; returning them unchanged lets a later
    # dropna discard the row instead of re.sub raising TypeError here.
    if not isinstance(text, str):
        return text
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.lower()

def fill_missing_values(data):
    """Drop rows with any missing value, in place, and return the frame.

    No values are filled in; incomplete rows are simply discarded.
    """
    data.dropna(how='any', inplace=True)
    return data

def split_data(data):
    """Split 'text' (inputs) and 'category' (labels) into train/test parts.

    Returns:
        ``[X_train, X_test, y_train, y_test]`` as produced by
        ``train_test_split``.
    """
    texts = data['text']
    labels = data['category']
    # NOTE(review): 80% of the rows go to the test side - confirm intent.
    split_options = {'test_size': 0.8, 'random_state': 42}
    return train_test_split(texts, labels, **split_options)

def train_and_evaluate(X_train, X_test, y_train, y_test):
    """Fit three TF-IDF classifiers; report and plot results for each one.

    For every classifier a TF-IDF pipeline is fitted on the training
    split, a classification report is printed for the test split, and a
    confusion-matrix heatmap titled with the model name is shown.
    """
    classifiers = {
        "Naive Bayes": MultinomialNB(),
        "Random Forest": RandomForestClassifier(random_state=42),
        "SVM": SVC(kernel='linear', random_state=42)
    }

    for name, clf in classifiers.items():
        print(f"\nModel: {name}")
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(max_df=0.75, ngram_range=(1, 2))),
            ('clf', clf)
        ])

        pipeline.fit(X_train, y_train)
        predictions = pipeline.predict(X_test)

        print("\nClassification Report:\n")
        print(classification_report(y_test, predictions))

        # Confusion matrix for this model, using the class order the
        # fitted classifier exposes so rows and columns line up.
        labels = list(pipeline.classes_)
        cm = confusion_matrix(y_test, predictions, labels=labels)
        cm_df = pd.DataFrame(cm, index=labels, columns=labels)

        plt.figure(figsize=(12, 10))
        sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues', cbar=False, linewidths=0.5)

        plt.title(f"Confusion Matrix - {name}", fontsize=16)
        plt.xlabel("Tahmin Edilen Etiket", fontsize=12)
        plt.ylabel("Gerçek Etiket", fontsize=12)

        # Heatmap cells are one unit wide, so +0.5 centres each tick label.
        tick_positions = [i + 0.5 for i in range(len(labels))]
        plt.xticks(ticks=tick_positions, labels=labels, rotation=45, ha='right')
        plt.yticks(ticks=tick_positions, labels=labels, rotation=0)

        plt.tight_layout()
        plt.show()

def main():
    """Load the dataset, prepare it, and run the TF-IDF experiments.

    Does nothing when the dataset cannot be loaded.
    """
    data = download_dataset()
    if data is None:
        return

    # Incomplete rows are removed first so that clean_text is only applied
    # to real strings; cleaning first fails on empty (NaN) text cells.
    data = fill_missing_values(data)
    data['text'] = data['text'].apply(clean_text)
    X_train, X_test, y_train, y_test = split_data(data)
    train_and_evaluate(X_train, X_test, y_train, y_test)

if __name__ == "__main__":
    main()



Model: Naive Bayes

Classification Report:

                                            precision    recall  f1-score   support

                  Astronomy & Astrophysics       0.87      0.93      0.90      8550
                                  Biology        0.67      0.43      0.53      7199
                                Chemistry        0.90      0.74      0.81      8177
Computer Science, Artificial Intelligence        0.76      0.93      0.84      8623
                      Economics & Business       0.95      0.85      0.90      7440
                     Environmental Science       0.84      0.70      0.76      8476
                                 Geography       0.63      0.92      0.75      9347
       Materials Science, Coatings & Films       0.74      0.98      0.84      8285
         Medicine, Research & Experimental       0.80      0.61      0.69      7121
                            Neurosciences        0.74      0.79      0.77      7327
                              

KeyboardInterrupt: 

In [None]:
pip install gensim


In [None]:
# Word2Vec + Hem Random Forest hem de SVM ile Eğitim
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import gensim.downloader as api

def download_dataset():
    """Read the dataset spreadsheet from its hard-coded Drive path.

    Returns:
        The loaded DataFrame, or ``None`` if ``pd.read_excel`` raised;
        in that case the error is printed.
    """
    file_path = '/content/drive/MyDrive/Text Classification Tez Çalışması/inputs for final task/bbc_data_Format_151K.xlsx'
    try:
        table = pd.read_excel(file_path)
    except Exception as e:
        # Any failure is reported and turned into a None result.
        print(f"Veri seti yüklenirken hata oluştu: {e}")
        table = None
    return table

def clean_text(text):
    """Remove punctuation, squeeze whitespace, and lowercase ``text``.

    Args:
        text: The raw document. Non-string input (for example NaN from an
            empty spreadsheet cell) is handed back unchanged.

    Returns:
        The normalized string, or the original value for non-strings.
    """
    # re.sub raises TypeError on non-strings; let missing cells through so
    # the row can be dropped afterwards.
    if not isinstance(text, str):
        return text
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.lower()

def fill_missing_values(data):
    """Discard rows containing any missing value.

    The frame is edited in place and returned; nothing is imputed.
    """
    data.dropna(axis='index', inplace=True)
    return data

def split_data(data):
    """Return the train/test split of the 'text' and 'category' columns.

    Returns:
        ``[X_train, X_test, y_train, y_test]`` from ``train_test_split``.
    """
    # NOTE(review): test_size=0.8 leaves only 20% for training - confirm.
    pieces = train_test_split(
        data['text'],
        data['category'],
        test_size=0.8,
        random_state=42,
    )
    return pieces

def get_embedding_vector(texts, model, dim):
    """Turn each text into the mean of its known word vectors.

    Args:
        texts: Iterable of whitespace-tokenizable strings.
        model: Word-vector lookup supporting ``word in model`` and
            ``model[word]`` (presumably a gensim KeyedVectors - confirm).
        dim: Length of the zero vector used when no token is known.

    Returns:
        A NumPy array with one row per text.
    """
    def embed(sentence):
        known = [model[token] for token in sentence.split() if token in model]
        # Texts without any in-vocabulary token fall back to all zeros.
        return np.mean(known, axis=0) if known else np.zeros(dim)

    return np.array([embed(sentence) for sentence in texts])

def plot_confusion_matrix(y_true, y_pred, labels, title):
    """Show a confusion-matrix heatmap for one model.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        labels: Class labels, used for the matrix order and both axes.
        title: Text appended to the plot title.
    """
    counts = pd.DataFrame(
        confusion_matrix(y_true, y_pred, labels=labels),
        index=labels,
        columns=labels,
    )

    plt.figure(figsize=(12, 10))
    heatmap_style = dict(annot=True, fmt='d', cmap='Blues', cbar=False, linewidths=0.5)
    sns.heatmap(counts, **heatmap_style)
    plt.title(f"Confusion Matrix - {title}", fontsize=16)
    plt.xlabel("Tahmin Edilen Etiket", fontsize=12)
    plt.ylabel("Gerçek Etiket", fontsize=12)
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

def train_and_evaluate_word2vec(X_train, X_test, y_train, y_test):
    """Evaluate Random Forest and linear SVM on averaged Word2Vec vectors.

    Loads the "word2vec-google-news-300" vectors through gensim's
    downloader, embeds both splits, then for each classifier prints a
    classification report and shows a confusion matrix.
    """
    print("Word2Vec modeli indiriliyor...")
    embeddings = api.load("word2vec-google-news-300")

    print("Vektörler hazırlanıyor...")
    train_matrix = get_embedding_vector(X_train, embeddings, 300)
    test_matrix = get_embedding_vector(X_test, embeddings, 300)

    candidates = (
        ("Random Forest", RandomForestClassifier(random_state=42)),
        ("SVM (Linear Kernel)", SVC(kernel='linear', random_state=42)),
    )

    for label, estimator in candidates:
        print(f"\nModel: {label}")
        estimator.fit(train_matrix, y_train)
        predicted = estimator.predict(test_matrix)

        print("\nClassification Report:\n")
        print(classification_report(y_test, predicted))

        # Use the estimator's own class order when it exposes one.
        if hasattr(estimator, 'classes_'):
            class_labels = estimator.classes_
        else:
            class_labels = np.unique(y_test)
        plot_confusion_matrix(y_test, predicted, class_labels, title=label)

def main():
    """Load the dataset, prepare it, and run the Word2Vec experiments.

    Returns early when the dataset cannot be loaded.
    """
    data = download_dataset()
    if data is None:
        return

    # Remove incomplete rows before cleaning so clean_text only receives
    # real strings; cleaning first fails on empty (NaN) text cells.
    data = fill_missing_values(data)
    data['text'] = data['text'].apply(clean_text)
    X_train, X_test, y_train, y_test = split_data(data)
    train_and_evaluate_word2vec(X_train, X_test, y_train, y_test)

if __name__ == "__main__":
    main()



In [None]:
#GloVe ile tüm modeller aynı anda naive svm rf
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import gensim.downloader as api

def download_dataset():
    """Load the dataset spreadsheet; print the error and return ``None`` on failure."""
    file_path = '/content/drive/MyDrive/Text Classification Tez Çalışması/inputs for final task/bbc_data_Format_151K.xlsx'
    try:
        dataset = pd.read_excel(file_path)
    except Exception as e:
        print(f"Veri seti yüklenirken hata oluştu: {e}")
        return None
    else:
        # Reached only when the read succeeded.
        return dataset

def clean_text(text):
    """Normalize one document for embedding lookup.

    Punctuation is removed, whitespace runs become a single space, and
    the result is lowercased.

    Args:
        text: The raw document. Values that are not ``str`` (such as NaN
            from an empty cell) are returned unchanged.

    Returns:
        The normalized string, or the original value for non-strings.
    """
    # Passing non-strings through avoids a TypeError from re.sub and
    # leaves missing cells in place for a later dropna.
    if not isinstance(text, str):
        return text
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.lower()

def fill_missing_values(data):
    """Drop, in place, every row that has a missing value; return ``data``.

    The name notwithstanding, no value is filled in.
    """
    data.dropna(axis=0, how='any', inplace=True)
    return data

def split_data(data):
    """Split the 'text' and 'category' columns into train and test parts.

    Returns:
        ``[X_train, X_test, y_train, y_test]`` from ``train_test_split``.
    """
    documents, categories = data['text'], data['category']
    # NOTE(review): 0.8 is the *test* share here; confirm that training on
    # the remaining 20% is intended.
    return train_test_split(documents, categories, test_size=0.8, random_state=42)

def get_embedding_vector(texts, model, dim):
    """Average the known word vectors of each text.

    Args:
        texts: Iterable of strings; each is split on whitespace.
        model: Lookup supporting ``word in model`` and ``model[word]``.
        dim: Size of the zero vector substituted when no word is known.

    Returns:
        A NumPy array holding one vector per text.
    """
    rows = []
    for document in texts:
        found = [model[word] for word in document.split() if word in model]
        if not found:
            # Nothing in vocabulary: represent the text with zeros.
            rows.append(np.zeros(dim))
            continue
        rows.append(np.mean(found, axis=0))
    return np.array(rows)

def plot_confusion_matrix(y_true, y_pred, labels, title):
    """Draw the confusion matrix of ``y_pred`` against ``y_true``.

    ``labels`` fixes the class order of rows and columns; ``title`` is
    appended to the figure title.
    """
    matrix = confusion_matrix(y_true, y_pred, labels=labels)
    table = pd.DataFrame(matrix, index=labels, columns=labels)

    plt.figure(figsize=(12, 10))
    sns.heatmap(
        table,
        annot=True,
        fmt='d',
        cmap='Blues',
        cbar=False,
        linewidths=0.5,
    )
    plt.title(f"Confusion Matrix - {title}", fontsize=16)
    # The x axis shows predictions and the y axis the true labels.
    plt.xlabel("Tahmin Edilen Etiket", fontsize=12)
    plt.ylabel("Gerçek Etiket", fontsize=12)
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

def train_and_evaluate_glove(X_train, X_test, y_train, y_test):
    """Evaluate GaussianNB, Random Forest and linear SVM on GloVe features.

    Loads the 100-dimensional "glove-wiki-gigaword-100" vectors, embeds
    both splits by averaging word vectors, then reports and plots the
    results of each classifier.
    """
    print("GloVe modeli indiriliyor...")
    glove = api.load("glove-wiki-gigaword-100")

    print("Vektörler hazırlanıyor...")
    train_matrix = get_embedding_vector(X_train, glove, 100)
    test_matrix = get_embedding_vector(X_test, glove, 100)

    candidates = [
        ("Naive Bayes (GaussianNB)", GaussianNB()),
        ("Random Forest", RandomForestClassifier(random_state=42)),
        ("SVM (Linear Kernel)", SVC(kernel='linear', random_state=42)),
    ]

    for label, estimator in candidates:
        print(f"\n==============================")
        print(f"Model: {label}")
        estimator.fit(train_matrix, y_train)
        predicted = estimator.predict(test_matrix)

        print("\nClassification Report:\n")
        print(classification_report(y_test, predicted))
        # Axis labels are the distinct classes present in the test split.
        present_classes = np.unique(y_test)
        plot_confusion_matrix(y_test, predicted, present_classes, title=label)

def main():
    """Load the dataset, prepare it, and run the GloVe experiments.

    Returns early when the dataset cannot be loaded.
    """
    data = download_dataset()
    if data is None:
        return

    # Incomplete rows go first so clean_text is only applied to strings;
    # cleaning first fails on empty (NaN) text cells.
    data = fill_missing_values(data)
    data['text'] = data['text'].apply(clean_text)
    X_train, X_test, y_train, y_test = split_data(data)
    train_and_evaluate_glove(X_train, X_test, y_train, y_test)

if __name__ == "__main__":
    main()


In [None]:
#FastText (300D) + Naive Bayes + Random Forest + SVM – Tüm Çıktılar

import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import gensim.downloader as api

def download_dataset():
    """Read the dataset spreadsheet from the hard-coded path.

    Returns:
        The DataFrame on success; ``None`` (after printing the error)
        when the read raises.
    """
    file_path = '/content/drive/MyDrive/Text Classification Tez Çalışması/inputs for final task/bbc_data_Format_151K.xlsx'
    outcome = None
    try:
        outcome = pd.read_excel(file_path)
    except Exception as e:
        # Deliberately broad: any load problem is reported, not raised.
        print(f"Veri seti yüklenirken hata oluştu: {e}")
    return outcome

def clean_text(text):
    """Strip punctuation, collapse whitespace, and lowercase ``text``.

    Args:
        text: The raw document. Non-string values (for example NaN for an
            empty cell) are returned untouched.

    Returns:
        The cleaned string, or the input itself when it is not a ``str``.
    """
    # Guard against missing cells: re.sub would raise TypeError on them,
    # while passing them through lets a later dropna remove the row.
    if not isinstance(text, str):
        return text
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.lower()

def fill_missing_values(data):
    """Remove incomplete rows from ``data`` in place and return it.

    A row is removed when any of its values is missing; nothing is filled.
    """
    data.dropna(how='any', inplace=True)
    return data

def split_data(data):
    """Split 'text' and 'category' into train and test portions.

    Returns:
        ``[X_train, X_test, y_train, y_test]`` from ``train_test_split``.
    """
    inputs = data['text']
    outputs = data['category']
    # NOTE(review): test_size=0.8 keeps only 20% for training - confirm.
    return train_test_split(inputs, outputs, random_state=42, test_size=0.8)

def get_embedding_vector(texts, model, dim):
    """Build one feature vector per text by averaging its word vectors.

    Args:
        texts: Iterable of strings, tokenized with ``str.split``.
        model: Lookup supporting ``word in model`` and ``model[word]``.
        dim: Length of the zero vector used for texts with no known word.

    Returns:
        A NumPy array with one entry per text.
    """
    collected = []
    for sentence in texts:
        hits = [model[tok] for tok in sentence.split() if tok in model]
        # Mean of the hits, or zeros when every token is out of vocabulary.
        collected.append(np.mean(hits, axis=0) if hits else np.zeros(dim))
    return np.array(collected)

def plot_confusion_matrix(y_true, y_pred, labels, title):
    """Render a labelled confusion-matrix heatmap and display it.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        labels: Class order for the matrix rows and columns.
        title: Suffix for the "Confusion Matrix - ..." figure title.
    """
    frame = pd.DataFrame(
        confusion_matrix(y_true, y_pred, labels=labels),
        columns=labels,
        index=labels,
    )

    plt.figure(figsize=(12, 10))
    sns.heatmap(frame, annot=True, fmt='d', cmap='Blues', cbar=False, linewidths=0.5)
    plt.title(f"Confusion Matrix - {title}", fontsize=16)
    for set_axis_label, caption in ((plt.xlabel, "Tahmin Edilen Etiket"), (plt.ylabel, "Gerçek Etiket")):
        set_axis_label(caption, fontsize=12)
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

def train_and_evaluate_fasttext(X_train, X_test, y_train, y_test):
    """Evaluate GaussianNB, Random Forest and linear SVM on FastText features.

    Loads the "fasttext-wiki-news-subwords-300" vectors, embeds both
    splits by averaging word vectors, then prints a classification report
    and shows a confusion matrix for each classifier.
    """
    print("FastText modeli yükleniyor...")
    fasttext_vectors = api.load("fasttext-wiki-news-subwords-300")

    print("Vektörler oluşturuluyor...")
    train_matrix = get_embedding_vector(X_train, fasttext_vectors, 300)
    test_matrix = get_embedding_vector(X_test, fasttext_vectors, 300)

    candidates = (
        ("Naive Bayes (GaussianNB)", GaussianNB()),
        ("Random Forest", RandomForestClassifier(random_state=42)),
        ("SVM (Linear Kernel)", SVC(kernel='linear', random_state=42)),
    )

    for label, estimator in candidates:
        print(f"\nModel: {label}")
        estimator.fit(train_matrix, y_train)
        predicted = estimator.predict(test_matrix)

        print("\nClassification Report:\n")
        print(classification_report(y_test, predicted))
        # Distinct classes present in the test split define the axes.
        plot_confusion_matrix(y_test, predicted, np.unique(y_test), title=label)

def main():
    """Load the dataset, prepare it, and run the FastText experiments.

    Returns early when the dataset cannot be loaded.
    """
    data = download_dataset()
    if data is None:
        return

    # Drop incomplete rows before cleaning so clean_text only handles
    # strings; cleaning first fails on empty (NaN) text cells.
    data = fill_missing_values(data)
    data['text'] = data['text'].apply(clean_text)
    X_train, X_test, y_train, y_test = split_data(data)
    train_and_evaluate_fasttext(X_train, X_test, y_train, y_test)

if __name__ == "__main__":
    main()


In [None]:
import tensorflow as tf
from transformers import TFBertForSequenceClassification, BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
import re

# Adım 1: Veriyi temizleyen fonksiyon
def clean_text(text):
    """Return ``text`` without punctuation, with single spaces, lowercased."""
    without_punctuation = re.sub(r'[^\w\s]', '', text)  # remove punctuation marks
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)  # collapse whitespace runs
    return single_spaced.lower()  # convert to lowercase

# Adım 2: Veriyi bölme
def split_data(data):
    """Split the 'text' and 'category' columns into train and test parts.

    Returns:
        ``[X_train, X_test, y_train, y_test]`` from ``train_test_split``.
    """
    # NOTE(review): test_size=0.8 puts 80% of the rows in the test part.
    split_kwargs = dict(test_size=0.8, random_state=42)
    return train_test_split(data['text'], data['category'], **split_kwargs)

# Adım 3: BERT modelini oluşturma
def create_bert_model(num_labels):
    """Load pretrained "bert-base-uncased" with a ``num_labels``-way classifier.

    Args:
        num_labels: Number of output classes.

    Returns:
        The ``TFBertForSequenceClassification`` instance.
    """
    checkpoint = "bert-base-uncased"
    return TFBertForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)

# Learning Rate Scheduler
def lr_scheduler():
    """Build the callback that halves the learning rate when val_loss stalls.

    Returns:
        A ``ReduceLROnPlateau`` callback monitoring 'val_loss' with
        factor 0.5, patience 3 and a floor of 1e-7.
    """
    plateau_settings = {
        'monitor': 'val_loss',
        'factor': 0.5,
        'patience': 3,
        'min_lr': 1e-7,
        'verbose': 1,
    }
    return tf.keras.callbacks.ReduceLROnPlateau(**plateau_settings)

# Adım 4: BERT modelini eğitme
def train_bert_model(model, train_data, val_data, epochs, learning_rate):
    """Compile ``model`` and fit it with LR scheduling and early stopping.

    Args:
        model: The Keras model to train.
        train_data: Training data passed to ``model.fit``.
        val_data: Validation data passed to ``model.fit``.
        epochs: Maximum number of epochs.
        learning_rate: Initial Adam learning rate.

    Returns:
        The same ``model`` object after fitting.
    """
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        # The loss is told to expect raw logits, not probabilities.
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=["accuracy"],
    )

    # NOTE(review): early stopping waits 2 stagnant epochs while the
    # scheduler from lr_scheduler() waits 3 - confirm the scheduler is
    # still meant to be able to fire before training stops.
    stop_early = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=2,
        restore_best_weights=True,
        verbose=1,
    )
    training_callbacks = [lr_scheduler(), stop_early]

    model.fit(
        train_data,
        validation_data=val_data,
        epochs=epochs,
        callbacks=training_callbacks,
        verbose=1,
    )
    return model

# Adım 5: Modeli değerlendirme ve sonuçları raporlama
def evaluate_model(model, val_data, val_labels_encoded, label_encoder):
    """Predict on ``val_data`` and print a report plus per-class counts.

    Args:
        model: Trained model whose ``predict`` output exposes "logits".
        val_data: Inputs to predict on.
        val_labels_encoded: Integer-encoded true labels for ``val_data``.
        label_encoder: Fitted encoder used to map integers back to names.
    """
    predictions = model.predict(val_data)["logits"]
    predicted_labels_encoded = np.argmax(predictions, axis=1)

    decoded_labels = label_encoder.inverse_transform(val_labels_encoded)
    decoded_predictions = label_encoder.inverse_transform(predicted_labels_encoded)

    # Pass labels explicitly: target_names alone must match the number of
    # classes actually present, so the report would raise if any class were
    # missing from both the true and predicted labels.
    class_names = label_encoder.classes_
    report = classification_report(
        decoded_labels,
        decoded_predictions,
        labels=class_names,
        target_names=class_names,
    )
    print("Sınıflandırma Raporu (BERT):")
    print(report)

    # Print the number of evaluated samples per class.
    print("\nSınıf Başına Düşen Eleman Sayısı (Test Seti):")
    unique_labels, counts = np.unique(decoded_labels, return_counts=True)
    for label, count in zip(unique_labels, counts):
        print(f"{label:<40} {count}")

# Ana program
if __name__ == "__main__":
    # Load the dataset from the thesis folder used by the other cells.
    file_path = '/content/drive/MyDrive/Text Classification Tez Çalışması/inputs for final task/bbc_data_Format_151K.xlsx'
    data = pd.read_excel(file_path)

    # Discard rows lacking text or a label: clean_text needs strings and
    # the label encoder needs real category values.
    data.dropna(subset=['text', 'category'], inplace=True)

    # Clean the text column.
    data['text'] = data['text'].apply(clean_text)

    # Encode category names as integers.
    label_encoder = LabelEncoder()
    data['category_encoded'] = label_encoder.fit_transform(data['category'])

    # Split texts, encoded labels and raw labels with one shared shuffle.
    # NOTE(review): test_size=0.8 trains on only 20% of the rows - confirm.
    X_train, X_test, y_train_encoded, y_test_encoded, y_train_labels, y_test_labels = train_test_split(
        data['text'], data['category_encoded'], data['category'], test_size=0.8, random_state=42
    )

    # Tokenizer and maximum sequence length.
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    max_length = 128

    train_encodings = tokenizer(list(X_train), truncation=True, padding='max_length', max_length=max_length, return_tensors="tf")
    test_encodings = tokenizer(list(X_test), truncation=True, padding='max_length', max_length=max_length, return_tensors="tf")

    # Build batched TF datasets of (inputs, label) pairs.
    train_dataset = tf.data.Dataset.from_tensor_slices((
        {"input_ids": train_encodings["input_ids"], "attention_mask": train_encodings["attention_mask"]},
        y_train_encoded
    )).batch(64)

    test_dataset = tf.data.Dataset.from_tensor_slices((
        {"input_ids": test_encodings["input_ids"], "attention_mask": test_encodings["attention_mask"]},
        y_test_encoded
    )).batch(64)

    # Create the model.
    bert_model = create_bert_model(num_labels=len(label_encoder.classes_))

    # Start training.
    # NOTE(review): the test split doubles as validation data, so the
    # callbacks that watch val_loss are tuned on the data that is reported
    # on below - confirm this is acceptable.
    trained_model = train_bert_model(
        model=bert_model,
        train_data=train_dataset,
        val_data=test_dataset,
        epochs=5,
        learning_rate=2e-5
    )

    # Evaluate on the test encodings.
    evaluate_model(
        trained_model,
        {"input_ids": test_encodings["input_ids"], "attention_mask": test_encodings["attention_mask"]},
        y_test_encoded,
        label_encoder
    )


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/bbc_data_Format_151K (1).xlsx'

In [None]:
import tensorflow as tf
from transformers import TFBertForSequenceClassification, BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
import re

# Adım 1: Veriyi temizleyen fonksiyon
def clean_text(text):
    """Remove punctuation, squeeze whitespace to single spaces, lowercase."""
    substitutions = (
        (r'[^\w\s]', ''),  # remove punctuation marks
        (r'\s+', ' '),     # replace whitespace runs with one space
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.lower()  # convert to lowercase

# Adım 2: Veriyi bölme
def split_data(data):
    """Return the train/test split of the 'text' and 'category' columns.

    Returns:
        ``[X_train, X_test, y_train, y_test]`` from ``train_test_split``.
    """
    text_column = data['text']
    category_column = data['category']
    # NOTE(review): 0.8 is the test fraction - confirm it is intended.
    return train_test_split(
        text_column,
        category_column,
        test_size=0.8,
        random_state=42,
    )

# Adım 3: BERT modelini oluşturma
def create_bert_model(num_labels):
    """Instantiate pretrained "bert-base-uncased" for sequence classification.

    Args:
        num_labels: Number of classes the classifier should output.

    Returns:
        The loaded ``TFBertForSequenceClassification`` model.
    """
    load_options = {'num_labels': num_labels}
    bert = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", **load_options)
    return bert

# Learning Rate Scheduler
def lr_scheduler():
    """Create a ``ReduceLROnPlateau`` callback watching 'val_loss'.

    The callback is configured with factor 0.5, patience 3, a minimum
    learning rate of 1e-7 and verbose output.
    """
    reduce_on_plateau = tf.keras.callbacks.ReduceLROnPlateau
    callback = reduce_on_plateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-7, verbose=1)
    return callback

# Adım 4: BERT modelini eğitme
def train_bert_model(model, train_data, val_data, epochs, learning_rate):
    """Compile ``model`` with Adam and fit it using the LR scheduler.

    Args:
        model: The Keras model to train.
        train_data: Training data passed to ``model.fit``.
        val_data: Validation data passed to ``model.fit``.
        epochs: Number of epochs to run.
        learning_rate: Initial Adam learning rate.

    Returns:
        The same ``model`` object after fitting.
    """
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        # The loss is told to expect raw logits, not probabilities.
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=["accuracy"],
    )

    fit_options = {
        'validation_data': val_data,
        'epochs': epochs,
        'callbacks': [lr_scheduler()],
        'verbose': 1,
    }
    model.fit(train_data, **fit_options)
    return model

# Adım 5: Modeli değerlendirme ve sonuçları raporlama
def evaluate_model(model, val_data, val_labels_encoded, label_encoder):
    """Print a classification report and per-class sample counts.

    Args:
        model: Trained model whose ``predict`` output exposes "logits".
        val_data: Inputs to predict on.
        val_labels_encoded: Integer-encoded true labels for ``val_data``.
        label_encoder: Fitted encoder used to recover the class names.
    """
    predictions = model.predict(val_data)["logits"]
    predicted_labels_encoded = np.argmax(predictions, axis=1)

    decoded_labels = label_encoder.inverse_transform(val_labels_encoded)
    decoded_predictions = label_encoder.inverse_transform(predicted_labels_encoded)

    # target_names on its own has to match the number of classes that
    # actually occur; supplying labels as well keeps the report valid when
    # a class is absent from both the true and predicted labels.
    class_names = label_encoder.classes_
    report = classification_report(
        decoded_labels,
        decoded_predictions,
        labels=class_names,
        target_names=class_names,
    )
    print("Sınıflandırma Raporu (BERT):")
    print(report)

    # Print how many evaluated samples belong to each class.
    print("\nSınıf Başına Düşen Eleman Sayısı (Test Seti):")
    unique_labels, counts = np.unique(decoded_labels, return_counts=True)
    for label, count in zip(unique_labels, counts):
        print(f"{label:<40} {count}")

# Ana program
if __name__ == "__main__":
    # Load the dataset (an Excel file); adjust the path to your environment.
    file_path = '/content/drive/MyDrive/Text Classification Tez Çalışması/inputs for final task/bbc_data_Format_151K.xlsx'
    data = pd.read_excel(file_path)

    # Discard rows lacking text or a label: clean_text needs strings and
    # the label encoder needs real category values.
    data.dropna(subset=['text', 'category'], inplace=True)

    # Clean the text column.
    data['text'] = data['text'].apply(clean_text)

    # Encode category names as integers.
    label_encoder = LabelEncoder()
    data['category_encoded'] = label_encoder.fit_transform(data['category'])

    # Split texts, encoded labels and raw labels with one shared shuffle.
    # NOTE(review): test_size=0.8 trains on only 20% of the rows - confirm.
    X_train, X_test, y_train_encoded, y_test_encoded, y_train_labels, y_test_labels = train_test_split(
        data['text'], data['category_encoded'], data['category'], test_size=0.8, random_state=42
    )

    # Tokenize with the BERT tokenizer; a shorter max_length keeps runs fast.
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    max_length = 64

    train_encodings = tokenizer(list(X_train), truncation=True, padding='max_length', max_length=max_length, return_tensors="tf")
    test_encodings = tokenizer(list(X_test), truncation=True, padding='max_length', max_length=max_length, return_tensors="tf")

    # Build batched TF datasets of (inputs, label) pairs.
    train_dataset = tf.data.Dataset.from_tensor_slices((
        {"input_ids": train_encodings["input_ids"], "attention_mask": train_encodings["attention_mask"]},
        y_train_encoded
    )).batch(32)

    test_dataset = tf.data.Dataset.from_tensor_slices((
        {"input_ids": test_encodings["input_ids"], "attention_mask": test_encodings["attention_mask"]},
        y_test_encoded
    )).batch(32)

    # Create the BERT model.
    bert_model = create_bert_model(num_labels=len(label_encoder.classes_))

    # Train the model; few epochs keep the run short.
    # NOTE(review): the test split is also used as validation data here -
    # confirm this is acceptable for the reported metrics.
    trained_model = train_bert_model(
        model=bert_model,
        train_data=train_dataset,
        val_data=test_dataset,
        epochs=3,
        learning_rate=2e-5
    )

    # Evaluate the model on the test encodings.
    evaluate_model(
        trained_model,
        {"input_ids": test_encodings["input_ids"], "attention_mask": test_encodings["attention_mask"]},
        y_test_encoded,
        label_encoder
    )