In [10]:
!pip install rarfile
!apt-get install unrar

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
unrar is already the newest version (1:6.1.5-1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [3]:
import rarfile
import os
import glob

# Path of uploaded file
rar_path = '/content/finalDataset.rar'

# Extract folder path
extracted_folder = '/content/finalDataset'

# Open and extract the file
with rarfile.RarFile(rar_path) as rf:
    rf.extractall(extracted_folder)

print("The file was extracted successfully.")

The file was extracted successfully.


In [4]:
import os
import glob

# Extracted folder path
data_path = '/content/finalDataset/makaleler-yazarlar'

# Class mapping consisting of authors
authors = [
    "AHMET ÇAKAR", "ALİ SİRMEN", "ATAOL BEHRAMOĞLU", "ATİLLA DORSAY", "AYKAN SEVER",
    "AZİZ ÜSTEL", "CAN ATAKLI", "DENİZ GÖKÇE", "EMRE KONGAR", "GÖZDE BEDELOĞLU",
    "HASAN PULUR", "HİKMET ÇETİNKAYA", "MEHMET ALİ BİRAND", "MEHMET DEMİRKOL",
    "MELTEM GÜRLE", "MERYEM KORAY", "MÜMTAZ SOYSAL", "NAZAN BEKİROĞLU",
    "NAZIM ALPMAN", "NEDİM HAZAR", "NEŞE YAŞIN", "OKAY KARACAN",
    "ÖZGE BAŞAK TANELİ", "REHA MUHTAR", "RIDVAN DİLMEN", "RUHAT MENGİ",
    "SELİM İLERİ", "TARHAN ERDEM", "UFUK BOZKIR", "YAŞAR SEYMAN"
]

# Function to sort according to Turkish alphabet
def turkish_sort(text):
    turkish_alphabet = 'AÂBCÇDEFGĞHIİÎJKLMNOÖPRSŞTUÜVYZ'
    return [turkish_alphabet.index(c) for c in text.upper() if c in turkish_alphabet]

# Alphabetically sorted class mapping according to Turkish alphabet
sorted_authors = sorted(authors, key=turkish_sort)

# Lists for storing data
all_articles = []
labels = []

# Reading files from author folders
for author in sorted_authors:
    folder_path = os.path.join(data_path, author)
    if os.path.exists(folder_path):
        files = glob.glob(os.path.join(folder_path, '*.txt'))
        for file in files:
            try:
                # Reading the file with ISO-8859-9
                with open(file, 'r', encoding='ISO-8859-9') as f:
                    article = f.read().strip()
                    all_articles.append(article)
                    labels.append(author)
            except UnicodeDecodeError as e:
                print(f"Error reading file {file}: {e}")
            except Exception as e:
                print(f"Unexpected error with file {file}: {e}")

# Data control
print(f"Total number of articles: {len(all_articles)}")
print(f"Total number of labels: {len(labels)}")
print(f"Sample article: {all_articles[0][:200]}...") # A preview of the first article
print(f"Sample label: {labels[0]}")



Total number of articles: 1500
Total number of labels: 1500
Sample article: İnönü'de iki farklı devre
Beşiktaş, rahat oynadı, çok gol kaçırdı ama Karabük o kadar dağınık ve kötü ki bu maç için fazla bir şey söylemenin de bir anlamı yok. İlk devre maça bakıyoruz 5-0 olurdu. Ek...
Sample label: AHMET ÇAKAR


# Data Prepocessing

In [5]:
!pip install jpype1 #Since zemberek is written in java, we need to install jpeg1
!pip install zemberek-python #zembereki installation

Collecting jpype1
  Downloading jpype1-1.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading jpype1-1.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (493 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/493.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.9/493.9 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jpype1
Successfully installed jpype1-1.5.1
Collecting zemberek-python
  Downloading zemberek_python-0.2.3-py3-none-any.whl.metadata (2.7 kB)
Collecting antlr4-python3-runtime==4.8 (from zemberek-python)
  Downloading antlr4-python3-runtime-4.8.tar.gz (112 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.4/112.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading zemberek_python-0.2.3-py3-none-any.whl (95.1 MB)
[2K   [90m

In [6]:
from zemberek.tokenization import TurkishTokenizer
from zemberek.morphology import TurkishMorphology
import string

# start Zemberek Morphology
morphology = TurkishMorphology.create_with_defaults()
tokenizer = TurkishTokenizer.DEFAULT

# convert lovercase
def to_lowercase(text):
    return text.lower()

# Tokenization
def tokenize_text(text):
    return tokenizer.tokenize(text)

# Clean up punctuation
def remove_punctuation(tokens):
    return [token for token in tokens if token.content not in string.punctuation]

# stemming
def apply_stemming(tokens, morphology):
    stemmed_tokens = []
    for token in tokens:
        if '#' in token.content or any(char in string.punctuation for char in token.content):
            continue

        analysis = morphology.analyze(token.normalized)
        if analysis.analysis_results:  # Add words that can be analyzed
            stemmed_tokens.append(analysis.analysis_results[0].item.root)
    return stemmed_tokens

# Processing on all articles
processed_articles = []
for article in all_articles:
    try:
        # 1. convert lowercase
        lowercased_article = to_lowercase(article)

        # 2. Tokenization
        tokens = tokenize_text(lowercased_article)

        # 3. clear punctuation
        cleaned_tokens = remove_punctuation(tokens)

        # 4. apply stemming
        stemmed_tokens = apply_stemming(cleaned_tokens, morphology)

        # Save the processed data
        processed_articles.append(" ".join(stemmed_tokens))
    except Exception as e:
        print(f"Error processing article: {e}")

# Control of processed data
print(f"Total number of processed articles: {len(processed_articles)}")
print(f"Sample processed article: {processed_articles[0][:200]}...")


INFO:zemberek.morphology.turkish_morphology:TurkishMorphology instance initialized in 18.37812614440918


2025-01-24 11:12:33,831 - zemberek.morphology.turkish_morphology - INFO
Msg: TurkishMorphology instance initialized in 18.37812614440918

Total number of processed articles: 1500
Sample processed article: de iki fark devre beşiktaş rahat oyna çok gol kaçır ama karabük o kadar dağınık ve kötü ki bu maç için fazla bir şey söyle de bir anlam yok devre maç bak ol ekrem hayat bul gol pozisyon bul sağ gel so...


In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Processed articles and labels will be used for TF-IDF

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_articles)

# Convert TF-IDF results to DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Adding labels
tfidf_df['class'] = labels

# Sort the documents and name them as 'doc1', 'doc2', ...
tfidf_df.index = [f"doc{doc_id + 1}" for doc_id in range(len(processed_articles))]

# Save as CSV
output_path = '/content/tfidf_values.csv'
tfidf_df.to_csv(output_path, sep=';', index=True, encoding='utf-8-sig')
print(f"TF-IDF table saved as '{output_path}'.")


INFO:numexpr.utils:NumExpr defaulting to 2 threads.


2025-01-24 11:34:57,673 - numexpr.utils - INFO
Msg: NumExpr defaulting to 2 threads.

TF-IDF table saved as '/content/tfidf_values.csv'.


In [11]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Load data from CSV file
dataset = pd.read_csv('/content/tfidf_values.csv', sep=';', encoding='utf-8-sig')

# Separating TF-IDF features and classes by removing the 'Unnamed: 0' column
X = dataset.drop(columns=['class', 'Unnamed: 0']).to_numpy() # TF-IDF feature matrix
y = dataset['class'].to_numpy()  # class labels

# Sorting and numbering author names in alphabetical order
sorted_authors = sorted(set(y))
label_mapping = {author: f'class{i+1}' for i, author in enumerate(sorted_authors)}
y_mapped = np.array([label_mapping[author] for author in y])

# Convert class labels to numeric values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_mapped)
# Using StratifiedKFold for 5-Fold Cross-Validation
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

accuracy_list = []  # Storing the accuracy results for each fold
precision_list = []
recall_list = []
f1_list = []

final_results = {class_name: {'precision': [], 'recall': [], 'f1-score': []} for class_name in label_mapping.values()}

# 5-fold cross validation
for fold, (train_index, test_index) in enumerate(skf.split(X, y_encoded), 1):
    print(f"\nFold {fold}:")

    # Separating training and test sets
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]

   # Creating and training the SVM model
    classifier = SVC(kernel='linear', random_state=42)  #linear kernel
    classifier.fit(X_train, y_train)

    # Making predictions on the test set
    y_pred = classifier.predict(X_test)

    # Classification report and accuracy score
    print("Classification Report:")
    target_names = [f'class{i+1}' for i in range(len(sorted_authors))]
    report = classification_report(y_test, y_pred, target_names=target_names, output_dict=True)
    print(classification_report(y_test, y_pred, target_names=target_names))

    # Calculate accuracy, precision, recall and F1-score for each fold
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')

   # Add metrics to the list
    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)

# Add metrics for each class to final_results
    for class_name in label_mapping.values():
        final_results[class_name]['precision'].append(report[class_name]['precision'])
        final_results[class_name]['recall'].append(report[class_name]['recall'])
        final_results[class_name]['f1-score'].append(report[class_name]['f1-score'])

# Calculate average metrics for each class
final_metrics = {
    'Class': [],
    'Precision': [],
    'Recall': [],
    'F1-Score': []
}

for class_name, metrics in final_results.items():
    final_metrics['Class'].append(class_name)
    final_metrics['Precision'].append(np.mean(metrics['precision']))
    final_metrics['Recall'].append(np.mean(metrics['recall']))
    final_metrics['F1-Score'].append(np.mean(metrics['f1-score']))

# Calculating overall averages
overall_precision = np.mean(precision_list)
overall_recall = np.mean(recall_list)
overall_f1 = np.mean(f1_list)
overall_accuracy = np.mean(accuracy_list)

# Adding the overall averages to the table
final_metrics['Class'].append('Overall Average')
final_metrics['Precision'].append(overall_precision)
final_metrics['Recall'].append(overall_recall)
final_metrics['F1-Score'].append(overall_f1)

final_results_df = pd.DataFrame(final_metrics)

# Save the results to an Excel file
final_results_df.to_excel('/content/final_results.xlsx', index=False)

print("\nFinal results saved to 'final_results.xlsx'")
print(f"Average Accuracy: {overall_accuracy}")


Fold 1:
Classification Report:
              precision    recall  f1-score   support

      class1       0.83      1.00      0.91        10
      class2       0.77      1.00      0.87        10
      class3       0.91      1.00      0.95        10
      class4       1.00      1.00      1.00        10
      class5       1.00      1.00      1.00        10
      class6       1.00      1.00      1.00        10
      class7       1.00      0.90      0.95        10
      class8       1.00      1.00      1.00        10
      class9       0.90      0.90      0.90        10
     class10       0.91      1.00      0.95        10
     class11       0.82      0.90      0.86        10
     class12       0.73      0.80      0.76        10
     class13       0.88      0.70      0.78        10
     class14       1.00      1.00      1.00        10
     class15       1.00      0.90      0.95        10
     class16       1.00      1.00      1.00        10
     class17       0.89      0.80      0.84      