In [155]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.classify import ClassifierI
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from statistics import mode
from nltk.classify.scikitlearn import SklearnClassifier
import re
from TurkishStemmer import TurkishStemmer
from zeyrek import MorphAnalyzer
import logging
from zemberek import TurkishMorphology, TurkishSentenceNormalizer, TurkishTokenizer

In [156]:
# Load and preprocess data
df = pd.read_excel('formasyon_veri_yanitlari.xlsx')
df = df.drop(df.columns[[0, 1]], axis=1)

# Extract text-label pairs
text_label_list = []
for column in df.columns:
    label = column.split('-')[0]
    for sentence in df[column].dropna():
        text_label_list.append((sentence, label))

text_label_df = pd.DataFrame(text_label_list, columns=['Text', 'Label'])
# shuffle your dataframe in-place and reset the index
text_label_df = text_label_df.sample(frac=1).reset_index(drop=True)

In [157]:
# Initialize Zemberek tools
tokenizer = TurkishTokenizer.DEFAULT
stop_words = set(stopwords.words("turkish"))
stemmer = TurkishStemmer()
morphology = TurkishMorphology.create_with_defaults()
normalizer = TurkishSentenceNormalizer(morphology)

# Text Preprocessing
def preprocess_text(text):
    # ^ Matches the start of the string
    # If the first character of the set is '^', all the characters that are not in the set will be matched
    # \ signals a special sequence
    # [] Used to indicate a set of characters
    # Remove non-alphanumeric
    # remove any new leading and trailing whitespace
    text = re.sub(r"[^\sa-zA-Z0-9ğüşöçıĞÜŞİÖÇ]", "", text)
    text = text.strip()
    
    # normalize
    # attempt to reduce its randomness, bringing it closer to a predefined “standard”
    text = normalizer.normalize(text.lower())
    #print(text)
    
    # tokenize
    # it’s the task of cutting a text into pieces called tokens.
    tokens = tokenizer.tokenize(text)
    words = [
        stemmer.stem(token.content)
        for token in tokens
        if token.content not in stop_words
    ]
    return " ".join(words)

# Apply preprocessing
text_label_df['Processed_Text'] = text_label_df['Text'].apply(preprocess_text)

2025-01-02 20:35:34,932 - zemberek.morphology.turkish_morphology - INFO
Msg: TurkishMorphology instance initialized in 12.686001539230347



In [158]:
all_words = []
for text in text_label_df['Processed_Text']:
    all_words.extend(text.split())

all_words_dist = nltk.FreqDist(all_words)
most_common_words = '\n'.join([f"{word}: {freq}" for word, freq in all_words_dist.most_common(15)])
print(f"Most common 15 words:\n{most_common_words}\n")

Most common 15 words:
formasyon: 133
v: 128
voleybol: 102
üçgen: 91
ok: 78
oluş: 77
şekl: 77
çizg: 73
baş: 62
bir: 53
yap: 49
drönelar: 49
çiz: 46
ters: 29
ol: 28



In [159]:
# find features
word_features = list(all_words_dist.keys())[:400]

def find_features(document):
    words = set(document.split())
    return {word: (word in words) for word in word_features}

# Prepare datasets
featuresets = [
    (find_features(row['Processed_Text']), row['Label'])
    for _, row in text_label_df.iterrows()
]

In [160]:
# datasets
train_size = int(len(featuresets) * 0.8)
training_set = featuresets[:train_size]
testing_set = featuresets[train_size:]

# Classifiers
classifiers = {
    "Naive Bayes": nltk.NaiveBayesClassifier.train(training_set),
    "MultinomialNB": SklearnClassifier(MultinomialNB()).train(training_set),
    "BernoulliNB": SklearnClassifier(BernoulliNB()).train(training_set),
    "SGDClassifier": SklearnClassifier(SGDClassifier()).train(training_set),
    "LinearSVC": SklearnClassifier(LinearSVC(dual=True)).train(training_set),
    "LogisticRegression": SklearnClassifier(LogisticRegression()).train(training_set),
}


# Evaluate classifiers
for name, clf in classifiers.items():
    print(f"{name} Accuracy: %{nltk.classify.accuracy(clf, testing_set) * 100}")

Naive Bayes Accuracy: %92.98245614035088
MultinomialNB Accuracy: %92.98245614035088
BernoulliNB Accuracy: %93.85964912280701
SGDClassifier Accuracy: %95.6140350877193
LinearSVC Accuracy: %97.36842105263158
LogisticRegression Accuracy: %96.49122807017544


In [161]:
# Voting Classifier
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = [clf.classify(features) for clf in self._classifiers]
        return mode(votes)

    def confidence(self, features):
        votes = []
        for classification in self._classifiers:
            vote = classification.classify(features)
            votes.append(vote)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

voted_classifier = VoteClassifier(*classifiers.values())
print(f"Voted Classifier Accuracy: {nltk.classify.accuracy(voted_classifier, testing_set) * 100}%")

ex_data = "zavallı"

print(f"Classification: {voted_classifier.classify(find_features(ex_data))}")
print(f"Confidence %: {voted_classifier.confidence(find_features(ex_data))*100}")

Voted Classifier Accuracy: 95.6140350877193%
Classification: voleybol
Confidence %: 100.0


In [162]:
# Trying lemmatize

from zeyrek import MorphAnalyzer
import nltk
import logging

def run_examples():
    logging.getLogger("zeyrek.rulebasedanalyzer").setLevel(logging.ERROR)
    analyzer = MorphAnalyzer()
    
    with open('text.txt', encoding='utf-8') as text_file:
        text = text_file.read()

    tokens = tokenizer.tokenize(text)
    words = []
    for token in tokens:
        content = token.content
        if content not in stop_words:
            lemmas = analyzer.lemmatize(content)[0][1]
            if lemmas:  # Check if there are any lemmas
                words.append(lemmas[0])  # Append the first lemma
            else:
                words.append(content)  # Fallback to the original word
    print(words)

if __name__ == '__main__':
    run_examples()

['V', 'şek', 'uç', 'üçgen', 'formasyon', 'geçmek', 'Hadi', 'boleybol', 'oynamak', 'Çocuk', 'üçgen', 'formasyon', 'oluşmak', 'gerek', '.', 'voleybol', 'maç', 'yapmak', 'bir', 'nokta', 'bir', 'nokta', 'dümdüz', 'bir', 'çizgi', 'Çek', ',', '...', 'üçgen', 'formasyon', 'oluştururmusun', 'Dronların', 'çizgi', 'üzerinde', 'gitmek', 'istemek']
