In [1]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
import random
import pickle
from nltk.tokenize import word_tokenize
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
import re

from zemberek import (
    TurkishSpellChecker,
    TurkishSentenceNormalizer,
    TurkishSentenceExtractor,
    TurkishMorphology,
    TurkishTokenizer
)

In [2]:
df = pd.read_excel('formasyon_veri_yanitlari.xlsx')
df = df.drop(df.columns[[0,1]], axis=1)

text_label_list = []
sentences = []
for column in df.columns:
    label = column.split('-')[0]
    
    for sentence in df[column]:
        sentences.append(sentence)
        text_label_list.append((sentence, label))   

text_label_df = pd.DataFrame(text_label_list, columns =['Text', 'Label'])
text_label_df

Unnamed: 0,Text,Label
0,Çizgi formasyonunu yap,Çizgi
1,çizgi formasyonu oluştur,Çizgi
2,İHA'ları kullanarak bir çizgi çiz.,Çizgi
3,Bir çizgi çiz,Çizgi
4,Cizgi ciz.,Çizgi
...,...,...
562,valleybol oynayınıverin,voleybol
563,toplanın maç zamanı,voleybol
564,Voleybol oynamanız lazım acil,voleybol
565,Hadi boleybol oynayalım çocuklar,voleybol


In [3]:
random.shuffle(sentences)
morphology = TurkishMorphology.create_with_defaults()
all_words = []
normalizer = TurkishSentenceNormalizer(morphology)


for sentence in sentences:
    sentence = normalizer.normalize(sentence)
    words = nltk.word_tokenize(sentence)
    for word in words:
        if word.isalpha():
            all_words.append(word.lower())

2025-01-02 00:57:20,210 - zemberek.morphology.turkish_morphology - INFO
Msg: TurkishMorphology instance initialized in 9.117685794830322



In [4]:
stop_words = set(stopwords.words("turkish"))
filter_sentence = []

for word in all_words:
    if word not in stop_words:
        filter_sentence.append(word)
        
all_words = filter_sentence

In [5]:
all_words_dist = nltk.FreqDist(all_words)
most_common_words = '\n'.join([f"{word}: {freq}" for word, freq in all_words_dist.most_common(15)])
print(f"Most common 15 words:\n{most_common_words}\n")

print(f"Number of times cizgi is used:{all_words_dist['cizgi']}")
print(f"Number of times çizgi is used:{all_words_dist['çizgi']}")

Most common 15 words:
v: 130
voleybol: 101
üçgen: 89
oluştur: 76
ok: 75
çizgi: 72
başı: 56
bir: 51
formasyonunu: 50
formasyonu: 48
drönelar: 43
yap: 41
çiz: 41
ters: 28
şeklinde: 26

Number of times cizgi is used:0
Number of times çizgi is used:72


In [6]:
word_features = list(all_words_dist.keys())[:200]

def find_features(document):
    words = set(nltk.word_tokenize(document.lower()))
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features


feature_sets = [(find_features(text), label) for (text, label) in text_label_list]

train_size = int(0.8 * len(feature_sets))  # 80% 
training_set = feature_sets[:train_size]
testing_set = feature_sets[train_size:]


In [7]:
NBclassifier = nltk.NaiveBayesClassifier.train(training_set)

print("Naive Bayes Algorithm accuracy percent:", (nltk.classify.accuracy(NBclassifier, testing_set))*100)
NBclassifier.show_most_informative_features(15)

Naive Bayes Algorithm accuracy percent: 75.43859649122807
Most Informative Features
                      ok = True           Ok baş : V      =     48.3 : 1.0
                   üçgen = True            Üçgen : V      =     32.6 : 1.0
                       v = False           Çizgi : V      =     14.5 : 1.0
            formasyonuna = True           voleyb : Üçgen  =     11.6 : 1.0
                     geç = True           voleyb : Üçgen  =      8.3 : 1.0
                     bir = True            Çizgi : V      =      8.2 : 1.0
                     yap = True           voleyb : Çizgi  =      5.0 : 1.0
                   ettir = True           voleyb : Çizgi  =      5.0 : 1.0
            gerçekleştir = True           voleyb : Ok baş =      5.0 : 1.0
                   geçiş = True           voleyb : Ok baş =      5.0 : 1.0
                 şekline = True           voleyb : V      =      5.0 : 1.0
                voleybol = False          Ok baş : voleyb =      4.9 : 1.0
                

In [8]:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB Classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

MNB Classifier accuracy percent: 71.05263157894737


In [9]:
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB Classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

BernoulliNB Classifier accuracy percent: 15.789473684210526


In [10]:
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)

SGDClassifier_classifier accuracy percent: 75.43859649122807


In [115]:
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

LogisticRegression_classifier accuracy percent: 69.2982456140351


In [116]:
LinearSVC_classifier = SklearnClassifier(LinearSVC(dual=True))
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

LinearSVC_classifier accuracy percent: 74.56140350877193


In [117]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for classification in self._classifiers:
            vote = classification.classify(features)
            votes.append(vote)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for classification in self._classifiers:
            vote = classification.classify(features)
            votes.append(vote)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

In [118]:
voted_classifier = VoteClassifier(NBclassifier,MNB_classifier,BernoulliNB_classifier,LogisticRegression_classifier,SGDClassifier_classifier,LinearSVC_classifier)
print("Voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)

ex_data = "üçgen"

print(f"Classification: {voted_classifier.classify(find_features(ex_data))}")
print(f"Confidence %: {voted_classifier.confidence(find_features(ex_data))*100}")


Voted_classifier accuracy percent: 73.68421052631578
Classification: Üçgen
Confidence %: 100.0
