In [54]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
import random
import pickle
from nltk.tokenize import word_tokenize
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
import re
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

from TurkishStemmer import TurkishStemmer

from zemberek import (
    TurkishSpellChecker,
    TurkishSentenceNormalizer,
    TurkishSentenceExtractor,
    TurkishMorphology,
    TurkishTokenizer
)

In [67]:
df = pd.read_excel('formasyon_veri_yanitlari.xlsx')
df = df.drop(df.columns[[0,1]], axis=1)

text_label_list = []
sentences = []
for column in df.columns:
    label = column.split('-')[0]
    
    for sentence in df[column]:
        sentences.append(sentence)
        text_label_list.append((sentence, label))   

text_label_df = pd.DataFrame(text_label_list, columns =['Text', 'Label'])
text_label_df = text_label_df.sample(frac=1).reset_index(drop=True)
text_label_df

Unnamed: 0,Text,Label
0,voleybol oyna,voleybol
1,Voleybol yapsana,voleybol
2,çizgi şeklini meydana getir,Çizgi
3,Voleybol oynar gibi hareket et,voleybol
4,sürü gösterisine başla,voleybol
...,...,...
562,Voleybol oyna,voleybol
563,Ok basi gibi v olustur,Ok başı
564,Dronelar aynı doğru üzerine sıralansınlar,Çizgi
565,V çiz,V


In [56]:
random.shuffle(sentences)

# Morphology also analyzes how words behave as parts of speech, 
# and how they may be inflected to express grammatical categories including number, tense, and aspect
morphology = TurkishMorphology.create_with_defaults() 
all_words = []

# attempt to reduce its randomness, bringing it closer to a predefined “standard”
normalizer = TurkishSentenceNormalizer(morphology) 

# it’s the task of cutting a text into pieces called tokens.
tokenizer = TurkishTokenizer.DEFAULT

# Stemming is the process of reducing the words to their word stem or root form
stemmer = TurkishStemmer()
stop_words = set(stopwords.words("turkish"))

for sentence in sentences:
    sentence = normalizer.normalize(sentence)
    
    tokens = tokenizer.tokenize(sentence)
    words = [
        stemmer.stem(token.content)
        for token in tokens
        if token.content not in stop_words
    ]
    
    for word in words:
        if word.isalpha():
            all_words.append(word.lower())
            

2025-01-02 17:49:37,236 - zemberek.morphology.turkish_morphology - INFO
Msg: TurkishMorphology instance initialized in 23.174827575683594



In [57]:
all_words_dist = nltk.FreqDist(all_words)
most_common_words = '\n'.join([f"{word}: {freq}" for word, freq in all_words_dist.most_common(15)])
print(f"Most common 15 words:\n{most_common_words}\n")

Most common 15 words:
formasyon: 133
v: 130
voleybol: 102
üçgen: 91
ok: 78
oluş: 77
şekl: 77
çizg: 73
baş: 62
bir: 53
yap: 49
drönelar: 49
çiz: 46
ters: 29
ol: 28



In [58]:
word_features = list(all_words_dist.keys())[:300]

def find_features(document):
    words = set(nltk.word_tokenize(document.lower()))
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features


feature_sets = [(find_features(text), label) for (text, label) in text_label_list]

train_size = int(0.8 * len(feature_sets))  # 80% 
training_set = feature_sets[:train_size]
testing_set = feature_sets[train_size:]


In [59]:
NBclassifier = nltk.NaiveBayesClassifier.train(training_set)

print("Naive Bayes Algorithm accuracy percent:", (nltk.classify.accuracy(NBclassifier, testing_set))*100)
NBclassifier.show_most_informative_features(15)

Naive Bayes Algorithm accuracy percent: 75.43859649122807
Most Informative Features
                      ok = True           Ok baş : V      =     48.3 : 1.0
                   üçgen = True            Üçgen : V      =     32.6 : 1.0
                       v = False           Çizgi : V      =     14.5 : 1.0
                   şekil = True            Üçgen : Ok baş =      9.0 : 1.0
                     geç = True           voleyb : Üçgen  =      8.3 : 1.0
                     bir = True            Çizgi : V      =      8.2 : 1.0
                    olan = True            Üçgen : Çizgi  =      5.0 : 1.0
                     yap = True           voleyb : Çizgi  =      5.0 : 1.0
                      et = True           voleyb : Çizgi  =      5.0 : 1.0
                   geçiş = True           voleyb : Ok baş =      5.0 : 1.0
              kullanarak = True           voleyb : Üçgen  =      5.0 : 1.0
                     var = True           voleyb : Üçgen  =      5.0 : 1.0
                

In [53]:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB Classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

MNB Classifier accuracy percent: 71.05263157894737


In [60]:
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB Classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

BernoulliNB Classifier accuracy percent: 0.0


In [61]:
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)

SGDClassifier_classifier accuracy percent: 74.56140350877193


In [62]:
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

LogisticRegression_classifier accuracy percent: 69.2982456140351


In [63]:
LinearSVC_classifier = SklearnClassifier(LinearSVC(dual=True))
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

LinearSVC_classifier accuracy percent: 69.2982456140351


In [64]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for classification in self._classifiers:
            vote = classification.classify(features)
            votes.append(vote)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for classification in self._classifiers:
            vote = classification.classify(features)
            votes.append(vote)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

In [65]:
voted_classifier = VoteClassifier(NBclassifier,MNB_classifier,BernoulliNB_classifier,LogisticRegression_classifier,SGDClassifier_classifier,LinearSVC_classifier)
print("Voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)

ex_data = "drone'lar bugün çizgi olacaksınız"

print(f"Classification: {voted_classifier.classify(find_features(ex_data))}")
print(f"Confidence %: {voted_classifier.confidence(find_features(ex_data))*100}")


Voted_classifier accuracy percent: 72.80701754385966
Classification: Çizgi
Confidence %: 100.0


In [66]:
import time
import logging

from zemberek import (
    TurkishSpellChecker,
    TurkishSentenceNormalizer,
    TurkishSentenceExtractor,
    TurkishMorphology,
    TurkishTokenizer
)

logger = logging.getLogger(__name__)

examples = ["Yrn okua gidicem",
            "Tmm, yarin havuza giricem ve aksama kadar yaticam :)",
            "ah aynen ya annemde fark ettı siz evinizden cıkmayın diyo",
            "gercek mı bu? Yuh! Artık unutulması bile beklenmiyo",
            "Hayır hayat telaşm olmasa alacam buraları gökdelen dikicem.",
            "yok hocam kesınlıkle oyle birşey yok",
            "herseyi soyle hayatında olmaması gerek bence boyle ınsanların falan baskı yapıyosa",
            "email adresim zemberek_python@loodos.com",
            "Kredi başvrusu yapmk istiyrum.",
            "Bankanizin hesp blgilerini ogrenmek istyorum."]

morphology = TurkishMorphology.create_with_defaults()

# SENTENCE NORMALIZATION
start = time.time()
normalizer = TurkishSentenceNormalizer(morphology)
logger.info(f"Normalization instance created in: {time.time() - start} s")

start = time.time()
for example in examples:
    print(example)
    print(normalizer.normalize(example), "\n")
logger.info(f"Sentences normalized in: {time.time() - start} s")

start = time.time()
sc = TurkishSpellChecker(morphology)
logger.info(f"Spell checker instance created in: {time.time() - start} s")


# SPELLING SUGGESTION
li = ["okuyablirim", "tartısıyor", "Ankar'ada", "knlıca", "yapablrim", "kıredi", "geldm", "geliyom", "aldm", "asln"]
start = time.time()
for word in li:
    print(word + " = " + ' '.join(sc.suggest_for_word(word)))
logger.info(f"Spells checked in: {time.time() - start} s")


# SENTENCE BOUNDARY DETECTION
start = time.time()
extractor = TurkishSentenceExtractor()
print("Extractor instance created in: ", time.time() - start, " s")

text = "İnsanoğlu aslında ne para ne sevgi ne kariyer ne şöhret ne de çevre ile sonsuza dek mutlu olabilecek bir " \
       "yapıya sahiptir. Dış kaynaklardan gelebilecek bu mutluluklar sadece belirli bir zaman için insanı mutlu " \
       "kılıyor. Kişi bu kaynakları elde ettiği zaman belirli bir dönem için kendini iyi hissediyor, ancak alışma " \
       "dönemine girdiği andan itibaren bu iyilik hali hızla tükeniyor. Mutlu olma sanatının özü bu değildir. Gerçek " \
       "mutluluk, kişinin her türlü olaya ve duruma karşı kendini pozitif tutarak mutlu hissedebilmesi halidir. Bu " \
       "davranış şeklini edinen insan, zor günlerde güçlü, mutlu günlerde zevk alan biri olur ve mutluluğu kalıcı " \
       "kılar. "

start = time.time()
sentences = extractor.from_paragraph(text)
print(f"Sentences separated in {time.time() - start}s")

for sentence in sentences:
    print(sentence)
print("\n")

# SINGLE WORD MORPHOLOGICAL ANALYSIS
results = morphology.analyze("kalemin")
for result in results:
    print(result)
print("\n")

# SENTENCE ANALYSIS AND DISAMBIGUATION

sentence = "Yarın kar yağacak."
analysis = morphology.analyze_sentence(sentence)
after = morphology.disambiguate(sentence, analysis)

print("\nBefore disambiguation")
for e in analysis:
    print(f"Word = {e.inp}")
    for s in e:
        print(s.format_string())

print("\nAfter disambiguation")
for s in after.best_analysis():
    print(s.format_string())

# TOKENIZATION
tokenizer = TurkishTokenizer.DEFAULT

tokens = tokenizer.tokenize("Saat 12:00.")
for token in tokens:
    print('Content = ', token.content)
    print('Type = ', token.type_.name)
    print('Start = ', token.start)
    print('Stop = ', token.end, '\n')

2025-01-02 17:51:33,875 - zemberek.morphology.turkish_morphology - INFO
Msg: TurkishMorphology instance initialized in 17.299692392349243

2025-01-02 17:51:54,309 - __main__ - INFO
Msg: Normalization instance created in: 20.433939456939697 s

Yrn okua gidicem
yarın okula gideceğim 

Tmm, yarin havuza giricem ve aksama kadar yaticam :)
tamam , yarın havuza gireceğim ve akşama kadar yatacağım :) 

ah aynen ya annemde fark ettı siz evinizden cıkmayın diyo
ah aynen ya annemde fark etti siz evinizden çıkmayın diyor 

gercek mı bu? Yuh! Artık unutulması bile beklenmiyo
gerçek mi bu ? yuh ! artık unutulması bile beklenmiyor 

Hayır hayat telaşm olmasa alacam buraları gökdelen dikicem.
hayır hayat telaşı olmasa alacağım buraları gökdelen dikeceğim . 

yok hocam kesınlıkle oyle birşey yok
yok hocam kesinlikle öyle bir şey yok 

herseyi soyle hayatında olmaması gerek bence boyle ınsanların falan baskı yapıyosa
herşeyi söyle hayatında olmaması gerek bence böyle insanların falan baskı yapıyorsa 

