# **PREPROCESSING**

Read the Wikipedia data and clean the doc links. Then, write the cleaned file to "cleaned_wiki_00.txt".

In [1]:
import re

def clean_text(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        cleaned_content = re.sub(r'<.*?>', '', content)

    return cleaned_content

cleaned_data = clean_text('wiki_00')

with open('cleaned_wiki_00.txt', 'w', encoding='utf-8') as file:
    file.write(cleaned_data)


# **EXTRACT SENTENCES**
In this code block, we extract the all sentences in the Wikipedia data. There are only sentences that includes only one "ki" or only one "de-da-te-ta" suffix. It is assumed that there is only one "de-da-te-ta" or there is only one "ki".

In [2]:
import re

def extract_sentences(text):
    sentences = re.split(r'\.\s+|\.$', text)
    return [sentence.strip() for sentence in sentences if sentence]

def write_selected_sentences_to_file(sentences, output_file_path):
    with open(output_file_path, 'w', encoding='utf-8') as file:
        for sentence in sentences:
            sentence = sentence.split()
            de_counter = 0
            ki_counter = 0
            for word in sentence:
            # Check the number of "de" and "ki" suffixes or conjunctions
                if(word=="de" or word=="da" or word[-2:]=="de" or word[-2:]=="da" or word[-2:]=="te" or word[-2:]=="ta"):
                    de_counter = de_counter + 1
                if(word=="ki" or word[-2:]=="ki"):
                    ki_counter = ki_counter + 1

            if((de_counter == 1 and ki_counter ==0) or (de_counter == 0 and ki_counter ==1) or (de_counter == 1 and ki_counter ==1)):
                file.write(" ".join(sentence) + '\n')

sentences = extract_sentences(cleaned_data)
write_selected_sentences_to_file(sentences, 'extracted_sentences.txt')


# **TOKENIZE SENTENCES**
In here, sentences are tokenized. "De-da-te-ta" suffixes and "ki" suffixes are the important point in this process. For example:
"Ahmetler de gelecek" -> ["Ahmed", "de_suffix", "gelecek"]
"Masada ekmek var" -> ["Masa", "de_suffix", "ekmek", "var"]

In [4]:
import re

def custom_tokenize(sentence):
    # Split words to process the suffixes "de" and "ki"
    tokens = []
    de_token = "de_suffix"
    ki_token = "ki_suffix"
    words = sentence.split()

    for word in words:
        word = word.lower()
        # Check the word if it ends with a suffix "de" or "ki"
        if re.match(r'.+(de|da|ta|te)$', word):
            # Add the word and suffix as token.
            tokens.append(word[:-2])
            tokens.append(de_token)
        elif re.match(r'.+(ki)$', word):
            # Add the word and suffix as token.
            tokens.append(word[:-2])
            tokens.append(ki_token)
        else:
            # Add the word what you read
            if(word=="da" or word=="de"):
                tokens.append(de_token)
            elif(word=="ki"):
                tokens.append(ki_token)
            else:
                tokens.append(word)

    return tokens


# **BUILD DICTIONARY**



In [5]:
from collections import defaultdict

def build_vocab(sentences):
    vocab = defaultdict(lambda: len(vocab))
    vocab["<PAD>"] = 0  # Paddint Token
    vocab["<UNK>"] = 1  # Token for unknown words
    for sentence in sentences:
        tokens = custom_tokenize(sentence)
        for token in tokens:
            vocab[token]

    return dict(vocab)

def sentences_to_token_lists(sentences, vocab, max_length):
    token_lists = []
    for sentence in sentences:
        tokens = custom_tokenize(sentence)
        token_list = [vocab.get(token, vocab['<UNK>']) for token in tokens]  # For unknown words <UNK>
        # Padding uygula
        padded_token_list = token_list + [0] * (max_length - len(token_list))
        token_lists.append(padded_token_list[:max_length])  # Crop it. Do not exceed max. length
    return token_lists

# **LABEL SENTENCES AND CREATE JSON FILES**


1.   Read file and get all sentences. Put them to a list
2.   Get a sample from the sentences. The percentage is 0.3 for this dataset
3.   Check the number of sentences
4.   Split data into Train, Validation and Test set
5.   Tokenize all words in sentences and create dictionary
6.   Create JSON files and save them




In [6]:
from collections import defaultdict
from sklearn.model_selection import train_test_split
import json

# Label all sentences
def label_sentence(sentence):
    # If "de" is a conjunction
    if " de " in sentence or sentence.startswith("de ") or sentence.endswith(" de") or  " da " in sentence or sentence.startswith("da ") or sentence.endswith(" da"):
        return "DE_CON"
    # If "ki" is a conjunction
    if " ki " in sentence or sentence.startswith("ki ") or sentence.endswith(" ki"):
        return "KI_CON"

    # If "de" or "ki" is a suffix (bulunma hali eki)
    else:
        words = sentence.split()
        for word in words:
            if word[-2:]=="ki":
                return "KI_SUF"
        return "DE_SUF"

# Create JSON files of labelled sentences. Label, then add to dictionary
def create_json_labels(sentences, vocab, file_name, max_length):
    data = []
    token_lists = sentences_to_token_lists(sentences, vocab, max_length)
    for sentence, token_list in zip(sentences, token_lists):
        label = label_sentence(sentence)
        data.append({"sentence": token_list, "label": label})

    with open(file_name, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, ensure_ascii=False, indent=4)

# Read file and add all lines to the list
def read_file_to_list(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        sentences = [line.strip() for line in file]
    return sentences

file_path = 'extracted_sentences.txt'
sentences = read_file_to_list(file_path) # Extract all sentences from the text

# Get 0.3 of the sentences for training
percentage = 3
sentences = sentences[:int(len(sentences) * (percentage / 10))]


# Check the sentence numbers
print(f"Total Sentences: {len(sentences)}")
print("First 5 Sentence:")
for sentence in sentences[:5]:
    print(sentence)


# Split dataset as training, valid and test set. 70% Train, 20% Valid, 10% Test
train_sentences, test_valid_sentences = train_test_split(
    sentences, test_size=0.3, random_state=42)

valid_sentences, test_sentences = train_test_split(
    test_valid_sentences, test_size=1/3, random_state=42)

print("Size of Training Set:", len(train_sentences))
print("Size of Validation Set:", len(valid_sentences))
print("Size of Test Set:", len(test_sentences))

# Create dictionary (vocabulary)
vocab = build_vocab(sentences)
with open('vocab.json', 'w') as json_file:
    json.dump(vocab, json_file, indent=4)


# Create JSON label files
max_length = max(len(custom_tokenize(sentence)) for sentence in sentences)
create_json_labels(train_sentences, vocab, 'sentence_labels_train.json', max_length)
create_json_labels(valid_sentences, vocab, 'sentence_labels_valid.json', max_length)
create_json_labels(test_sentences, vocab, 'sentence_labels_test.json', max_length)


Total Sentences: 320185
First 5 Sentence:
Cengiz Han’ın şeceresi yarı mitolojik bir şekilde sis perdesi arkasındadır
Yazılış tarihi itibarı ile Cengiz Han’ın yaşadığı döneme en yakın olanı Şamanizm etkilerinin görüldüğü, 1240 yılında Moğolca olarak kaleme alınmış olan Moğolların Gizli Tarihi adlı esere göre Cengiz Han’dan 10 nesil önce yaşayan Alangoya, Cengiz Han soyunun efsanevi büyük annesi olarak kabul edilmiştir
Alangoya efsanesi yalnızca Cengiz Han’ı değil onunla birlikte “Nirun” yani ışığın çocukları adı verilen bir yığın boyu ilgilendirse de Cengiz Han soyunun en büyük efsanesi olarak kabul edilmiştir
1140 yılında Moğol kabilelerinden Börçiginlere mensup Kabul, bütün Moğolların ilk lideri olarak “Han” unvanını almıştır
Cengiz Han’ın büyük amacası Kutua, bu hakarete Çin üzerine ve Tatarlara bir dizi saldırı düzenleyerek cevap verdi ve bu akınlar sonunda “Moğol Herkülü” unvanını kazandı
Size of Training Set: 224129
Size of Validation Set: 64037
Size of Test Set: 32019


# **TOKENIZING A SPECIFIC SENTENCE**
This function converts a sentence to token list. It processes "de" and "ki" specifically.
Args: Sentence is the sentence that will be tokenized, token_dict is the dictionary that includes word-token matches.
It returns token list

In [7]:
def tokenize_sentence(sentence, token_dict):
    tokens = []
    de_token = token_dict.get("de_suffix", token_dict.get("<OOV>"))
    ki_token = token_dict.get("ki_suffix", token_dict.get("<OOV>"))
    words = sentence.split()

    for word in words:
        word = word.lower()
        # Kelime "de" veya "ki" ile bitiyor mu diye kontrol et
        if re.match(r'.+(de|da|ta|te)$', word):
            # Kelimeyi ve eki ayrı token olarak ekle
            base_word = word[:-2]
            tokens.append(token_dict.get(base_word, token_dict.get("<OOV>")))
            tokens.append(de_token)
        elif re.match(r'.+(ki)$', word):
            # Kelimeyi ve eki ayrı token olarak ekle
            base_word = word[:-2]
            tokens.append(token_dict.get(base_word, token_dict.get("<OOV>")))
            tokens.append(ki_token)
        else:
            # Kelimeyi olduğu gibi ekle
            if word.lower() in ["da", "de"]:
                tokens.append(de_token)
            elif word.lower() == "ki":
                tokens.append(ki_token)
            else:
                tokens.append(token_dict.get(word.lower(), token_dict.get("<OOV>")))

    return tokens

## **LOAD LABELS**
Load the labels that we create before

In [8]:
import json

def load_data(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        data = json.load(file)
        X = [item["sentence"] for item in data]
        y = [item["label"] for item in data]
    return X, y

# JSON dosyalarını yükle
X_train, y_train = load_data('sentence_labels_train.json')
X_valid, y_valid = load_data('sentence_labels_valid.json')
X_test, y_test = load_data('sentence_labels_test.json')

def check_lengths(X):
    lengths = [len(x) for x in X]
    return lengths

# Check the lengths
train_lengths = check_lengths(X_train)
valid_lengths = check_lengths(X_valid)
test_lengths = check_lengths(X_test)

# Print the length of sentences
print("Sentence length in the training set:")
print(train_lengths)
print("\nSentence length in the valid set:")
print(valid_lengths)
print("\nSentence length in the test set:")
print(test_lengths)

Sentence length in the training set:
[1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 16

# **MODEL DETAILS**

In [12]:
from sklearn.preprocessing import LabelEncoder
from keras.metrics import Precision, Recall
from keras.preprocessing.sequence import pad_sequences

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_valid_encoded = label_encoder.transform(y_valid)
y_test_encoded = label_encoder.transform(y_test)


from keras.models import Sequential
from keras.layers import Dense

# Model parameters
max_token_count = max(max(train_lengths), max(valid_lengths), max(test_lengths))
input_dim = max_token_count  # Max token number in all datasets

output_dim = 4  # Output size (Unique label number)
hidden_units = 64  # Neuron number in the hidden layers

# Padding
X_train_padded = pad_sequences(X_train, maxlen=max_token_count, padding='post')
X_valid_padded = pad_sequences(X_valid, maxlen=max_token_count, padding='post')
X_test_padded = pad_sequences(X_test, maxlen=max_token_count, padding='post')

from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Flatten

model = Sequential()
model.add(Embedding(input_dim=len(vocab), output_dim=4, input_length=max_token_count))
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(hidden_units, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(hidden_units, activation='relu'))
model.add(Dense(output_dim, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

model.fit(X_train_padded, y_train_encoded, validation_data=(X_valid_padded, y_valid_encoded), epochs=30, batch_size=16)

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 1661, 4)           2010084   
                                                                 
 dropout_7 (Dropout)         (None, 1661, 4)           0         
                                                                 
 flatten_4 (Flatten)         (None, 6644)              0         
                                                                 
 dense_9 (Dense)             (None, 64)                425280    
                                                                 
 dropout_8 (Dropout)         (None, 64)                0         
                                                                 
 dense_10 (Dense)            (None, 64)                4160      
                                                                 
 dense_11 (Dense)            (None, 4)                

<keras.src.callbacks.History at 0x7d89e2e67df0>

In [13]:
from sklearn.metrics import precision_score, recall_score
import numpy as np

# model.load_weights('/content/nlp//model_weights.h5')

# Prediction on test set
y_pred = model.predict(X_test_padded)
y_pred_classes = np.argmax(y_pred, axis=1)

# Precision and Recall calculation
precision = precision_score(y_test_encoded, y_pred_classes, average='weighted')
recall = recall_score(y_test_encoded, y_pred_classes, average='weighted')

print("Precision: {:.2f}".format(precision))
print("Recall: {:.2f}".format(recall))


Precision: 0.87
Recall: 0.89


  _warn_prf(average, modifier, msg_start, len(result))


# **TEST**
Create a test set. Tokenize them, apply padding, then give the model for prediction.

In [14]:
def tokenize_sentence(sentence, token_dict):
    tokens = []
    de_token = token_dict.get("de_suffix", token_dict.get("<OOV>"))
    ki_token = token_dict.get("ki_suffix", token_dict.get("<OOV>"))
    words = sentence.split()

    for word in words:
        word = word.lower()
        if re.match(r'.+(de|da|ta|te)$', word):
            base_word = word[:-2]
            tokens.append(token_dict.get(base_word, token_dict.get("<OOV>")))
            tokens.append(de_token)
        elif re.match(r'.+(ki)$', word):
            base_word = word[:-2]
            tokens.append(token_dict.get(base_word, token_dict.get("<OOV>")))
            tokens.append(ki_token)
        else:
            if word.lower() in ["da", "de"]:
                tokens.append(de_token)
            elif word.lower() == "ki":
                tokens.append(ki_token)
            else:
                tokens.append(token_dict.get(word.lower(), token_dict.get("<UNK>")))

    return tokens

def pad_token_lists(token_lists, max_length, padding_value=0):
    padded_lists = []

    for tokens in token_lists:
        if len(tokens) < max_length:
            padded_list = tokens + [padding_value] * (max_length - len(tokens))
        elif len(tokens) > max_length:
            padded_list = tokens[:max_length]
        else:
            padded_list = tokens

        padded_lists.append(padded_list)

    return padded_lists

sentences = ["Çatıdaki yaprakları temizle", "Sende gördün mü", "Evde gördüm seni", "Kaya da geliyor.", \
             "Masada görüşürüz", \
             "Sette görüşeceğiz", "Gül ki güller açsın", "Evde ki çocuğu gördün mü.", \
             "Hem kar hem de tipi vardı", "Benki yedi iklimin sultanı", "Onu da bırakalım", \
             "Geçenki gibi oldu yine.", "Güzel çalışki başarılı olasın", \
             "O günki gibi oldu yine", \
             "Elma da alayım mı?", "Ya sende ne boş bir adam çıktın.", "Yaparız yapmasına da sonrası?", \
             "Test setin de yok bu cümleler.", "Bitirme projem de biber tespiti yaptım", \
             "Tamam, birinci kelime cepte"]
tokenized_sentences = []
for sentence in sentences:
    tokenized_sentence = tokenize_sentence(sentence, vocab)
    tokenized_sentences.append(tokenized_sentence)
    print(tokenized_sentence)

# Apply padding
padded_tokenized_sentences = pad_token_lists(tokenized_sentences, max_length)

print(padded_tokenized_sentences)

# Prediction
for sentence, orig_sentence in zip(padded_tokenized_sentences, sentences):
  input_data = np.array([sentence])

  # Model prediction
  prediction = model.predict(input_data)
  predicted_class = np.argmax(prediction, axis=1)
  predicted_label = label_encoder.inverse_transform(predicted_class)
  print("Cümle:", orig_sentence, "Tahmin edilen sınıf:", predicted_label)




[383353, 105, 53080, 1]
[447, 9, 53324, 121793]
[3960, 9, 82900, 30556]
[14242, 9, 1]
[7814, 9, 1]
[17354, 9, 1]
[18691, 105, 278189, 265505]
[3960, 9, 105, 361, 53324, 1]
[1142, 14288, 1142, 9, 3922, 205]
[425, 105, 3948, 10938, 22644]
[210, 9, 64801]
[1664, 105, 762, 560, 1]
[1975, 35009, 105, 2481, 285367]
[103, 119, 105, 762, 560, 2699]
[29214, 9, 1, 74719]
[2064, 447, 9, 971, 1735, 7, 333, 1]
[308832, 80003, 9, 1]
[8277, 27114, 9, 774, 80, 1]
[176808, 1, 9, 19900, 23300, 36359]
[1, 3888, 6502, 45190, 9]
[[383353, 105, 53080, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# !cp sentence_labels_test.json /content/drive/MyDrive/
# !cp sentence_labels_train.json /content/drive/MyDrive/
# !cp vocab.json /content/drive/MyDrive/


In [None]:
# !cp sentence_labels_valid.json /content/drive/MyDrive/
# model.save_weights('model_weights.h5')
# !cp model_weights.h5 /content/drive/MyDrive/
