Imported libraries which is crucial for the program.

In [1]:
import tensorflow as tf
import numpy as np
from tensorflow import keras

Calling dataset from google drive

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


Each sentence is checked for the presence of " de " or " ki " (with spaces to ensure they are not part of another word). Only sentences containing these substrings are processed and included in the final dataset. This ensures that our dataset focuses on sentences relevant to your task of classifying the usage of "de" and "ki".

In [3]:
from gensim.models import KeyedVectors

def process_text(text):
    # Split the text into lines and filter out lines starting with </doc> or <doc>
    lines = text.split('\n')
    filtered_lines = [line for line in lines if not line.startswith('</doc>') and not line.startswith('<doc')]

    # Join the filtered lines back into a single string for processing
    filtered_text = '\n'.join(filtered_lines)

    processed_sentences = []
    labels = []  # Initialize an empty list to store the labels

    sentences = filtered_text.lower().split(".")
    for sentence in sentences:
        if 'de ' in sentence or 'ki ' in sentence:  # Check if the sentence contains 'de' or 'ki'
            words = sentence.split()
            processed_words = [word for word in words if not (word.startswith('<') and word.endswith('>'))]
            processed_sentence = ' '.join(processed_words).strip()

            # Determine label based on the presence of ' de ' or ' ki ' as separate words
            if ' de ' in processed_sentence or ' ki ' in processed_sentence:
                label = 1  # Indicates '<separated>'
            else:
                label = 0  # Indicates '<unified>'

            processed_sentences.append(processed_sentence)
            labels.append(label)

    return processed_sentences, labels

def read_partial_file(file_path, portion):
    with open(file_path, 'r', encoding='utf8') as file:
        file.seek(0, 2)
        file_size = file.tell()
        file.seek(0)
        data_size = int(file_size * portion)
        return file.read(data_size)

try:
    _data = read_partial_file('/content/drive/MyDrive/Colab/wiki_00', 0.1)
    training_size = int(len(_data) * 0.95)
    training_data = _data[:training_size]
    testing_data = _data[training_size:]

    processed_training_data, training_labels = process_text(training_data)
    # Convert list to string
    processed_training_data_str = '\n'.join(processed_training_data)
    with open('training_data.txt', 'w', encoding="utf8") as f:
        f.write(processed_training_data_str)

    processed_testing_data, testing_labels = process_text(testing_data)
    # Convert list to string
    processed_testing_data_str = '\n'.join(processed_testing_data)
    with open('testing_data.txt', 'w', encoding="utf8") as f:
        f.write(processed_testing_data_str)

except Exception as e:
    print(f"An error occurred: {e}")

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Assuming `processed_training_data` and `processed_testing_data` are lists of sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(processed_training_data)  # Fit tokenizer on training data

# Convert sentences to sequences
training_sequences = tokenizer.texts_to_sequences(processed_training_data)
testing_sequences = tokenizer.texts_to_sequences(processed_testing_data)

# Pad sequences to ensure uniform length
max_len = max(max(len(x) for x in training_sequences), max(len(x) for x in testing_sequences))
X_train = pad_sequences(training_sequences, maxlen=max_len)
X_test = pad_sequences(testing_sequences, maxlen=max_len)

# Convert labels to numpy arrays
Y_train = np.array(training_labels)
Y_test = np.array(testing_labels)

In [6]:
# Manual splitting for validation set
val_size = int(len(X_train) * 0.1)  # 10% of training data for validation

X_val = X_train[:val_size]
Y_val = Y_train[:val_size]

X_train_new = X_train[val_size:]
Y_train_new = Y_train[val_size:]

In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam

vocab_size = len(tokenizer.word_index) + 1  # Assuming tokenizer is already fitted to your corpus

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=100, input_length=max_len),  # Increased embedding dimension
    LSTM(64),  # Using LSTM layer to capture sequential dependencies
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])

# Compile the model with Adam optimizer and binary crossentropy loss
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [37]:
from tensorflow.keras.callbacks import EarlyStopping

#early stopping to halt the training when the validation loss stops improving. This can save time by preventing unnecessary epochs.
#early_stopping = EarlyStopping(monitor='val_loss', patience=3)

history = model.fit(
    X_train_new, Y_train_new,
    epochs=10,
    batch_size=32,
    validation_data=(X_val, Y_val),
    verbose=1
    #callbacks=[early_stopping]
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [38]:
# Tokenize the test sentences
test_sequences = tokenizer.texts_to_sequences(processed_testing_data)  # Assuming this is your list of test sentences

# Pad the sequences
X_test = pad_sequences(test_sequences, maxlen=max_len)

# Ensure the test labels are in the correct format
Y_test = np.array(testing_labels)

evaluation_results = model.evaluate(X_test, Y_test)
print(evaluation_results)

[0.18216589093208313, 0.9882139563560486]


In [41]:
import numpy as np

# Test sentences
test_sentences = [
    "Kitap masanın üstünde duruyordu.",
    "Arkadaşlar da gelmiş.",
    "O günki hava çok güzeldi.",
    "Derslerinde başarılı bir öğrenciydi.",
    "Herkesin de bir hikayesi var.",
    "Kapıdaki kimdi?",
    "Yazdıklarını okudum da çok beğendim.",
    "Bu işin sonu nereye varacak bilmiyorum ki.",
    "Olanlar olmuştu artık, yapacak bir şey yoktu.",
    "Sen de mi Brutus?",
    "Annesi de buradaydı.",
    "Bize de haber verir misin?",
    "Kitaplarda masanın üstündeydi.",
    "Yemekler hazırlandı ki.",
    "Evimizde duruyor.",
    "O günde çok güzeldi.",
    "Yarın da gelecekler.",
    "Gömlekteki leke çıktı.",
    "Dün de aynıydı.",
    "Herkesinki farklı"
]

# Predict whether "de" and "ki" suffixes should be separated or not for each sentence
for input_text in test_sentences:
    # Tokenize the input text
    sequence = tokenizer.texts_to_sequences([input_text])
    # Pad the sequence
    padded_sequence = pad_sequences(sequence, maxlen=max_len)

    # Use model.predict
    prediction = model.predict(padded_sequence)
    # Convert probability to class label
    class_prediction = (prediction > 0.5).astype(int)

    print(f"Sentence: '{input_text}'")
    print(f"Prediction: {'Should be separated' if class_prediction[0][0] == 1 else 'Should be unified'}")
    print()

Sentence: 'Kitap masanın üstünde duruyordu.'
Prediction: Should be unified

Sentence: 'Arkadaşlar da gelmiş.'
Prediction: Should be unified

Sentence: 'O günki hava çok güzeldi.'
Prediction: Should be unified

Sentence: 'Derslerinde başarılı bir öğrenciydi.'
Prediction: Should be unified

Sentence: 'Herkesin de bir hikayesi var.'
Prediction: Should be separated

Sentence: 'Kapıdaki kimdi?'
Prediction: Should be unified

Sentence: 'Yazdıklarını okudum da çok beğendim.'
Prediction: Should be unified

Sentence: 'Bu işin sonu nereye varacak bilmiyorum ki.'
Prediction: Should be unified

Sentence: 'Olanlar olmuştu artık, yapacak bir şey yoktu.'
Prediction: Should be unified

Sentence: 'Sen de mi Brutus?'
Prediction: Should be separated

Sentence: 'Annesi de buradaydı.'
Prediction: Should be separated

Sentence: 'Bize de haber verir misin?'
Prediction: Should be separated

Sentence: 'Kitaplarda masanın üstündeydi.'
Prediction: Should be unified

Sentence: 'Yemekler hazırlandı ki.'
Prediction