In [28]:
# Reading the file
try:
    with open('wiki_00', 'r', encoding='utf-8') as file:
        text = file.read()
except FileNotFoundError:
    print("The file was not found.")
    text = ""

In [29]:
    # Importing necessary libraries
    import nltk
    from nltk.tokenize import sent_tokenize
    import numpy as np
    import re
    from keras.models import Sequential
    from keras.layers import Dense
    from sklearn.model_selection import train_test_split
    import string
    
    
    # Downloading and setting up NLTK
    nltk.download('punkt')
    
    # Tokenize the text into sentences
    sentences = sent_tokenize(text, language='turkish')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ekasi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [30]:
# Preprocessing function
def preprocess_sentence(sentence):
    sentence = sentence.lower()
    sentence = re.sub(f"[{string.punctuation}]", "", sentence)
    return sentence

# Preprocess each sentence
preprocessed_sentences = [preprocess_sentence(sentence) for sentence in sentences]

In [31]:
# Regular Expressions for Separated Sentences (de/da and ki)
separated_pattern = r'\s(de|da|ki)\s'
separated_sentences = [sentence for sentence in preprocessed_sentences if re.search(separated_pattern, sentence, re.IGNORECASE)]

# Regular Expressions for Unseparated Sentences (de/da and ki)
unseparated_pattern = r'\S(de|da|ki)\S'
unseparated_sentences = [sentence for sentence in preprocessed_sentences if re.search(unseparated_pattern, sentence, re.IGNORECASE)]

import random

# Labels for separated sentences (True)
separated_labels = [True] * len(separated_sentences)

# Downsampling the unseparated sentences
random.seed(42)  # For reproducibility
unseparated_downsampled = random.sample(unseparated_sentences, len(separated_sentences))

# Update labels for the downsampled unseparated sentences
unseparated_labels_downsampled = [False] * len(unseparated_downsampled)

# Combine sentences and labels
combined_sentences = separated_sentences + unseparated_downsampled
combined_labels = separated_labels + unseparated_labels_downsampled
assert len(combined_sentences) == len(combined_labels)

In [32]:
# Word2Vec Model Training
from gensim.models import Word2Vec
tokenized_sentences = [sentence.split() for sentence in combined_sentences]
word2vec_model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)

# Vectorize each sentence using Word2Vec
def vectorize_sentence(sentence, model):
    words = sentence.split()
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    if not word_vectors:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

sentence_embeddings = np.array([vectorize_sentence(sentence, word2vec_model) for sentence in combined_sentences])

In [33]:
from keras.layers import Dropout
from sklearn.metrics import precision_score, recall_score, f1_score

# Neural network model setup
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=sentence_embeddings.shape[1]))
model.add(Dropout(0.5))  #for regularization
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Binary classification

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Convert combined labels to a numpy array
y = np.array(combined_labels)

# Splitting data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(sentence_embeddings, y, test_size=0.2, random_state=42)

# Training the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluating the model
loss, accuracy = model.evaluate(X_test, y_test)
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5).astype(int)  # Convert probabilities to binary predictions

# Calculating Precision, Recall, and F1-Score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Test Accuracy: {accuracy*100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 92.86%
Precision: 0.88
Recall: 0.99
F1 Score: 0.93


In [35]:
def predict_new_sentences(new_sentences, model, word2vec_model):
    # Preprocess the new sentences
    preprocessed_sentences = [preprocess_sentence(sentence) for sentence in new_sentences]

    # Vectorize the new sentences using the Word2Vec model
    new_sentence_embeddings = np.array([vectorize_sentence(sentence, word2vec_model) for sentence in preprocessed_sentences])

    # Make predictions
    predictions = model.predict(new_sentence_embeddings)
    # Convert predictions to binary labels (0 or 1)
    return (predictions > 0.5).astype(int)


In [39]:
# Example usage
test_sentences = [
    "Bugün parkta bir yürüyüş yaptık.",
    "Kitap masanın üstünde duruyordu.",
    "Pencereden bakan çocuk da mutlu görünüyordu.",
    "Yarınki toplantıya katılamayacağım.",
    "Bu iş tam da bana göre.",
    "Kediler de insanlar gibi duygusal olabilir.",
    "Olay yerindeki deliller incelendi.",
    "Anladığım kadarıyla ders çok zormuş ki düşük not almışlar.",
    "Köpeğim dün gece de çok havladı.",
    "Bu konudaki düşüncelerini merak ediyorum.",
    "Evdeki hesap çarşıya uymaz.",
    "Yeni aldığın ayakkabılar çok mu rahat ki?",
    "Arkadaşımla dün sinemada vakit geçirdik.",
    "Tatilde deniz kenarında bir evde kaldık.",
    "Okuldaki öğretmenler çok iyiydi.",
    "Dün akşamki yemeğin tadı hâlâ damağımda.",
    "Yazın ortasında da kar yağdı.",
    "Bu akşamki konseri kaçırmak istemiyorum.",
    "Dünkü maçta çok heyecanlandım.",
    "Kitaplıktaki kitaplar da tozlanmış."
]

predictions = predict_new_sentences(test_sentences, model, word2vec_model)

# Print predictions
for sentence, prediction in zip(test_sentences, predictions):
    print(f"Sentence: '{sentence}' - Predicted Label: {prediction[0]}")

Sentence: 'Bugün parkta bir yürüyüş yaptık.' - Predicted Label: 0
Sentence: 'Kitap masanın üstünde duruyordu.' - Predicted Label: 0
Sentence: 'Pencereden bakan çocuk da mutlu görünüyordu.' - Predicted Label: 1
Sentence: 'Yarınki toplantıya katılamayacağım.' - Predicted Label: 0
Sentence: 'Bu iş tam da bana göre.' - Predicted Label: 1
Sentence: 'Kediler de insanlar gibi duygusal olabilir.' - Predicted Label: 1
Sentence: 'Olay yerindeki deliller incelendi.' - Predicted Label: 0
Sentence: 'Anladığım kadarıyla ders çok zormuş ki düşük not almışlar.' - Predicted Label: 1
Sentence: 'Köpeğim dün gece de çok havladı.' - Predicted Label: 1
Sentence: 'Bu konudaki düşüncelerini merak ediyorum.' - Predicted Label: 0
Sentence: 'Evdeki hesap çarşıya uymaz.' - Predicted Label: 0
Sentence: 'Yeni aldığın ayakkabılar çok mu rahat ki?' - Predicted Label: 1
Sentence: 'Arkadaşımla dün sinemada vakit geçirdik.' - Predicted Label: 0
Sentence: 'Tatilde deniz kenarında bir evde kaldık.' - Predicted Label: 0
Se