# In [None]:
from gensim.models import KeyedVectors

def process_text(text):
    """Turn raw corpus text into labelled sentences about 'de' / 'ki'.

    Lines starting with ``<doc`` or ``</doc>`` are dropped, the remaining text
    is lowercased and split on ``.``. Only sentences containing the substring
    ``'de '`` or ``'ki '`` are kept; whitespace is normalised and tokens of the
    form ``<...>`` are removed.

    Args:
        text: Raw text, possibly containing ``<doc ...>`` / ``</doc>`` lines.

    Returns:
        A string with one kept sentence per line, each followed by a space and
        its label: ``<separated>`` when 'de' or 'ki' occurs as a standalone
        word, ``<unified>`` otherwise.
    """
    # str.startswith accepts a tuple, so one call covers both marker kinds.
    kept_lines = [
        line for line in text.split('\n')
        if not line.startswith(('</doc>', '<doc'))
    ]
    sentences = '\n'.join(kept_lines).lower().split('.')

    processed_sentences = []
    for sentence in sentences:
        # Substring test on purpose: it also matches words that merely end in
        # 'de' / 'ki' (e.g. "evde "), which are the '<unified>' examples.
        if 'de ' not in sentence and 'ki ' not in sentence:
            continue

        words = [
            word for word in sentence.split()
            if not (word.startswith('<') and word.endswith('>'))
        ]
        processed_sentence = ' '.join(words)

        # Compare whole tokens rather than searching for ' de ' / ' ki ' in the
        # joined string: the padded-substring test misses a standalone word
        # that is the first or last token of the sentence.
        if 'de' in words or 'ki' in words:
            label = '<separated>'
        else:
            label = '<unified>'

        processed_sentences.append(f"{processed_sentence} {label}")

    return '\n'.join(processed_sentences)

def load_word_embeddings(model_path):
    """Load word vectors stored in the binary word2vec format.

    Args:
        model_path: Location of the binary word2vec file.

    Returns:
        The object produced by ``KeyedVectors.load_word2vec_format`` for
        that file.
    """
    vectors = KeyedVectors.load_word2vec_format(model_path, binary=True)
    return vectors

def convert_to_embeddings(sentences, word_embeddings):
    """Map each sentence to the list of vectors of its known words.

    Args:
        sentences: Iterable of strings; each is split on whitespace.
        word_embeddings: Mapping-like object supporting ``word in obj`` and
            ``obj[word]``.

    Returns:
        A list with one entry per sentence. Each entry is the list of
        ``word_embeddings[word]`` values, in word order, for the words found
        in ``word_embeddings``; unknown words are skipped.
    """
    def embed(sentence):
        # Out-of-vocabulary tokens are dropped rather than replaced.
        return [
            word_embeddings[token]
            for token in sentence.split()
            if token in word_embeddings
        ]

    return [embed(sentence) for sentence in sentences]

# Load pre-trained Word2Vec model (binary word2vec format, see load_word_embeddings).
# This runs as soon as the cell/script executes; the resulting module-level
# `word_embeddings` is reused by the embedding conversions further down.
# NOTE(review): the path looks like a Colab Google Drive mount — confirm the
# drive is mounted before running.
word_embeddings = load_word_embeddings('/content/drive/MyDrive/Colab/trmodel')

def read_partial_file(file_path, portion):
    """Return the leading ``portion`` of a UTF-8 text file as a string.

    The portion is measured in bytes of the file on disk. The file is read in
    binary mode so that the byte count obtained from ``tell()`` and the amount
    passed to ``read()`` use the same unit; in text mode ``read(n)`` counts
    characters, which over-reads files containing multi-byte characters.

    Args:
        file_path: Path of the UTF-8 encoded file.
        portion: Fraction of the file size to read (e.g. ``0.3``).

    Returns:
        The decoded text of the first ``int(file_size * portion)`` bytes. A
        multi-byte character cut in half at the end is dropped.

    Raises:
        UnicodeDecodeError: If the bytes read are not valid UTF-8 (other than
            a truncated final character).
    """
    import codecs

    with open(file_path, 'rb') as file:
        file.seek(0, 2)  # whence=2: jump to end of file to learn its size
        file_size = file.tell()
        file.seek(0)
        data_size = int(file_size * portion)
        raw = file.read(data_size)

    # final=False makes the decoder hold back an incomplete trailing code point
    # instead of raising, while still rejecting invalid bytes elsewhere.
    text = codecs.getincrementaldecoder('utf8')().decode(raw, final=False)

    # Text-mode reading used universal newlines; keep producing '\n' only.
    return text.replace('\r\n', '\n').replace('\r', '\n')

# Build the labelled training/testing text files and embed the training
# sentences. The whole pipeline sits in one try block so that any failure is
# reported by the print in the handler.
# NOTE(review): the handler only prints the error and execution continues, so
# names assigned here (e.g. `processed_sentences`) may be undefined afterwards
# — confirm this best-effort behaviour is intended.
try:
    # Read a 0.3 portion of the corpus file.
    _data = read_partial_file('/content/drive/MyDrive/Colab/wiki_00', 0.3)
    # 95% / 5% split by character offset; the cut may fall inside a sentence.
    training_size = int(len(_data) * 0.95)
    training_data = _data[:training_size]
    testing_data = _data[training_size:]

    # Label the training part and persist it in the current working directory.
    processed_training_data = process_text(training_data)
    with open('training_data.txt', 'w', encoding="utf8") as f:
        f.write(processed_training_data)

    # Same for the held-out testing part.
    processed_testing_data = process_text(testing_data)
    with open('testing_data.txt', 'w', encoding="utf8") as f:
        f.write(processed_testing_data)

    # Split the processed data into individual sentences
    # (each entry still ends with its '<separated>' / '<unified>' label).
    processed_sentences = processed_training_data.split('\n')

    # Convert each sentence into word embeddings
    embedded_training_data = convert_to_embeddings(processed_sentences, word_embeddings)

except Exception as e:
    # Broad catch: every error type is reduced to a one-line message.
    print(f"An error occurred: {e}")

# In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dropout, Dense


import numpy as np  # script-level import: fixed-shape arrays are built below


def _to_padded_array(embedded_sentences, shape):
    """Pack variable-length embedded sentences into one float32 array.

    Args:
        embedded_sentences: List of sentences, each a list of word vectors.
        shape: ``(timesteps, dim)`` of a single model input.

    Returns:
        Array of shape ``(len(embedded_sentences), timesteps, dim)``. Sentences
        longer than ``timesteps`` keep their first ``timesteps`` vectors;
        shorter ones are left-padded with zero vectors.
    """
    timesteps, dim = shape
    batch = np.zeros((len(embedded_sentences), timesteps, dim), dtype=np.float32)
    for row, vectors in enumerate(embedded_sentences):
        clipped = vectors[:timesteps]
        if clipped:
            # Left padding keeps the real tokens at the end of the sequence,
            # so the RNN's final state is not computed over trailing zeros.
            batch[row, timesteps - len(clipped):] = np.asarray(clipped, dtype=np.float32)
    return batch


# Split the data into training, validation, and test sets (60% / 20% / 20%)
train_sentences, test_sentences = train_test_split(processed_sentences, test_size=0.2, random_state=42)
train_sentences, val_sentences = train_test_split(train_sentences, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

# Define the shape of the input data: (timesteps, embedding dimension)
input_shape = (100, 300)  # Adjust these numbers to match your data

# Convert sentences into word embeddings, then into fixed-shape arrays.
# The model needs one rectangular array per split; the ragged lists returned
# by convert_to_embeddings cannot be fed to it directly.
# NOTE(review): each sentence occupies timesteps * dim float32 values, so very
# large corpora may not fit in memory — consider batching if that happens.
train_data = _to_padded_array(convert_to_embeddings(train_sentences, word_embeddings), input_shape)
val_data = _to_padded_array(convert_to_embeddings(val_sentences, word_embeddings), input_shape)
test_data = _to_padded_array(convert_to_embeddings(test_sentences, word_embeddings), input_shape)

# Convert labels into binary format (1 = '<separated>', 0 = anything else)
train_labels = np.asarray([1 if '<separated>' in sentence else 0 for sentence in train_sentences], dtype=np.float32)
val_labels = np.asarray([1 if '<separated>' in sentence else 0 for sentence in val_sentences], dtype=np.float32)
test_labels = np.asarray([1 if '<separated>' in sentence else 0 for sentence in test_sentences], dtype=np.float32)

# Define the RNN model with dropout for regularization
model = Sequential([
    SimpleRNN(50, input_shape=input_shape),
    Dropout(0.5),  # Dropout layer with 50% dropout rate
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model on the training data and validate on the validation data
model.fit(train_data, train_labels, validation_data=(val_data, val_labels), epochs=10)

# Evaluate the model on the test data
test_loss, test_accuracy = model.evaluate(test_data, test_labels)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")