In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, TimeDistributed
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [None]:
# Read the training sample and the test set into DataFrames.
# Both paths are relative, so they resolve against the current working directory.
train_data = pd.read_csv(filepath_or_buffer="first_10k_rows.csv")
test_data = pd.read_csv(filepath_or_buffer="test.csv")

In [None]:
# Build the raw training inputs and their per-character targets.
sentences = train_data["Sentence"].values

# Each target is the sentence itself split into a list of characters, so the
# target sequence is identical to the input sequence.
# NOTE(review): nothing here extracts or strips diacritics -- confirm that an
# identity target is really what the model is meant to learn.
labels = list(map(list, sentences))

In [None]:
# Character-level tokenizer fitted on the training sentences.
# num_words=5000 limits the vocabulary size used when encoding text.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=5000,
    char_level=True,
)
tokenizer.fit_on_texts(sentences)

# Integer-encode every training sentence with the fitted vocabulary.
X_seq = tokenizer.texts_to_sequences(sentences)

In [None]:
# Reverse lookup (token id -> character), used later to decode model output.
index_to_char = {
    token_id: symbol
    for symbol, token_id in tokenizer.word_index.items()
    if token_id != 0
}

# Every sequence is forced to this fixed length; shorter ones are padded at
# the end ('post').
# NOTE(review): truncation side is left at the pad_sequences default, which
# per the Keras docs drops the *leading* tokens of over-long sentences --
# confirm that is intended.
max_sequence_length = 100
X_padded = tf.keras.preprocessing.sequence.pad_sequences(
    X_seq,
    maxlen=max_sequence_length,
    padding='post',
)

# One output class per vocabulary entry, plus one for the padding token.
num_classes = 1 + len(tokenizer.word_index)

# Encode and pad the targets exactly like the inputs, then one-hot encode them
# for use with categorical cross-entropy.
y_seq = tokenizer.texts_to_sequences(labels)
y_padded = tf.keras.preprocessing.sequence.pad_sequences(
    y_seq,
    maxlen=max_sequence_length,
    padding='post',
)
y_one_hot = tf.keras.utils.to_categorical(y_padded, num_classes=num_classes)

In [None]:
# Seed TensorFlow's global RNG so weight initialisation and batch shuffling
# are as repeatable as the backend allows on a fresh kernel. The same seed
# drives the train/validation split below.
SEED = 42
tf.random.set_seed(SEED)

# Split the dataset into train and validation sets (20% held out).
X_train, X_val, y_train, y_val = train_test_split(
    X_padded, y_one_hot, test_size=0.2, random_state=SEED
)

# Define the model: embedding -> bidirectional LSTM -> per-position softmax
# over the character vocabulary.
# NOTE: the embedding is not given mask_zero, so padded positions (id 0)
# contribute to the loss and to the reported accuracy. The decoding step later
# in this notebook drops predicted 0s as padding.
input_layer = Input(shape=(max_sequence_length,))
embedding_layer = Embedding(
    input_dim=len(tokenizer.word_index) + 1,  # vocabulary entries + padding id
    output_dim=50,  # Reduce embedding dimension
    input_length=max_sequence_length,
)(input_layer)
bi_lstm_layer = Bidirectional(LSTM(units=64, return_sequences=True))(embedding_layer)  # Reduce LSTM units
output_layer = TimeDistributed(Dense(num_classes, activation='softmax'))(bi_lstm_layer)
model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model; targets are one-hot, hence categorical cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model with reduced batch size. The returned History is kept so the
# per-epoch metrics can be inspected later instead of being echoed as cell output.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=5,
    batch_size=16,  # Reduce batch size
)

In [None]:
# Pull the raw sentences out of the test frame.
test_sentences = test_data["Sentence"].values

# Encode the test sentences with the tokenizer fitted on the training data,
# then pad them to the fixed length the model was built for.
X_test_seq = tokenizer.texts_to_sequences(test_sentences)
X_test_padded = tf.keras.preprocessing.sequence.pad_sequences(
    X_test_seq,
    padding='post',
    maxlen=max_sequence_length,
)

# Run the trained model on the padded test inputs; the result holds one class
# distribution per character position of every test sentence.
predictions = model.predict(X_test_padded)

In [None]:
# Decode the model output back into text: take the most likely class at every
# position, drop id 0 (treated as padding), and map the remaining ids to
# characters through index_to_char.
predicted_ids = np.argmax(predictions, axis=-1)
predicted_labels = [
    ''.join(index_to_char[token_id] for token_id in row if token_id != 0)
    for row in predicted_ids
]

# Attach the decoded strings to the test frame and save it without the index.
test_data["Predicted_Label"] = predicted_labels
test_data.to_csv("test_predictions.csv", index=False)