In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Masking

# 1. Load dataset: the 'Word' column holds the input strings and 'Accent index'
#    holds the integer class label for each word.
df = pd.read_csv('/mnt/data/unaccented_data.csv')
words = df['Word'].astype(str).tolist()
accents = df['Accent index'].astype(int).tolist()

# 2. Build character vocabulary
chars = sorted({c for w in words for c in w})
char2idx = {c: i+1 for i, c in enumerate(chars)}  # reserve 0 for padding
vocab_size = len(char2idx) + 1

# 3. Encode words as sequences of char indices, zero-padded at the end to max_len
encoded = [[char2idx[c] for c in w] for w in words]
max_len = max(len(seq) for seq in encoded)
padded = pad_sequences(encoded, maxlen=max_len, padding='post')

# 4. Prepare targets: one-hot over possible positions.
#    The labels are used directly as class ids. If positions are 1-based with 0
#    meaning "no accent" (TODO confirm against the dataset), a label may be as
#    large as max_len, so max_len + 1 classes are required; with only max_len
#    classes such a label would be out of range for to_categorical.
num_classes = max_len + 1
targets = to_categorical(accents, num_classes=num_classes)

# 5. Split into train and test (fixed random_state keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    padded, targets, test_size=0.2, random_state=42
)

# 6. Build model: embedding (index 0 masked as padding) -> Bi-LSTM summary vector
#    -> softmax over the num_classes possible labels.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=max_len, mask_zero=True),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dense(num_classes, activation='softmax')
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()

# 7. Train (10% of the training portion is held out by Keras for validation)
history = model.fit(
    X_train, y_train,
    validation_split=0.1,
    epochs=10,
    batch_size=32
)
ehistory = history  # backward-compatible alias for the original (misspelled) name

# 8. Evaluate
loss, acc = model.evaluate(X_test, y_test)
print(f"Test loss: {loss:.4f}, Test accuracy: {acc:.4f}")


In [3]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from sklearn.model_selection import train_test_split

# 1. Read the CSV and extract the word strings and their integer accent labels
df = pd.read_csv('unaccented_data.csv')
words = df['Word'].astype(str).tolist()
accents = df['Accent index'].astype(int).tolist()

# 2. Character vocabulary: every distinct character, in sorted order, numbered
#    from 1 so that index 0 stays free for padding
chars = sorted(set(''.join(words)))
char2idx = dict(zip(chars, range(1, len(chars) + 1)))
vocab_size = len(chars) + 1
max_len = len(max(words, key=len))

# 3. One LongTensor of character indices per word, plus one LongTensor of labels
encoded_seqs = [
    torch.tensor([char2idx[ch] for ch in word], dtype=torch.long)
    for word in words
]
labels = torch.tensor(accents, dtype=torch.long)

# Dataset and DataLoader
def collate_fn(batch):
    """Turn a list of (sequence, label) pairs into batch tensors.

    Returns a 3-tuple: the sequences stacked batch-first and padded with 0 up
    to the longest one in the batch, a long tensor with each sequence's
    original length, and a long tensor with the labels.
    """
    sequences, targets = zip(*batch)
    seq_lengths = torch.tensor(list(map(len, sequences)), dtype=torch.long)
    batch_matrix = pad_sequence(sequences, batch_first=True, padding_value=0)
    label_tensor = torch.tensor(targets, dtype=torch.long)
    return batch_matrix, seq_lengths, label_tensor

class AccentDataset(Dataset):
    """Dataset pairing each encoded word with its accent label.

    Args:
        sequences: indexable collection of per-word index sequences.
        labels: indexable collection of labels, aligned with ``sequences``.
    """

    def __init__(self, sequences, labels):
        self.sequences, self.labels = sequences, labels

    def __len__(self):
        # The number of sequences defines the dataset size.
        n_items = len(self.sequences)
        return n_items

    def __getitem__(self, idx):
        sample = self.sequences[idx]
        target = self.labels[idx]
        return sample, target

# 4. Split data: hold out 20% of the examples for testing
indices = list(range(len(encoded_seqs)))
train_idx, test_idx = train_test_split(indices, random_state=42, test_size=0.2)

# Gather the sequences of each split one by one; `labels` is a tensor, so it is
# indexed with the whole list of positions at once.
train_dataset = AccentDataset(list(map(encoded_seqs.__getitem__, train_idx)), labels[train_idx])
test_dataset = AccentDataset(list(map(encoded_seqs.__getitem__, test_idx)), labels[test_idx])

# Only the training loader shuffles its examples.
train_loader = DataLoader(train_dataset, collate_fn=collate_fn, shuffle=True, batch_size=32)
test_loader = DataLoader(test_dataset, collate_fn=collate_fn, shuffle=False, batch_size=32)

# 5. Define the Bi-LSTM model
class AccentModel(nn.Module):
    """Bidirectional LSTM classifier over padded character-index sequences.

    Args:
        vocab_size: number of rows in the embedding table (index 0 is padding).
        embed_dim: size of each character embedding.
        hidden_dim: hidden size of the LSTM in each direction.
        output_dim: number of output logits.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embed_dim, padding_idx=0
        )
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        # Both directions are concatenated, hence twice the hidden size.
        self.fc = nn.Linear(in_features=hidden_dim * 2, out_features=output_dim)

    def forward(self, x, lengths):
        """Return one row of ``output_dim`` logits per sequence in ``x``.

        ``x`` is a batch-first tensor of character indices and ``lengths``
        holds the unpadded length of each row.
        """
        # Packing lets the LSTM skip padded positions; lengths must be on CPU.
        packed = pack_padded_sequence(
            self.embedding(x), lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)
        # The last two entries of h_n are the final forward and backward states.
        summary = torch.cat([h_n[-2], h_n[-1]], dim=1)
        return self.fc(summary)

# 6. Initialize model, loss, optimizer
# Seed the global torch RNG so weight initialisation and DataLoader shuffling
# are repeatable from one run of this cell to the next.
torch.manual_seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# The accent labels are used directly as class ids. Reserving max_len + 1 logits
# keeps a label equal to max_len (possible if positions are 1-based and 0 means
# "no accent" -- TODO confirm against the dataset) inside the valid target range
# of CrossEntropyLoss.
model = AccentModel(vocab_size, embed_dim=64, hidden_dim=64, output_dim=max_len + 1).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# 7. Training loop
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for seqs, lengths, labs in train_loader:
        seqs, lengths, labs = seqs.to(device), lengths.to(device), labs.to(device)
        optimizer.zero_grad()
        outputs = model(seqs, lengths)
        loss = criterion(outputs, labs)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean of the per-batch mean losses for this epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

# 8. Evaluation: exact-match accuracy of the arg-max class on the test split
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for seqs, lengths, labs in test_loader:
        seqs, lengths, labs = seqs.to(device), lengths.to(device), labs.to(device)
        outputs = model(seqs, lengths)
        preds = outputs.argmax(dim=1)
        correct += (preds == labs).sum().item()
        total += labs.size(0)
accuracy = correct / total
print(f"Test Accuracy: {accuracy:.4f}")


Epoch 1/50, Loss: 2.0787
Epoch 2/50, Loss: 1.4942
Epoch 3/50, Loss: 1.2251
Epoch 4/50, Loss: 1.0954
Epoch 5/50, Loss: 0.9644
Epoch 6/50, Loss: 0.8600
Epoch 7/50, Loss: 0.7589
Epoch 8/50, Loss: 0.6832
Epoch 9/50, Loss: 0.5884
Epoch 10/50, Loss: 0.5387
Epoch 11/50, Loss: 0.4690
Epoch 12/50, Loss: 0.4276
Epoch 13/50, Loss: 0.3789
Epoch 14/50, Loss: 0.3478
Epoch 15/50, Loss: 0.3169
Epoch 16/50, Loss: 0.2818
Epoch 17/50, Loss: 0.2645
Epoch 18/50, Loss: 0.2297
Epoch 19/50, Loss: 0.2008
Epoch 20/50, Loss: 0.1812
Epoch 21/50, Loss: 0.1706
Epoch 22/50, Loss: 0.1456
Epoch 23/50, Loss: 0.1356
Epoch 24/50, Loss: 0.1257
Epoch 25/50, Loss: 0.1188
Epoch 26/50, Loss: 0.0989
Epoch 27/50, Loss: 0.0742
Epoch 28/50, Loss: 0.0751
Epoch 29/50, Loss: 0.0733
Epoch 30/50, Loss: 0.0582
Epoch 31/50, Loss: 0.0477
Epoch 32/50, Loss: 0.0432
Epoch 33/50, Loss: 0.0366
Epoch 34/50, Loss: 0.0299
Epoch 35/50, Loss: 0.0256
Epoch 36/50, Loss: 0.0302
Epoch 37/50, Loss: 0.0283
Epoch 38/50, Loss: 0.0240
Epoch 39/50, Loss: 0.

In [14]:
# Predict the accent class for one hand-picked word with the trained model.
with torch.no_grad():
    sample_word = "njonjast"
    # Characters missing from the vocabulary fall back to index 0 (padding).
    sample_seq = torch.tensor([char2idx.get(c, 0) for c in sample_word], dtype=torch.long).unsqueeze(0).to(device)
    # After unsqueeze(0) the tensor has shape (1, num_chars), so len(sample_seq)
    # would be the batch size (1) and the packed LSTM would only read the first
    # character. The sequence length is the size of dimension 1.
    sample_length = torch.tensor([sample_seq.size(1)], dtype=torch.long).to(device)
    model.eval()
    output = model(sample_seq, sample_length)
    predicted_accent = output.argmax(dim=1).item()
    print(f"Predicted accent index for '{sample_word}': {predicted_accent}")

Predicted accent index for 'njonjast': 2


In [None]:
# Predict the accent class for one hand-picked word with the trained model.
with torch.no_grad():
    sample_word = "njonjast"
    # Characters missing from the vocabulary fall back to index 0 (padding).
    sample_seq = torch.tensor([char2idx.get(c, 0) for c in sample_word], dtype=torch.long).unsqueeze(0).to(device)
    # sample_seq is (1, num_chars) after unsqueeze(0); len() of it is the batch
    # size, not the word length, so read the length from dimension 1 instead.
    sample_length = torch.tensor([sample_seq.size(1)], dtype=torch.long).to(device)
    model.eval()
    output = model(sample_seq, sample_length)
    predicted_accent = output.argmax(dim=1).item()
    print(f"Predicted accent index for '{sample_word}': {predicted_accent}")

Predicted accent index for 'njonjast': 2


In [9]:

# 4. Create train/val/test splits: 20% of all examples go to the test set,
#    then 10% of the remaining training examples are set aside for validation
indices = [*range(len(encoded_seqs))]
train_idx, test_idx = train_test_split(indices, random_state=42, test_size=0.2)
train_idx, val_idx = train_test_split(train_idx, random_state=42, test_size=0.1)

# Collate function for padding
def collate_fn(batch):
    """Assemble (sequence, label) pairs into padded batch tensors.

    The result is ``(padded, lengths, labels)``: a batch-first tensor in which
    shorter sequences are filled with 0, the unpadded length of every
    sequence, and the labels, the last two as long tensors.
    """
    items, targets = zip(*batch)
    lengths = torch.tensor([item.shape[0] if hasattr(item, "shape") else len(item) for item in items],
                           dtype=torch.long)
    padded = pad_sequence(items, batch_first=True, padding_value=0)
    return padded, lengths, torch.tensor(targets, dtype=torch.long)

class AccentDataset(Dataset):
    """Map-style dataset yielding ``(sequence, label)`` for a given position.

    ``sequences`` and ``labels`` are stored as given and are expected to be
    indexable in parallel.
    """

    def __init__(self, sequences, labels):
        self.sequences, self.labels = sequences, labels

    def __len__(self):
        # Size is taken from the sequences container.
        count = len(self.sequences)
        return count

    def __getitem__(self, idx):
        item = (self.sequences[idx], self.labels[idx])
        return item

# Prepare datasets and loaders
# Sequences are picked one position at a time; `labels` is indexed with the
# whole list of positions in a single step.
train_dataset = AccentDataset(list(map(encoded_seqs.__getitem__, train_idx)), labels[train_idx])
val_dataset = AccentDataset(list(map(encoded_seqs.__getitem__, val_idx)), labels[val_idx])
test_dataset = AccentDataset(list(map(encoded_seqs.__getitem__, test_idx)), labels[test_idx])

# Shuffle only the training data; validation and test keep their order.
train_loader = DataLoader(train_dataset, collate_fn=collate_fn, shuffle=True, batch_size=32)
val_loader = DataLoader(val_dataset, collate_fn=collate_fn, shuffle=False, batch_size=32)
test_loader = DataLoader(test_dataset, collate_fn=collate_fn, shuffle=False, batch_size=32)

# 5. Define Bi-LSTM model with dropout
class AccentModel(nn.Module):
    """Bidirectional LSTM classifier with dropout on the final hidden state.

    Args:
        vocab_size: number of rows in the embedding table (index 0 is padding).
        embed_dim: size of each character embedding.
        hidden_dim: hidden size of the LSTM in each direction.
        output_dim: number of output logits.
        dropout_p: dropout probability applied to the concatenated final hidden
            state, and between LSTM layers when ``num_layers > 1``.
        num_layers: number of stacked LSTM layers (default 1).
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, dropout_p=0.3, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        # nn.LSTM applies its `dropout` only between stacked layers; passing a
        # non-zero value with a single layer does nothing and raises a
        # UserWarning, so it is only forwarded when there is more than one layer.
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout_p if num_layers > 1 else 0.0,
        )
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x, lengths):
        """Return one row of ``output_dim`` logits per sequence in ``x``.

        ``x`` is a batch-first tensor of character indices and ``lengths``
        holds the unpadded length of each row.
        """
        embedded = self.embedding(x)
        # Packing lets the LSTM skip padded positions; lengths must be on CPU.
        packed = pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, (h_n, _) = self.lstm(packed)
        # h_n[-2] / h_n[-1] are the last layer's final forward / backward states.
        h_forward = h_n[-2]
        h_backward = h_n[-1]
        h_final = torch.cat((h_forward, h_backward), dim=1)
        out = self.dropout(h_final)
        return self.fc(out)

# 6. Initialize model, loss, optimizer (with weight decay)
# Seed the global torch RNG so weight initialisation, dropout and DataLoader
# shuffling are repeatable from one run of this cell to the next.
torch.manual_seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# The accent labels are used directly as class ids. Reserving max_len + 1 logits
# keeps a label equal to max_len (possible if positions are 1-based and 0 means
# "no accent" -- TODO confirm against the dataset) inside the valid target range
# of CrossEntropyLoss.
model = AccentModel(vocab_size, embed_dim=64, hidden_dim=64, output_dim=max_len + 1, dropout_p=0.3).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

# 7. Training loop with early stopping
num_epochs = 30
patience = 5                      # epochs without improvement tolerated before stopping
best_val_loss = float('inf')
epochs_no_improve = 0
best_state = None                 # snapshot of the weights with the lowest validation loss

for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0.0
    for seqs, lengths, labs in train_loader:
        seqs, lengths, labs = seqs.to(device), lengths.to(device), labs.to(device)
        optimizer.zero_grad()
        outputs = model(seqs, lengths)
        loss = criterion(outputs, labs)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    avg_train = train_loss / len(train_loader)

    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for seqs, lengths, labs in val_loader:
            seqs, lengths, labs = seqs.to(device), lengths.to(device), labs.to(device)
            outputs = model(seqs, lengths)
            val_loss += criterion(outputs, labs).item()
    avg_val = val_loss / len(val_loader)

    print(f"Epoch {epoch+1}: Train Loss={avg_train:.4f}, Val Loss={avg_val:.4f}")

    # Check early stopping
    if avg_val < best_val_loss:
        best_val_loss = avg_val
        epochs_no_improve = 0
        # state_dict() hands back references to the live parameter tensors, which
        # later optimizer steps overwrite in place. Clone them so the snapshot
        # really preserves the weights of this (best so far) epoch.
        best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print("Early stopping triggered")
            break

# Load best model
if best_state is not None:
    model.load_state_dict(best_state)

# 8. Final evaluation on test set
model.eval()
correct = total = 0
with torch.no_grad():
    for seqs, lengths, labs in test_loader:
        seqs, lengths, labs = seqs.to(device), lengths.to(device), labs.to(device)
        preds = model(seqs, lengths).argmax(1)
        correct += (preds == labs).sum().item()
        total += labs.size(0)
print(f"Test Accuracy: {correct/total:.4f}")




Epoch 1: Train Loss=2.1074, Val Loss=1.6697
Epoch 2: Train Loss=1.5474, Val Loss=1.3079
Epoch 3: Train Loss=1.3069, Val Loss=1.1576
Epoch 4: Train Loss=1.1684, Val Loss=1.0630
Epoch 5: Train Loss=1.0821, Val Loss=0.9966
Epoch 6: Train Loss=1.0016, Val Loss=0.9700
Epoch 7: Train Loss=0.9168, Val Loss=0.8995
Epoch 8: Train Loss=0.8295, Val Loss=0.8660
Epoch 9: Train Loss=0.7728, Val Loss=0.8116
Epoch 10: Train Loss=0.6988, Val Loss=0.7964
Epoch 11: Train Loss=0.6619, Val Loss=0.7673
Epoch 12: Train Loss=0.6130, Val Loss=0.7158
Epoch 13: Train Loss=0.5416, Val Loss=0.7204
Epoch 14: Train Loss=0.5014, Val Loss=0.6905
Epoch 15: Train Loss=0.4917, Val Loss=0.7142
Epoch 16: Train Loss=0.4437, Val Loss=0.6354
Epoch 17: Train Loss=0.4088, Val Loss=0.6291
Epoch 18: Train Loss=0.3756, Val Loss=0.6307
Epoch 19: Train Loss=0.3531, Val Loss=0.6479
Epoch 20: Train Loss=0.3292, Val Loss=0.6144
Epoch 21: Train Loss=0.2742, Val Loss=0.6492
Epoch 22: Train Loss=0.2854, Val Loss=0.6087
Epoch 23: Train Los

In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from sklearn.model_selection import train_test_split


In [5]:
# 1. Read the CSV and extract the word strings and their integer accent labels
df = pd.read_csv('unaccented_data.csv')
words = df['Word'].astype(str).tolist()
raw_accents = df['Accent index'].astype(int).tolist()  # 0 = no accent, 1..n = accent position (1-based)

# 2. Character vocabulary: every distinct character, in sorted order, numbered
#    from 1 so that index 0 stays free for padding
chars = sorted(set(''.join(words)))
char2idx = dict(zip(chars, range(1, len(chars) + 1)))
vocab_size = len(chars) + 1
max_len = len(max(words, key=len))

# 3. One LongTensor of character indices per word
encoded_seqs = [
    torch.tensor([char2idx[ch] for ch in word], dtype=torch.long)
    for word in words
]

# 4. Labels are the raw accent indices, unchanged (0 stays 'no accent')
labels = torch.tensor(raw_accents, dtype=torch.long)

# 5. Train/val/test splits: 20% of all examples for testing, then 10% of the
#    remaining training examples for validation
indices = [*range(len(encoded_seqs))]
train_idx, test_idx = train_test_split(indices, random_state=42, test_size=0.2)
train_idx, val_idx = train_test_split(train_idx, random_state=42, test_size=0.1)

# Collate function
def collate_fn(batch):
    """Batch a list of (sequence, label) pairs for the DataLoader.

    Produces, in order: the sequences as one batch-first tensor padded with 0
    to a common length, a long tensor of the original sequence lengths, and a
    long tensor of the labels.
    """
    sequences, targets = zip(*batch)
    n_tokens = [len(sequence) for sequence in sequences]
    return (
        pad_sequence(sequences, batch_first=True, padding_value=0),
        torch.tensor(n_tokens, dtype=torch.long),
        torch.tensor(targets, dtype=torch.long),
    )

class AccentDataset(Dataset):
    """Dataset that serves encoded words together with their accent labels.

    Both constructor arguments are kept as attributes and indexed with the
    same position when an item is requested.
    """

    def __init__(self, sequences, labels):
        self.sequences, self.labels = sequences, labels

    def __len__(self):
        # One item per stored sequence.
        size = len(self.sequences)
        return size

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        label = self.labels[idx]
        return sequence, label

# DataLoaders
# Sequences are picked one position at a time; `labels` is a tensor and is
# indexed with the whole list of positions in a single step.
train_ds = AccentDataset(list(map(encoded_seqs.__getitem__, train_idx)), labels[train_idx])
val_ds = AccentDataset(list(map(encoded_seqs.__getitem__, val_idx)), labels[val_idx])
test_ds = AccentDataset(list(map(encoded_seqs.__getitem__, test_idx)), labels[test_idx])

# Shuffle only the training data; validation and test keep their order.
train_loader = DataLoader(train_ds, collate_fn=collate_fn, shuffle=True, batch_size=32)
val_loader = DataLoader(val_ds, collate_fn=collate_fn, shuffle=False, batch_size=32)
test_loader = DataLoader(test_ds, collate_fn=collate_fn, shuffle=False, batch_size=32)

# 6. Define Bi-LSTM model
class AccentModel(nn.Module):
    """Bidirectional LSTM classifier with dropout on the final hidden state.

    Args:
        vocab_size: number of rows in the embedding table (index 0 is padding).
        embed_dim: size of each character embedding.
        hidden_dim: hidden size of the LSTM in each direction.
        output_dim: number of output logits.
        dropout_p: dropout probability applied to the concatenated final hidden
            state, and between LSTM layers when ``num_layers > 1``.
        num_layers: number of stacked LSTM layers (default 1).
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, dropout_p=0.3, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        # nn.LSTM applies its `dropout` only between stacked layers; with a
        # single layer a non-zero value has no effect and triggers a UserWarning,
        # so it is only forwarded when there is more than one layer.
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout_p if num_layers > 1 else 0.0,
        )
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x, lengths):
        """Return one row of ``output_dim`` logits per sequence in ``x``.

        ``x`` is a batch-first tensor of character indices and ``lengths``
        holds the unpadded length of each row.
        """
        embedded = self.embedding(x)
        # Packing lets the LSTM skip padded positions; lengths must be on CPU.
        packed = pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, (h_n, _) = self.lstm(packed)
        # h_n[-2] / h_n[-1] are the last layer's final forward / backward states.
        h_final = torch.cat((h_n[-2], h_n[-1]), dim=1)
        out = self.dropout(h_final)
        return self.fc(out)

# 7. Initialize model, loss, optimizer
#    output_dim = max_len + 1 to include class 0
# Seed the global torch RNG so weight initialisation, dropout and DataLoader
# shuffling are repeatable from one run of this cell to the next.
torch.manual_seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AccentModel(vocab_size, embed_dim=64, hidden_dim=64, output_dim=max_len+1, dropout_p=0.3).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

# 8. Training with early stopping
num_epochs = 50
patience = 5                      # epochs without improvement tolerated before stopping
best_val_loss = float('inf')
epochs_no_improve = 0
best_state = None                 # snapshot of the weights with the lowest validation loss

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for seqs, lengths, labs in train_loader:
        seqs, lengths, labs = seqs.to(device), lengths.to(device), labs.to(device)
        optimizer.zero_grad()
        outputs = model(seqs, lengths)
        loss = criterion(outputs, labs)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    avg_train = train_loss / len(train_loader)

    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for seqs, lengths, labs in val_loader:
            seqs, lengths, labs = seqs.to(device), lengths.to(device), labs.to(device)
            outputs = model(seqs, lengths)
            val_loss += criterion(outputs, labs).item()
    avg_val = val_loss / len(val_loader)

    print(f"Epoch {epoch+1}: Train Loss={avg_train:.4f}, Val Loss={avg_val:.4f}")
    if avg_val < best_val_loss:
        best_val_loss = avg_val
        epochs_no_improve = 0
        # state_dict() hands back references to the live parameter tensors, which
        # later optimizer steps overwrite in place. Clone them so the snapshot
        # really preserves the weights of this (best so far) epoch.
        best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print("Early stopping triggered")
            break

# Restore the best-validation weights before measuring test accuracy.
if best_state is not None:
    model.load_state_dict(best_state)

# 9. Test evaluation
model.eval()
correct = total = 0
with torch.no_grad():
    for seqs, lengths, labs in test_loader:
        seqs, lengths, labs = seqs.to(device), lengths.to(device), labs.to(device)
        preds = model(seqs, lengths).argmax(1)
        correct += (preds == labs).sum().item()
        total += labs.size(0)
print(f"Test Accuracy: {correct/total:.4f}")




Epoch 1: Train Loss=2.1457, Val Loss=1.6937
Epoch 2: Train Loss=1.5393, Val Loss=1.2807
Epoch 3: Train Loss=1.2609, Val Loss=1.1175
Epoch 4: Train Loss=1.1438, Val Loss=1.0677
Epoch 5: Train Loss=1.0571, Val Loss=0.9915
Epoch 6: Train Loss=0.9705, Val Loss=0.9583
Epoch 7: Train Loss=0.8891, Val Loss=0.9051
Epoch 8: Train Loss=0.8128, Val Loss=0.8390
Epoch 9: Train Loss=0.7283, Val Loss=0.7952
Epoch 10: Train Loss=0.6734, Val Loss=0.7530
Epoch 11: Train Loss=0.6063, Val Loss=0.7145
Epoch 12: Train Loss=0.5532, Val Loss=0.6821
Epoch 13: Train Loss=0.5226, Val Loss=0.6994
Epoch 14: Train Loss=0.4698, Val Loss=0.6456
Epoch 15: Train Loss=0.4321, Val Loss=0.6892
Epoch 16: Train Loss=0.4127, Val Loss=0.6221
Epoch 17: Train Loss=0.3701, Val Loss=0.6244
Epoch 18: Train Loss=0.3651, Val Loss=0.6309
Epoch 19: Train Loss=0.3188, Val Loss=0.6048
Epoch 20: Train Loss=0.3077, Val Loss=0.6287
Epoch 21: Train Loss=0.2894, Val Loss=0.6280
Epoch 22: Train Loss=0.2546, Val Loss=0.6028
Epoch 23: Train Los

In [4]:
# Write the current model's parameters (its state_dict, not the whole module
# object) to 'model.pt' in the working directory.
torch.save(model.state_dict(), 'model.pt')

In [11]:
# 10. Prediction function for user input
def predict_accent(word: str) -> int:
    """
    Predict accent position for a single Croatian word.

    Args:
        word: the word to classify. Characters that are missing from
            ``char2idx`` are encoded as index 0 (the padding index).

    Returns:
        0 if no accent, or 1-based index of accented character. An empty
        ``word`` yields 0 without running the model. The value is the model's
        arg-max class and is not clamped to ``len(word)``.
    """
    if not word:
        # A zero-length sequence cannot be packed for the LSTM; treat the
        # empty word as having no accent instead of raising.
        return 0
    model.eval()
    seq = torch.tensor([char2idx.get(c, 0) for c in word], dtype=torch.long)
    # A single sequence needs no padding: just add the batch dimension.
    batch = seq.unsqueeze(0).to(device)
    length = torch.tensor([seq.numel()], dtype=torch.long).to(device)
    with torch.no_grad():
        logits = model(batch, length)
    return logits.argmax(dim=1).item()

# Example usage: run the predictor on one word and print the integer it returns.
print(predict_accent("mnogokut"))  # prints predicted index


5


In [12]:
def predict_accent(word):
    """
    Predict accent position for a single Croatian word.

    Args:
        word: the word to classify. Characters that are missing from
            ``char2idx`` are encoded as index 0 (the padding index).

    Returns:
        A tuple (position, character), where position=0 means no accent and
        character=None. An empty ``word`` yields ``(0, None)`` without running
        the model. If the predicted position lies beyond the end of ``word``,
        the position is returned as predicted and character is None.
    """
    if not word:
        # A zero-length sequence cannot be packed for the LSTM; treat the
        # empty word as having no accent instead of raising.
        return 0, None
    model.eval()
    seq = torch.tensor([char2idx.get(c, 0) for c in word], dtype=torch.long)
    # A single sequence needs no padding: just add the batch dimension.
    batch = seq.unsqueeze(0).to(device)
    length = torch.tensor([seq.numel()], dtype=torch.long).to(device)
    with torch.no_grad():
        logits = model(batch, length)
        pred = logits.argmax(dim=1).item()
    if pred == 0:
        return 0, None
    # pred is a 1-based index into word; guard against positions past its end.
    accented_char = word[pred - 1] if pred <= len(word) else None
    return pred, accented_char

# Example usage: unpack the (position, character) pair returned for one word
# and print both parts.
pos, char = predict_accent("riječ")
print(f"Accent at pos {pos}, character '{char}'")

Accent at pos 2, character 'i'


In [15]:
# Same check on a second word: unpack the (position, character) pair returned
# by predict_accent and print both parts.
pos, char = predict_accent("jačina")
print(f"Accent at pos {pos}, character '{char}'")

Accent at pos 4, character 'i'
