In [None]:
# Unit II: Language Syntax and Semantics (Advanced)
# Lab Assignment 3: Morphological Analysis with Finite State Transducers (FST) and Deep Learning
# - Implement a Finite State Transducer (FST) for morphological parsing (e.g., handling verb conjugations and noun declensions in an Indian language like Hindi or Sanskrit).
# - Train a sequence-to-sequence deep learning model (LSTM-based) to predict morphemes for unseen words.
# - Compare performance between FST and deep learning approaches.

In [None]:
# Hindi FST example (simplified): a suffix-stripping lookup table rather than
# a full transducer. Keys are word-final suffixes, values are the tag printed
# for that suffix.
rules = {
    'ता': 'ता',  # nominalizer (NOTE(review): in 'करता' this looks like the habitual participle marker — confirm label)
    'ना': 'ना',  # infinitive
    'ता हूँ': 'present tense (1st person)'
}

def fst_hindi(word):
    """Split ``word`` into a root and a suffix tag using the ``rules`` table.

    Args:
        word: Surface form to analyse (may contain spaces, e.g. an auxiliary).

    Returns:
        ``"<word> → <root> + <tag>"`` for the longest suffix in ``rules`` that
        ``word`` ends with, or ``"<word> → (no match)"`` when none applies.
    """
    # Try longer suffixes first so a short rule can never shadow a longer,
    # more specific one as the table grows.
    for suffix in sorted(rules, key=len, reverse=True):
        if word.endswith(suffix):
            # Remove only the trailing suffix. str.replace would also delete
            # earlier occurrences inside the root (e.g. 'नाना' -> '').
            root = word[:len(word) - len(suffix)]
            return f"{word} → {root} + {rules[suffix]}"
    return f"{word} → (no match)"

# Demo: parse one sample word per suffix rule, one result per output line
words = ['करता', 'सोना', 'पढ़ता हूँ']
print(*map(fst_hindi, words), sep="\n")


करता → कर + ता
सोना → सो + ना
पढ़ता हूँ → पढ़ + present tense (1st person)


In [None]:
# Install dependencies
!pip install -q tensorflow

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, TimeDistributed
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np

# Toy training set: each surface word paired with its morpheme split
data = [
    ("running", ["run", "ing"]),
    ("played", ["play", "ed"]),
    ("happier", ["happy", "er"]),
]

# Vocabularies are sorted so that index assignment is the same on every run
word_vocab = sorted({ch for word, _ in data for ch in word})
morph_vocab = sorted({morph for _, morphs in data for morph in morphs})

# Real symbols are numbered from 1; index 0 stays free (pad_sequences pads with 0 by default)
char2idx = {ch: idx for idx, ch in enumerate(word_vocab, start=1)}
morph2idx = {morph: idx for idx, morph in enumerate(morph_vocab, start=1)}

# Encode inputs and targets as fixed-length (10) integer sequences.
# pad_sequences is called once per array on the whole batch; with an explicit
# maxlen every row is padded/truncated to the same length either way.
X = pad_sequences(
    [[char2idx[ch] for ch in word] for word, _ in data],
    maxlen=10,
)

# Morpheme indices are padded to the same length as the inputs, then one-hot
# encoded; the +1 leaves room for the padding class 0.
y = to_categorical(
    pad_sequences(
        [[morph2idx[morph] for morph in morphs] for _, morphs in data],
        maxlen=10,
    ),
    num_classes=len(morph2idx) + 1,
)

# Model: a per-timestep classifier over the padded character sequence.

# Seed Python, NumPy and TensorFlow so weight initialisation and training are
# repeatable across Restart & Run All (as far as the backend allows).
tf.keras.utils.set_random_seed(42)

model = Sequential([
    # input_dim is the character vocabulary plus the padding index 0.
    # The deprecated `input_length` argument is omitted; the sequence length
    # is taken from X when the model is first called.
    Embedding(input_dim=len(char2idx) + 1, output_dim=32),
    # return_sequences=True keeps one output vector per timestep, which the
    # TimeDistributed head below requires.
    LSTM(64, return_sequences=True),
    # Softmax over the morpheme classes (plus padding class 0) at each timestep.
    TimeDistributed(Dense(len(morph2idx) + 1, activation='softmax'))
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# verbose=0 keeps 30 epochs of progress logs out of the notebook output.
model.fit(X, y, epochs=30, verbose=0)

# Predict
def predict_morphemes(word, decode=False):
    """Run the trained model on a single word.

    Args:
        word: Surface string to analyse. Characters missing from ``char2idx``
            are encoded as 0, the same index used for padding, and input
            longer than 10 characters is truncated by ``pad_sequences``.
        decode: When False (default) the raw per-timestep class scores are
            returned, exactly as before. When True the highest-scoring class
            at each timestep is mapped back to its morpheme string and
            padding predictions (class 0) are dropped.

    Returns:
        The ``model.predict`` output for the word (one row of class scores
        per timestep), or a list of morpheme strings when ``decode`` is True.
    """
    x = pad_sequences([[char2idx.get(c, 0) for c in word]], maxlen=10)
    # verbose=0 keeps the Keras progress bar out of the notebook output.
    pred = model.predict(x, verbose=0)[0]
    if not decode:
        return pred

    # Invert the label map; class 0 is padding and has no morpheme.
    idx2morph = {idx: morph for morph, idx in morph2idx.items()}
    best_classes = [int(cls) for cls in pred.argmax(axis=-1)]
    return [idx2morph[cls] for cls in best_classes if cls != 0]

# Show decoded morphemes rather than dumping the full score matrix.
print("Prediction for 'talking':", predict_morphemes("talking", decode=True))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 277ms/step
Prediction for 'talking': [[0.2270834  0.13824116 0.13789883 0.11410774 0.12512286 0.12723267
  0.13031334]
 [0.33757484 0.12883696 0.12633653 0.08621368 0.10266207 0.1064572
  0.11191867]
 [0.47465336 0.11179635 0.106578   0.06000805 0.07707695 0.08149742
  0.08838977]
 [0.6156067  0.0894434  0.08235191 0.03840302 0.05298768 0.05698854
  0.06421878]
 [0.6561712  0.08323544 0.07397082 0.03327045 0.04604111 0.04951971
  0.05779132]
 [0.6844377  0.07856555 0.06814083 0.03000706 0.0413129  0.044773
  0.05276303]
 [0.7850775  0.05685365 0.04898205 0.01777136 0.02653614 0.0292461
  0.03553324]
 [0.78457993 0.05728573 0.0485362  0.01825638 0.02656362 0.02830995
  0.03646814]
 [0.79154813 0.05533511 0.04702748 0.01753536 0.02585523 0.02706832
  0.03563026]
 [0.7782867  0.05884828 0.04861753 0.01932239 0.02822301 0.02918444
  0.03751771]]
