# 📝 Assignment – Persian NLP Toolkit Playground

Welcome!  
In this notebook you’ll experiment with **Hazm**, **Stanza‑fa**, **Parsivar**, **BERT‑fa**, and **GPT2‑fa**.  
Follow the steps below and **replace every `.......` with your own code / answer**.

In [None]:
!pip install parsivar hazm stanza

In [1]:
from logging import fatal
# ✨ Normalizer quick‑compare — Hazm, Parsivar, Stanza‑fa
text = "مي‌روم   به تهران 123٤٥۶!"

# 1) Hazm ------------------------------------------------------------
from hazm import Normalizer as HazmNormalizer
hazm_norm = HazmNormalizer(remove_diacritics=True)
clean_hazm = hazm_norm.normalize(text)
print("Hazm →", clean_hazm)

# 2) Parsivar --------------------------------------------------------
from parsivar import Normalizer as ParsivarNormalizer
parsivar_norm = ParsivarNormalizer(statistical_space_correction=True)
clean_parsivar = parsivar_norm.normalize(text)
print("Parsivar →", clean_parsivar)

# 3) Stanza‑fa (normalisation occurs inside the tokenizer) ----------
import stanza
stz = stanza.Pipeline("fa", processors="tokenize", use_gpu=False, verbose=False)
doc = stz(text)
stanza_tokens = [w.text for s in doc.sentences for w in s.words]
print("Stanza tokens →", stanza_tokens)
print("Stanza (normalised join) →", " ".join(stanza_tokens))


Hazm → می‌روم به تهران ۱۲۳۴۵۶!
Parsivar → می‌روم به تهران 1234٥6 ! 
Stanza tokens → ['مي\u200cروم', 'به', 'تهران', '123٤٥۶', '!']
Stanza (normalised join) → مي‌روم به تهران 123٤٥۶ !


In [2]:
# 🔹 Quick word‑tokenizer showcase — Hazm, Parsivar, Stanza‑fa
sample = "کتاب‌هایمان را به او می‌دهند."

# 1) Hazm ------------------------------------------------------------
from hazm import word_tokenize
print("Hazm tokens:", word_tokenize(sample))

# 2) Parsivar --------------------------------------------------------
from parsivar import Tokenizer
p_tokenizer = Tokenizer()
print("Parsivar tokens:", p_tokenizer.tokenize_words(sample))

# 3) Stanza‑fa -------------------------------------------------------
import stanza
nlp = stanza.Pipeline("fa", processors="tokenize", use_gpu=False, verbose=False)
doc = nlp(sample)
stanza_tokens = [w.text for s in doc.sentences for w in s.words]
print("Stanza tokens:", stanza_tokens)


Hazm tokens: ['کتاب\u200cهایمان', 'را', 'به', 'او', 'می\u200cدهند', '.']
Parsivar tokens: ['کتاب\u200cهایمان', 'را', 'به', 'او', 'می\u200cدهند.']
Stanza tokens: ['کتاب\u200cهای', 'مان', 'را', 'به', 'او', 'می\u200cدهند', '.']


In [17]:
#Part‑of‑Speech (POS) Tagging
sample = "کتاب‌هایمان را به او می‌دهند."

# -------- Hazm POS --------
!wget https://github.com/roshan-research/hazm/releases/download/v0.5/resources-0.5.zip
!unzip resources-0.5.zip


from hazm import Normalizer, word_tokenize, POSTagger
normalizer = Normalizer()
tokens_hazm = word_tokenize(normalizer.normalize(sample))
pos_hazm = POSTagger(model='/content/postagger.model').tag(tokens_hazm)
print("Hazm POS:", pos_hazm)

# -------- Parsivar POS --------
from parsivar import Normalizer as PVNorm, Tokenizer, FindPOS
pv_norm   = PVNorm()
pv_tokens = Tokenizer().tokenize_words(pv_norm.normalize(sample))
pos_parsi = FindPOS().parse_tokens(" ".join(pv_tokens))
print("Parsivar POS:", pos_parsi)

# -------- Stanza‑fa POS --------
import stanza
stz = stanza.Pipeline("fa", processors="tokenize,pos", use_gpu=False, verbose=False)
doc = stz(sample)
pos_stanza = [(w.text, w.upos) for s in doc.sentences for w in s.words]
print("Stanza POS:", pos_stanza)


Stanza POS: [('کتاب\u200cهای', 'NOUN'), ('مان', 'PRON'), ('را', 'ADP'), ('به', 'ADP'), ('او', 'PRON'), ('می\u200cدهند', 'VERB'), ('.', 'PUNCT')]


In [9]:
#Stemming / Lemmatization
sample = "کتاب‌هایمان را به او می‌بخشند و دوباره پس می‌گیرند!"

# -------- Hazm stem / lemma --------
from hazm import Stemmer, Lemmatizer, word_tokenize
stemmer    = Stemmer()
lemmatizer = Lemmatizer()
print("Hazm stem:",    [stemmer.stem(t)    for t in word_tokenize(sample)])
print("Hazm lemma:",   [lemmatizer.lemmatize(t) for t in word_tokenize(sample)])

# -------- Parsivar stem / lemma --------

from parsivar import FindStems, Tokenizer
my_stemmer = FindStems()
tokenizer = Tokenizer()
tokens = tokenizer.tokenize_words(sample)
stems_parsi = [my_stemmer.convert_to_stem(word) for word in tokens]
print("Parsivar stem:", stems_parsi)

# Note: Parsivar doesn't have a direct lemmatizer in current version
# As an alternative, we can use Stemmer for both stem and lemma
print("Parsivar 'lemma' (using stem):", stems_parsi)

# -------- Stanza‑fa lemma --------
import stanza
stz = stanza.Pipeline("fa", processors="tokenize,pos,lemma", use_gpu=False, verbose=False)
doc = stz(sample)
lemmas_stanza = [w.lemma for s in doc.sentences for w in s.words]
print("Stanza lemma:", lemmas_stanza)



Hazm stem: ['کتاب', 'را', 'به', 'او', 'می\u200cبخشند', 'و', 'دوباره', 'پس', 'می\u200cگیرند', '!']
Hazm lemma: ['کتاب', 'را', 'به', 'او', 'بخشید#بخش', 'و', 'دوباره', 'پس', 'گرفت#گیر', '!']
Parsivar stem: ['کتاب', 'را', 'به', 'او', 'بخشید&بخش', 'و', 'دوباره', 'پس', 'می\u200cگیرند!']
Parsivar 'lemma' (using stem): ['کتاب', 'را', 'به', 'او', 'بخشید&بخش', 'و', 'دوباره', 'پس', 'می\u200cگیرند!']
Stanza lemma: ['کتاب', 'ما', 'را', 'به', 'او', 'بخشید', 'و', 'دوباره', 'پس', 'گرفت', '!']
