# Stanza

Stanza is a collection of accurate and efficient tools for the linguistic analysis of many human languages, including Persian.
Stanza can be used for tokenization, multi-word token expansion, part-of-speech tagging, lemmatization, dependency parsing, and named entity recognition. The cells below show how to use Stanza for the Persian language.

In [None]:
!pip install stanza

In [11]:
import stanza
stanza.download('fa')       # Download the Persian ('fa') models used by the neural pipeline
nlp = stanza.Pipeline('fa') # Build the default Persian pipeline; the log below lists the processors it loads

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: fa (Persian) ...
INFO:stanza:File exists: /root/stanza_resources/fa/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: fa (Persian):
| Processor | Package        |
------------------------------
| tokenize  | perdt          |
| mwt       | perdt          |
| pos       | perdt_charlm   |
| lemma     | perdt_nocharlm |
| depparse  | perdt_charlm   |
| ner       | arman          |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


### Detecting tokens and sentences

In [38]:
# Run the pipeline once, then collect the surface text of every sentence and
# of every entry in each sentence's `words` list.
doc = nlp("سلام من فریبا نفر هستم و این اولین کد منه!امیدوارم براتون مفید باشه.")
sentences_text = [sentence.text for sentence in doc.sentences]
tokens_text = [word.text for sentence in doc.sentences for word in sentence.words]

In [39]:
print("tokens:", tokens_text)
print("sentences:", sentences_text)

tokens: ['سلام', 'من', 'فریبا', 'نفر', 'هستم', 'و', 'این', 'اولین', 'کد', 'منه!', 'امیدوار', 'م', 'براتون', 'مفید', 'باشه', '.']
sentences: ['سلام من فریبا نفر هستم و این اولین کد منه!', 'امیدوارم براتون مفید باشه.']


In [40]:
# Keep the sentence and word objects themselves (not just their text) so the
# following cells can read attributes such as `.ents` and `.lemma`.
sentences = list(doc.sentences)
tokens = [word for sentence in doc.sentences for word in sentence.words]

### Accessing Named Entities for Sentence and doc


In [41]:
sentences[0].ents

[{
   "text": "فریبا",
   "type": "pers",
   "start_char": 8,
   "end_char": 13
 }]

In [50]:
doc.ents

[{
   "text": "فریبا",
   "type": "pers",
   "start_char": 8,
   "end_char": 13
 }]

### Lemmatization

In [47]:
tokens[4].lemma

'هست'

### Features of token

In [48]:
tokens[4]

{
  "id": 5,
  "text": "هستم",
  "lemma": "هست",
  "upos": "VERB",
  "xpos": "AUX",
  "feats": "Number=Sing|Person=1",
  "head": 1,
  "deprel": "ccomp",
  "start_char": 18,
  "end_char": 22
}

# DadmaTools: A Python NLP Library for Persian

Named Entity Recognition | Part of Speech Tagging | Dependency Parsing | Informal To Formal
Constituency Parsing | Chunking | Kasreh Ezafe Detection
Spellchecker | Normalizer | Tokenizer | Lemmatizer | Sentiment Analysis


In [None]:
!pip install dadmatools

## Normalizer

In [2]:
from dadmatools.normalizer import Normalizer

# Custom normalizer configuration. In the output printed further down, the <p>
# tags are kept, the e-mail address becomes "<EMAIL>" and the URL is dropped.
normalizer = Normalizer(
    full_cleaning=False,
    unify_chars=True,
    refine_punc_spacing=True,
    remove_extra_space=True,
    remove_puncs=False,
    remove_html=False,
    remove_stop_word=False,
    replace_email_with="<EMAIL>",  # placeholder substituted for e-mail addresses
    replace_number_with=None,  # presumably None leaves numbers in place -- confirm in the DadmaTools docs
    replace_url_with="",  # URLs are replaced by the empty string, i.e. removed
    replace_mobile_number_with=None,
    replace_emoji_with=None,
    replace_home_number_with=None
)

text = """
<p>
دادماتولز اولین نسخش سال ۱۴۰۰ منتشر شده.
امیدواریم که این تولز بتونه کار با متن رو براتون شیرین‌تر و راحت‌تر کنه
لطفا با ایمیل dadmatools@dadmatech.ir با ما در ارتباط باشید
آدرس گیت‌هاب هم که خب معرف حضور مبارک هست:
 https://github.com/Dadmatech/DadmaTools
</p>
"""
# Normalize the sample text with the custom configuration.
normalized_text = normalizer.normalize(text)
print("costomize nomalization:" , normalized_text)
# Full cleaning: this rebinds `normalizer`, so any later use of the name gets
# the full-cleaning instance rather than the custom one.
normalizer = Normalizer(full_cleaning=True)
normalized_text = normalizer.normalize(text)
print("full cleaning nomalization:" , normalized_text)

costomize nomalization: <p> دادماتولز اولین نسخش سال 1400 منتشر شده. امیدواریم که این تولز بتونه کار با متن رو براتون شیرین‌تر و راحت‌تر کنه لطفا با ایمیل <EMAIL> با ما در ارتباط باشید آدرس گیت‌هاب هم که خب معرف حضور مبارک هست: </p>
full cleaning nomalization: دادماتولز نسخش سال منتشر تولز بتونه کار متن براتون شیرین‌تر راحت‌تر کنه ایمیل ارتباط آدرس گیت‌هاب معرف حضور مبارک


In [None]:
import dadmatools.pipeline.language as language

# `pips` is a comma-separated list of the tools the pipeline should load. Eleven
# tools are requested here, not only the lemmatizer and POS tagger (presumably
# tokenizer, lemmatizer, POS tagger, dependency parser, chunker, constituency
# parser, spellchecker, kasreh-ezafe detector, informal-to-formal converter,
# NER and sentiment analysis -- confirm the abbreviations in the DadmaTools docs).
# According to the original note, the tokenizer is the default tool and is
# loaded even when it is not listed.
# NOTE: this rebinds `nlp`, which previously held the Stanza pipeline.
pips = 'tok, lem, pos, dep, chunk, cons, spellchecker, kasreh, itf, ner, sent'
nlp = language.Pipeline(pips)

In [17]:
# doc is indexed like a dictionary (doc['sentences'] in the next cell), not like a spaCy Doc
doc = nlp('سلام من فریبا نفر هستم و این اولین کد منه!امیدوارم براتون مفید باشهه')

1it [00:00,  4.67it/s]


In [18]:
# Flatten the per-sentence token dictionaries into one list.
tokens = [tok for sentence in doc['sentences'] for tok in sentence['tokens']]

In [20]:
tokens[4]

{'id': 5,
 'text': 'هستم',
 'upos': 'AUX',
 'xpos': 'V_PRS',
 'feats': 'Number=Sing|Person=1|Tense=Pres',
 'head': 3,
 'deprel': 'cop',
 'lemma': 'هست',
 'ner': 'O',
 'kasreh': 'O'}

## Word embedding with fasttext, glove, word2vec

In [22]:
from dadmatools.embeddings import get_embedding, get_all_embeddings_info, get_embedding_info
from pprint import pprint

# Load the embedding registered under the name 'glove-wiki'.
# (get_all_embeddings_info, get_embedding_info and pprint are imported but not
# used in the cells visible here.)
word_embedding = get_embedding('glove-wiki')


In [25]:
# Look up the vector of a single word; its length is the embedding dimension.
print('dimention of vectors:',len(word_embedding['سلام']))
# Fetch the vocabulary known to the embedding and report its size.
vocab = word_embedding.get_vocab()
print('vocabulary size:',len(vocab))

# Some useful helpers. Per the printed output: top_nearest gives (word, score)
# pairs, similarity gives a single float, and embedding_text gives a numeric
# vector for the whole sentence.
print(word_embedding.top_nearest("زمستان", 10))
print(word_embedding.similarity('کتب', 'کتاب'))
print(word_embedding.embedding_text('امروز هوای خوبی بود'))


dimention of vectors: 50
vocabulary size: 240548
[('تابستان', 0.9611946940422058), ('پاییز', 0.865199625492096), ('بهار', 0.8493462800979614), ('سرد', 0.8197917938232422), ('هوای', 0.7853583097457886), ('روزهای', 0.7834357619285583), ('فصول', 0.7756936550140381), ('برف', 0.7650289535522461), ('باران', 0.7549418210983276), ('خشک', 0.7512510418891907)]
0.77167135
[ 4.3834025e-01 -3.1859803e-01 -1.9272313e+00 -8.5763007e-02
  2.9289627e-01  5.6910348e-01 -7.1155220e-01 -1.1371025e-01
  2.2091776e-01 -1.1764800e-01  1.9504672e-01  2.8748301e-01
 -7.1474272e-01 -2.1440999e-01  4.8238149e-01 -4.3609923e-01
  9.2353255e-02 -6.6413760e-02  2.5924450e-01 -7.0753738e-02
 -5.1785523e-01 -2.1039526e-01  2.6589626e-01  8.0122501e-02
  3.9807302e-01  7.7017081e-01 -8.2785249e-02  5.6122452e-01
 -2.4727201e-01  2.4349928e-01  7.2026926e-01  7.5088227e-01
 -2.2402122e+00 -2.6327750e-01 -3.3325753e-01  9.6817985e-02
  2.9644626e-01  7.3354429e-01 -2.0648825e-01 -3.5078496e-02
 -8.3022952e-02  2.9674774

# hazm

Hazm is a Python library for performing natural language processing tasks on Persian text. It offers various features for analyzing, processing, and understanding Persian text. You can use Hazm to normalize text, tokenize sentences and words, lemmatize words, assign part-of-speech tags, identify dependency relations, create word and sentence embeddings, or read popular Persian corpora.

In [None]:
!pip install hazm

The `preprocessing` class below uses the Hazm and `re` libraries for data preprocessing. Its steps are: removing diacritics, punctuation, and repeated characters; normalizing characters; tokenizing; removing stopwords; and lemmatizing.

In [None]:
import hazm
import string
import re
import pandas as pd


class preprocessing:
  """Preprocess Persian text stored in the ``Text`` column of a DataFrame.

  :meth:`fit` applies, in order: diacritic removal, punctuation removal,
  collapsing of repeated characters, character normalization, whitespace
  tokenization, stop-word removal and lemmatization. The stop-word list and
  the lemmatizer are taken from ``hazm``.
  """

  def __init__(self):
    """Build the punctuation list, diacritics pattern, stop words and lemmatizer."""
    persian_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
    self.punctuations_list = string.punctuation + persian_punctuations
    # The marks are written as explicit code points: combining characters are
    # invisible and easy to corrupt when typed literally in source code.
    self.arabic_diacritics = re.compile(
        "["
        "\u0651"  # Tashdid
        "\u064e"  # Fatha
        "\u064b"  # Tanwin Fath
        "\u064f"  # Damma
        "\u064c"  # Tanwin Damm
        "\u0650"  # Kasra
        "\u064d"  # Tanwin Kasr
        "\u0652"  # Sukun
        "\u0640"  # Tatwil/Kashida
        "]"
    )
    self.stop_words = hazm.stopwords_list()
    self.lemmatizer = hazm.Lemmatizer()

  def fit(self, train_data):
    """Run the whole preprocessing pipeline over ``train_data['Text']``.

    The ``Text`` column is overwritten in place after every step, so the
    caller's DataFrame is modified. After the call each entry of the column
    is a list of unique lemmas.

    Args:
      train_data: object supporting ``train_data['Text'].apply(func)`` and
        item assignment (a pandas DataFrame in the notebook); every entry of
        ``Text`` must be a string.

    Returns:
      The same ``train_data`` object that was passed in.
    """
    steps = (
        self._remove_diacritics,
        self._remove_punctuations,
        self._remove_repeating_char,
        self._normalize_persian,
        self._tokenize,
        self._remove_stopwords,
        self._lemmatizer,
    )
    for step in steps:
      train_data['Text'] = train_data['Text'].apply(step)
    return train_data

  def _remove_diacritics(self, text):
    """Return ``text`` with every match of ``self.arabic_diacritics`` removed."""
    return self.arabic_diacritics.sub('', text)

  def _remove_punctuations(self, text):
    """Return ``text`` with every character of ``self.punctuations_list`` deleted."""
    translator = str.maketrans('', '', self.punctuations_list)
    return text.translate(translator)

  def _remove_repeating_char(self, text):
    """Collapse every run of one repeated character to a single occurrence.

    ``.`` does not match a newline, so runs of newlines are left untouched.
    """
    return re.sub(r'(.)\1+', r'\1', text)

  def _normalize_persian(self, text):
    """Map letter variants to one form and keep only the listed letters.

    Every character outside the allowed letter set (digits, Latin letters,
    whitespace, ...) becomes a space, and runs of spaces are then collapsed
    into a single space.
    """
    text = re.sub("[إأآا]", "ا", text)  # unify the alef variants
    text = re.sub("ي", "ی", text)
    text = re.sub("ؤ", "و", text)
    text = re.sub("ئ", "ی", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("ك" ,"ک" , text)
    text = re.sub("[^ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی]", " ", text)
    # Raw string: "\S" in a normal string literal is an invalid escape
    # sequence and triggers a warning on modern Python versions.
    text = re.sub(r"[^\S\n\t]+", ' ', text)
    return text

  def _tokenize(self, text):
    """Split ``text`` on whitespace and return the list of tokens."""
    return text.split()

  def _remove_stopwords(self, words):
    """Return the items of ``words`` that are not in ``self.stop_words``."""
    return [word for word in words if word not in self.stop_words]

  def _lemmatizer(self, words):
    """Lemmatize every token and return the unique lemmas.

    Duplicates are dropped as before, but lemmas are now returned in the
    order of their first appearance, so the result is deterministic instead
    of depending on set iteration order.
    """
    lemmas = (self.lemmatizer.lemmatize(token) for token in words)
    return list(dict.fromkeys(lemmas))