### 3 Important NLP Libraries for Indian Languages You Should Try Out Today!

#### https://www.analyticsvidhya.com/blog/2020/01/3-important-nlp-libraries-indian-languages-python/?utm_source=av&utm_medium=feed-articles&utm_campaign=feed

## iNLTK (Natural Language Toolkit for Indic Languages)

In [None]:
#pip install torch==1.3.1+cpu -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
#pip install inltk

In [None]:
from inltk.inltk import setup

In [None]:
## Setting the language

In [None]:
setup('hi')

In [None]:
### Tokenization

In [None]:
from inltk.inltk import tokenize

hindi_text = """प्राचीन काल में विक्रमादित्य नाम के एक आदर्श राजा हुआ करते थे।
अपने साहस, पराक्रम और शौर्य के लिए  राजा विक्रम मशहूर थे। 
ऐसा भी कहा जाता है कि राजा विक्रम अपनी प्राजा के जीवन के दुख दर्द जानने के लिए रात्री के पहर में भेष बदल कर नगर में घूमते थे।"""

# tokenize(input text, language code)
tokenize(hindi_text, "hi")

### Generate similar sentences from a given text input

In [None]:
from inltk.inltk import get_similar_sentences

# get similar sentences to the one given in hindi
output = get_similar_sentences('मैं आज बहुत खुश हूं', 5, 'hi')

print(output)

In [None]:
import warnings

In [None]:
warnings.simplefilter("ignore")

In [None]:
from inltk.inltk import get_similar_sentences

# get similar sentences to the one given in hindi
output = get_similar_sentences('मैं आज बहुत खुश हूं', 5, 'hi')

print(output)

## Identify the language of a text

In [None]:
from inltk.inltk import identify_language_language

In [None]:
identify_language("मैे खाना खाता हूं")

# Extract embedding vectors

In [None]:
from inltk.inltk import get_embedding_vectors

# get embedding for input words
vectors = get_embedding_vectors("विश्लेषिकी विद्या", "hi")

print(vectors)
# print shape of the first word
print("shape:", vectors[0].shape)

# Text completion

In [None]:
from inltk.inltk import setup
from inltk.inltk import predict_next_words

# download models for Gujarati
setup('bn')
# predict the next words of the sentence "The weather is nice today"
predict_next_words("আবহাওয়া চমৎকার", 10, "bn", 0.7)

# Finding similarity between two sentences

In [None]:
from inltk.inltk import get_sentence_similarity

# similarity of encodings is calculated by using cmp function whose default is cosine similarity
get_sentence_similarity('मुझे भोजन पसंद है।', 'मैं ऐसे भोजन की सराहना करता हूं जिसका स्वाद अच्छा हो।', 'hi')

# 2. Indic NLP Library

In [None]:
# Installing the Indic NLP Library

In [None]:
#pip install indic-nlp-library

In [None]:
# download the resource
#git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git

In [None]:
import sys
from indicnlp import common

# The path to the local git repo for Indic NLP library
INDIC_NLP_LIB_HOME=r"indic_nlp_library"

# The path to the local git repo for Indic NLP Resources
INDIC_NLP_RESOURCES=r"indic_nlp_resources"

# Add library to Python path
sys.path.append(r'{}\src'.format(INDIC_NLP_LIB_HOME))

# Set environment variable for resources folder
common.set_resources_path(INDIC_NLP_RESOURCES)

In [None]:
from indicnlp.tokenize import sentence_tokenize

indic_string="""तो क्या विश्व कप 2019 में मैच का बॉस टॉस है? यानी मैच में हार-जीत में \
टॉस की भूमिका अहम है? आप ऐसा सोच सकते हैं। विश्वकप के अपने-अपने पहले मैच में बुरी तरह हारने वाली एशिया की दो टीमों \
पाकिस्तान और श्रीलंका के कप्तान ने हालांकि अपने हार के पीछे टॉस की दलील तो नहीं दी, लेकिन यह जरूर कहा था कि वह एक अहम टॉस हार गए थे।"""

# Split the sentence, language code "hi" is passed for hingi
sentences=sentence_tokenize.sentence_split(indic_string, lang='hi')

# print the sentences
for t in sentences:
    print(t)

In [None]:
# Transliteration among various Indian Language Scripts

In [None]:
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator

# Input text "Today the weather is good. Sun is bright and there are no signs of rain. Hence we can play today."
input_text='आज मौसम अच्छा है। सूरज उज्ज्वल है और बारिश के कोई संकेत नहीं हैं। इसलिए हम आज खेल सकते हैं!'

# Transliterate from Hindi to Telugu
print(UnicodeIndicTransliterator.transliterate(input_text,"hi","pa"))

In [None]:
from indicnlp.transliterate.unicode_transliterate import ItransTransliterator

input_text='आज मौसम अच्छा है। इसलिए हम आज खेल सकते हैं!'

# Transliterate Hindi to Roman
print(ItransTransliterator.to_itrans(input_text, 'hi'))

In [None]:
## Understanding the phonetics of a character

In [None]:
from indicnlp.langinfo import *

# Input character 
c='आ'
# Language is Hindi or 'hi'
lang='hi'

print('Is vowel?:  {}'.format(is_vowel(c,lang)))
print('Is consonant?:  {}'.format(is_consonant(c,lang)))
print('Is velar?:  {}'.format(is_velar(c,lang)))
print('Is palatal?:  {}'.format(is_palatal(c,lang)))
print('Is aspirated?:  {}'.format(is_aspirated(c,lang)))
print('Is unvoiced?:  {}'.format(is_unvoiced(c,lang)))
print('Is nasal?:  {}'.format(is_nasal(c,lang)))

# How similar do two characters sound?

In [None]:
from indicnlp.script import  indic_scripts as isc
from indicnlp.script import  phonetic_sim as psim

c1='क'
c2='ख'
c3='भ'
lang='hi'

print('Similarity between {} and {}'.format(c1,c2))
print(psim.cosine(
    isc.get_phonetic_feature_vector(c1,lang),
    isc.get_phonetic_feature_vector(c2,lang)
    ))

print(u'Similarity between {} and {}'.format(c1,c3))
print(psim.cosine(
    isc.get_phonetic_feature_vector(c1,lang),
    isc.get_phonetic_feature_vector(c3,lang)
    ))

## Splitting words into Syllables

In [None]:
from indicnlp.syllable import  syllabifier

# Word to be broken into syllables
w='जगदीशचंद्र'
# Language code Hindi in this case 
lang='hi'

# Break into syllables
print(' '.join(syllabifier.orthographic_syllabify(w,lang)))

# StanfordNLP

#### StanfordNLP is an NLP library right from Stanford’s Research Group on Natural Language Processing.
The most striking feature of this library is that it supports around 53 human languages for text processing!

In [4]:
#pip install stanfordnlp

import stanfordnlp

stanfordnlp.download('hi')

Using the default treebank "hi_hdtb" for language "hi".
Would you like to download the models for: hi_hdtb now? (Y/n)
y

Default download directory: C:\Users\admin\stanfordnlp_resources
Hit enter to continue or type an alternate directory.
d:\Users\admin\stanfordnlp_resources

Downloading models for: hi_hdtb
Download location: d:\Users\admin\stanfordnlp_resources\hi_hdtb_models.zip


100%|███████████████████████████████████████████████████████████████████████████████| 208M/208M [03:28<00:00, 1.02MB/s]



Download complete.  Models saved to: d:\Users\admin\stanfordnlp_resources\hi_hdtb_models.zip
Extracting models file for: hi_hdtb
Cleaning up...Done.


# Extracting Part of Speech (POS) Tags for Hindi

StanfordNLP comes with built-in processors to perform five basic NLP tasks:

Tokenization
Multi-Word Token Expansion
Lemmatization
Parts of Speech Tagging
Dependency Parsing

In [3]:
%tb

SystemExit: 1

In [8]:
nlp = stanfordnlp.Pipeline(processors = "pos")

Use device: cpu
---
Loading: pos
With settings: 
{'model_path': 'C:\\Users\\admin\\stanfordnlp_resources\\en_ewt_models\\en_ewt_tagger.pt', 'pretrain_path': 'C:\\Users\\admin\\stanfordnlp_resources\\en_ewt_models\\en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Cannot load model from C:\Users\admin\stanfordnlp_resources\en_ewt_models\en_ewt_tagger.pt


SystemExit: 1

In [6]:
hindi_doc = nlp("""केंद्र की मोदी सरकार ने शुक्रवार को अपना अंतरिम बजट पेश किया. कार्यवाहक वित्त मंत्री पीयूष गोयल ने अपने बजट में किसान, मजदूर, करदाता, महिला वर्ग समेत हर किसी के लिए बंपर ऐलान किए. हालांकि, बजट के बाद भी टैक्स को लेकर काफी कन्फ्यूजन बना रहा. केंद्र सरकार के इस अंतरिम बजट क्या खास रहा और किसको क्या मिला, आसान भाषा में यहां समझें""")

NameError: name 'nlp' is not defined

In [None]:
#dictionary that contains pos tags and their explanations
pos_dict = {
'CC': 'coordinating conjunction','CD': 'cardinal digit','DT': 'determiner',
'EX': 'existential there (like: \"there is\" ... think of it like \"there exists\")',
'FW': 'foreign word','IN':  'preposition/subordinating conjunction','JJ': 'adjective \'big\'',
'JJR': 'adjective, comparative \'bigger\'','JJS': 'adjective, superlative \'biggest\'',
'LS': 'list marker 1)','MD': 'modal could, will','NN': 'noun, singular \'desk\'',
'NNS': 'noun plural \'desks\'','NNP': 'proper noun, singular \'Harrison\'',
'NNPS': 'proper noun, plural \'Americans\'','PDT': 'predeterminer \'all the kids\'',
'POS': 'possessive ending parent\'s','PRP': 'personal pronoun I, he, she',
'PRP$': 'possessive pronoun my, his, hers','RB': 'adverb very, silently,',
'RBR': 'adverb, comparative better','RBS': 'adverb, superlative best',
'RP': 'particle give up','TO': 'to go \'to\' the store.','UH': 'interjection errrrrrrrm',
'VB': 'verb, base form take','VBD': 'verb, past tense took',
'VBG': 'verb, gerund/present participle taking','VBN': 'verb, past participle taken',
'VBP': 'verb, sing. present, non-3d take','VBZ': 'verb, 3rd person sing. present takes',
'WDT': 'wh-determiner which','WP': 'wh-pronoun who, what','WP$': 'possessive wh-pronoun whose',
'WRB': 'wh-abverb where, when','QF' : 'quantifier, bahut, thoda, kam (Hindi)','VM' : 'main verb',
'PSP' : 'postposition, common in indian langs','DEM' : 'demonstrative, common in indian langs'
}

#extract parts of speech
def extract_pos(doc):
    parsed_text = {'word':[], 'pos':[], 'exp':[]}
    for sent in doc.sentences:
        for wrd in sent.words:
            if wrd.pos in pos_dict.keys():
                pos_exp = pos_dict[wrd.pos]
            else:
                pos_exp = 'NA'
            parsed_text['word'].append(wrd.text)
            parsed_text['pos'].append(wrd.pos)
            parsed_text['exp'].append(pos_exp)
    #return a dataframe of pos and text
    return pd.DataFrame(parsed_text)