From 17027975c7dd16840412e1e1e02feb9cbb1cc2e4 Mon Sep 17 00:00:00 2001 From: Gaurav Date: Sun, 11 Oct 2020 15:09:16 +0530 Subject: [PATCH 1/2] add hinglish, tanglish, manglish support --- inltk/config.py | 17 ++++++++++++++--- inltk/inltk.py | 5 +++++ inltk/tokenizer.py | 21 +++++++++++++++++++++ inltk/utils.py | 16 ++++++++++++++++ 4 files changed, 56 insertions(+), 3 deletions(-) diff --git a/inltk/config.py b/inltk/config.py index d6315d7..20a67d1 100644 --- a/inltk/config.py +++ b/inltk/config.py @@ -12,12 +12,17 @@ class LanguageCodes: tamil = 'ta' urdu = 'ur' english = 'en' + # Code-Mixed Languages in Latin script + hinglish = 'hi-en' + tanglish = 'ta-en' + manglish = 'ml-en' def get_all_language_codes(self): return [self.bengali, self.gujarati, self.hindi, self.kannada, self.malyalam, self.marathi, self.nepali, self.odia, self.panjabi, - self.sanskrit, self.tamil, self.urdu, self.english] + self.sanskrit, self.tamil, self.urdu, self.english, + self.hinglish, self.tanglish, self.manglish] class LMConfigs: @@ -35,7 +40,10 @@ class LMConfigs: all_language_codes.sanskrit: 'https://www.dropbox.com/s/4ay1by5ryz6k39l/sanskrit_export.pkl?raw=1', all_language_codes.tamil: 'https://www.dropbox.com/s/88klv70zl82u39b/export.pkl?raw=1', all_language_codes.urdu: 'https://www.dropbox.com/s/0ovetjk27np0fcz/urdu_export.pkl?raw=1', - all_language_codes.english: 'https://www.dropbox.com/s/fnzfz23tukv3aku/export.pkl?raw=1' + all_language_codes.english: 'https://www.dropbox.com/s/fnzfz23tukv3aku/export.pkl?raw=1', + all_language_codes.hinglish: 'https://www.dropbox.com/s/8neihsc8r21jz64/export.pkl?raw=1', + all_language_codes.tanglish: 'https://www.dropbox.com/s/2xjhwtaepm157vt/export.pkl?raw=1', + all_language_codes.manglish: 'https://www.dropbox.com/s/d0jn6g4422pq5kv/export.pkl?raw=1' } tokenizer_model_file_url = { all_language_codes.bengali: 'https://www.dropbox.com/s/29h7vqme1kb8pmw/bengali_lm.model?raw=1', @@ -50,7 +58,10 @@ class LMConfigs: all_language_codes.sanskrit: 'https://www.dropbox.com/s/e13401nsekulq17/tokenizer.model?raw=1', all_language_codes.tamil: 'https://www.dropbox.com/s/jpg4kaqyfb71g1v/tokenizer.model?raw=1', all_language_codes.urdu: 'https://www.dropbox.com/s/m5l1yy41ij6vwxa/urdu_lm.model?raw=1', - all_language_codes.english: 'https://www.dropbox.com/s/2u3greusrnyh7qy/vocab.pkl?raw=1' + all_language_codes.english: 'https://www.dropbox.com/s/2u3greusrnyh7qy/vocab.pkl?raw=1', + all_language_codes.hinglish: 'https://www.dropbox.com/s/oblv8oalv5lwdec/tokenizer.model?raw=1', + all_language_codes.tanglish: 'https://www.dropbox.com/s/wgsv87tx0rhqx95/tokenizer.model?raw=1', + all_language_codes.manglish: 'https://www.dropbox.com/s/877ogp4qu3kf05v/tokenizer.model?raw=1' } def __init__(self, language_code: str): diff --git a/inltk/inltk.py b/inltk/inltk.py index 2043443..9997eed 100644 --- a/inltk/inltk.py +++ b/inltk/inltk.py @@ -1,3 +1,5 @@ +import sys +import warnings import asyncio import random from math import ceil @@ -10,6 +12,9 @@ from inltk.const import tokenizer_special_cases from inltk.utils import cos_sim, reset_models, is_english +if not sys.warnoptions: + warnings.simplefilter("ignore") + lcodes = LanguageCodes() all_language_codes = lcodes.get_all_language_codes() diff --git a/inltk/tokenizer.py b/inltk/tokenizer.py index 82844a3..4b5c5e2 100644 --- a/inltk/tokenizer.py +++ b/inltk/tokenizer.py @@ -3,6 +3,7 @@ from pathlib import Path from inltk.config import LanguageCodes +from inltk.utils import handle_all_caps, handle_upper_case_first_letter, lower_case_everything path = Path(__file__).parent @@ -76,6 +77,7 @@ def remove_foreign_tokens(self, t: str): local_pieces.append(self.sp.IdToPiece(i)) return local_pieces + class AllLanguageTokenizer(LanguageTokenizer): def __init__(self, lang: str): LanguageTokenizer.__init__(self, lang) @@ -139,3 +141,22 @@ def __init__(self, lang: str): class UrduTokenizer(LanguageTokenizer): def __init__(self, lang: str): LanguageTokenizer.__init__(self, lang) + + +class HinglishTokenizer(LanguageTokenizer): + def __init__(self, lang: str): + LanguageTokenizer.__init__(self, lang) + + +class TanglishTokenizer(LanguageTokenizer): + def __init__(self, lang: str): + # because of some bug in fastai -- need to dive in further + lang = LanguageCodes.tanglish + LanguageTokenizer.__init__(self, lang) + + +class ManglishTokenizer(LanguageTokenizer): + def __init__(self, lang: str): + # because of some bug in fastai -- need to dive in further + lang = LanguageCodes.manglish + LanguageTokenizer.__init__(self, lang) diff --git a/inltk/utils.py b/inltk/utils.py index a64ce42..a2bd345 100644 --- a/inltk/utils.py +++ b/inltk/utils.py @@ -19,3 +19,19 @@ def is_english(s: str) -> bool: return False else: return True + + +def handle_all_caps(t: str) -> str: + tokens = t.split() + tokens = replace_all_caps(tokens) + return ' '.join(tokens) + + +def handle_upper_case_first_letter(t: str) -> str: + tokens = t.split() + tokens = deal_caps(tokens) + return ' '.join(tokens) + + +def lower_case_everything(t: str) -> str: + return t.lower() From 3b25230769df3edc6dc96681d4c0fed9843508af Mon Sep 17 00:00:00 2001 From: Gaurav Date: Sun, 11 Oct 2020 17:17:18 +0530 Subject: [PATCH 2/2] update readme for code-mixed support --- README.md | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 70f8da3..edf4db5 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,8 @@ Checkout detailed docs along with Installation instructions ### Supported languages +#### Native languages + | Language | Code | |:--------:|:----:| | Hindi | hi | @@ -30,23 +32,33 @@ Checkout detailed docs along with Installation instructions | Sanskrit | sa | | English | en | -#### Repositories containing models used in iNLTK +#### Code Mixed languages -| Language | Repository | Dataset used for Language modeling | Perplexity of ULMFiT LM
(on validation set) | Perplexity of TransformerXL LM
(on validation set) | Dataset used for Classification | Classification:
Test set Accuracy | Classification:
Test set MCC | Classification: Notebook
for Reproducibility | ULMFiT Embeddings visualization | TransformerXL Embeddings visualization | -|:---------:|:----------------------------------------------------------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:----------------------------------------------:|:-----------------------------------------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:---------------------------------------:|:---------------------------------------:|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:| -| Hindi | [NLP for Hindi](https://github.com/goru001/nlp-for-hindi) | [Hindi Wikipedia Articles - 172k](https://www.kaggle.com/disisbig/hindi-wikipedia-articles-172k)


[Hindi Wikipedia Articles - 55k](https://www.kaggle.com/disisbig/hindi-wikipedia-articles-55k) | 34.06


35.87 | 26.09


34.78 | [BBC News Articles](https://github.com/ai4bharat-indicnlp/indicnlp_corpus#publicly-available-classification-datasets)


[IIT Patna Movie Reviews](https://github.com/ai4bharat-indicnlp/indicnlp_corpus#publicly-available-classification-datasets)


[IIT Patna Product Reviews](https://github.com/ai4bharat-indicnlp/indicnlp_corpus#publicly-available-classification-datasets) | 78.75


57.74


75.71 | 71.61


37.23


59.76 | [Notebook](https://github.com/goru001/nlp-for-hindi/blob/master/classification-benchmarks/Hindi_Classification_Model_BBC_Articles.ipynb)


[Notebook](https://github.com/goru001/nlp-for-hindi/blob/master/classification-benchmarks/Hindi_Classification_Model_IITP%2BMovie.ipynb)


[Notebook](https://github.com/goru001/nlp-for-hindi/blob/master/classification-benchmarks/Hindi_Classification_Model_IITP_Product.ipynb) | [Hindi Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-hindi/master/language-model/embedding_projector_config_30k.json) | [Hindi Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-hindi/master/language-model/embedding_projector_config_transformerxl.json) | -| Bengali | [NLP for Bengali](https://github.com/goru001/nlp-for-bengali) | [Bengali Wikipedia Articles](https://www.kaggle.com/disisbig/bengali-wikipedia-articles) | 41.2 | 39.3 | [Bengali News Articles (Soham Articles)](https://github.com/ai4bharat-indicnlp/indicnlp_corpus#publicly-available-classification-datasets) | 90.71 | 87.92 | [Notebook](https://github.com/goru001/nlp-for-bengali/blob/master/classification/Bengali_Classification_Model.ipynb) | [Bengali Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-bengali/master/language-model/embedding_projector_config.json) | [Bengali Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-bengali/master/language-model/embedding_projector_transformer_config.json) | -| Gujarati | [NLP for Gujarati](https://github.com/goru001/nlp-for-gujarati) | [Gujarati Wikipedia Articles](https://www.kaggle.com/disisbig/gujarati-wikipedia-articles) | 34.12 | 28.12 | [iNLTK Headlines Corpus - Gujarati](https://github.com/ai4bharat-indicnlp/indicnlp_corpus#publicly-available-classification-datasets) | 91.05 | 86.09 | [Notebook](https://github.com/goru001/nlp-for-gujarati/blob/master/classification/Gujarati_Classification_Model.ipynb) | [Gujarati Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-gujarati/master/language-model/embedding_projector_config.json) | [Gujarati Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-gujarati/master/language-model/embedding_projector_transformer_config.json) | -| Malayalam | [NLP for Malayalam](https://github.com/goru001/nlp-for-malyalam) | [Malayalam Wikipedia Articles](https://www.kaggle.com/disisbig/malayalam-wikipedia-articles) | 26.39 | 25.79 | [iNLTK Headlines Corpus - Malayalam](https://github.com/ai4bharat-indicnlp/indicnlp_corpus#publicly-available-classification-datasets) | 95.56 | 93.29 | [Notebook](https://github.com/goru001/nlp-for-malyalam/blob/master/classification/Malyalam_Classification_Model.ipynb) | [Malayalam Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-malyalam/master/language-model/embedding_projector_config.json) | [Malayalam Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-malyalam/master/language-model/embedding_projector_transformer_config.json) | -| Marathi | [NLP for Marathi](https://github.com/goru001/nlp-for-marathi) | [Marathi Wikipedia Articles](https://www.kaggle.com/disisbig/marathi-wikipedia-articles) | 18 | 17.42 | [iNLTK Headlines Corpus - Marathi](https://github.com/ai4bharat-indicnlp/indicnlp_corpus#publicly-available-classification-datasets) | 92.40 | 85.23 | [Notebook](https://github.com/goru001/nlp-for-marathi/blob/master/classification/Marathi_Classification_Model.ipynb) | [Marathi Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-marathi/master/language-model/embedding_projector_config.json) | [Marathi Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-marathi/master/language-model/embedding_projector_transformer_config.json) | -| Tamil | [NLP for Tamil](https://github.com/goru001/nlp-for-tamil) | [Tamil Wikipedia Articles](https://www.kaggle.com/disisbig/tamil-wikipedia-articles) | 19.80 | 17.22 | [iNLTK Headlines Corpus - Tamil](https://github.com/ai4bharat-indicnlp/indicnlp_corpus#publicly-available-classification-datasets) | 95.22 | 92.70 | [Notebook](https://github.com/goru001/nlp-for-tamil/blob/master/classification/Tamil_Classifier.ipynb) | [Tamil Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-tamil/master/language-model/embedding_projector_config.json) | [Tamil Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-tamil/master/language-model/embedding_projector_transformer_config.json) | -| Punjabi | [NLP for Punjabi](https://github.com/goru001/nlp-for-punjabi) | [Punjabi Wikipedia Articles](https://www.kaggle.com/disisbig/punjabi-wikipedia-articles) | 24.40 | 14.03 | [IndicNLP News Article Classification Dataset - Punjabi](https://github.com/ai4bharat-indicnlp/indicnlp_corpus#indicnlp-news-article-classification-dataset) | 97.12 | 96.17 | [Notebook](https://github.com/goru001/nlp-for-punjabi/blob/master/classification/Panjabi_Classification_Model.ipynb) | [Punjabi Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-punjabi/master/language-model/embedding_projector_config.json) | [Punjabi Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-punjabi/master/language-model/embedding_projector_transformer_config.json) | -| Kannada | [NLP for Kannada](https://github.com/goru001/nlp-for-kannada) | [Kannada Wikipedia Articles](https://www.kaggle.com/disisbig/kannada-wikipedia-articles) | 70.10 | 61.97 | [IndicNLP News Article Classification Dataset - Kannada](https://github.com/ai4bharat-indicnlp/indicnlp_corpus#indicnlp-news-article-classification-dataset) | 98.87 | 98.30 | [Notebook](https://github.com/goru001/nlp-for-kannada/blob/master/classification/Kannada_Classification_Model.ipynb) | [Kannada Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-kannada/master/language-model/embedding_projector_config.json) | [Kannada Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-kannada/master/language-model/embedding_projector_transformer_config.json) | -| Oriya | [NLP for Oriya](https://github.com/goru001/nlp-for-odia) | [Oriya Wikipedia Articles](https://www.kaggle.com/disisbig/odia-wikipedia-articles) | 26.57 | 26.81 | [IndicNLP News Article Classification Dataset - Oriya](https://github.com/ai4bharat-indicnlp/indicnlp_corpus#indicnlp-news-article-classification-dataset) | 98.83 | 98.44 | [Notebook](https://github.com/goru001/nlp-for-odia/blob/master/classification/Oriya_Classification_Model.ipynb) | [Oriya Embeddings Projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-odia/master/language-model/embedding_projector_config.json) | [Oriya Embeddings Projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-odia/master/language-model/embedding_projector_transformer_config.json) | -| Sanskrit | [NLP for Sanskrit](https://github.com/goru001/nlp-for-sanskrit) | [Sanskrit Wikipedia Articles](https://www.kaggle.com/disisbig/sanskrit-wikipedia-articles) | ~6 | ~3 | [Sanskrit Shlokas Dataset](https://www.kaggle.com/disisbig/sanskrit-shlokas-dataset) | 84.3 (valid set) | | | [Sanskrit Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-sanskrit/master/language-model/embedding_projector_config.json) | [Sanskrit Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-sanskrit/master/language-model/embedding_projector_transformer_config.json) | -| Nepali | [NLP for Nepali](https://github.com/goru001/nlp-for-nepali) | [Nepali Wikipedia Articles](https://www.kaggle.com/disisbig/nepali-wikipedia-articles) | 31.5 | 29.3 | [Nepali News Dataset](https://www.kaggle.com/disisbig/nepali-news-dataset) | 98.5 (valid set) | | | [Nepali Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-nepali/master/language-model/embedding_projector_config.json) | [Nepali Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-nepali/master/language-model/embedding_projector_transformer_config.json) | -| Urdu | [NLP for Urdu](https://github.com/anuragshas/nlp-for-urdu) | [Urdu Wikipedia Articles](https://www.kaggle.com/disisbig/urdu-wikipedia-articles) | 13.19 | 12.55 | [Urdu News Dataset](https://www.kaggle.com/disisbig/urdu-news-dataset) | 95.28 (valid set) | | | [Urdu Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/anuragshas/nlp-for-urdu/master/language-model/embedding_projector_config.json) | [Urdu Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/anuragshas/nlp-for-urdu/master/language-model/embedding_projector_transformer_config.json) | +| Language | Script |Code | +|:--------:|:----:|:----:| +| Hinglish (Hindi+English) | Latin | hi-en | +| Tanglish (Tamil+English) | Latin | ta-en | +| Manglish (Malayalam+English) | Latin | ml-en | + +#### Repositories containing models used in iNLTK +| Language | Repository | Dataset used for Language modeling | Perplexity of ULMFiT LM
(on validation set) | Perplexity of TransformerXL LM
(on validation set) | Dataset used for Classification | Classification:
Test set Accuracy | Classification:
Test set MCC | Classification: Notebook
for Reproducibility | ULMFiT Embeddings visualization | TransformerXL Embeddings visualization | +|:---------:|:----------------------------------------------------------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:----------------------------------------------:|:-----------------------------------------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:---------------------------------------:|:------------------------------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:| +| Hindi | [NLP for Hindi](https://github.com/goru001/nlp-for-hindi) | [Hindi Wikipedia Articles - 172k](https://www.kaggle.com/disisbig/hindi-wikipedia-articles-172k)


[Hindi Wikipedia Articles - 55k](https://www.kaggle.com/disisbig/hindi-wikipedia-articles-55k) | 34.06


35.87 | 26.09


34.78 | [BBC News Articles](https://github.com/ai4bharat-indicnlp/indicnlp_corpus#publicly-available-classification-datasets)


[IIT Patna Movie Reviews](https://github.com/ai4bharat-indicnlp/indicnlp_corpus#publicly-available-classification-datasets)


[IIT Patna Product Reviews](https://github.com/ai4bharat-indicnlp/indicnlp_corpus#publicly-available-classification-datasets) | 78.75


57.74


75.71 | 0.71


0.37


0.59 | [Notebook](https://github.com/goru001/nlp-for-hindi/blob/master/classification-benchmarks/Hindi_Classification_Model_BBC_Articles.ipynb)


[Notebook](https://github.com/goru001/nlp-for-hindi/blob/master/classification-benchmarks/Hindi_Classification_Model_IITP%2BMovie.ipynb)


[Notebook](https://github.com/goru001/nlp-for-hindi/blob/master/classification-benchmarks/Hindi_Classification_Model_IITP_Product.ipynb) | [Hindi Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-hindi/master/language-model/embedding_projector_config_30k.json) | [Hindi Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-hindi/master/language-model/embedding_projector_config_transformerxl.json) | +| Bengali | [NLP for Bengali](https://github.com/goru001/nlp-for-bengali) | [Bengali Wikipedia Articles](https://www.kaggle.com/disisbig/bengali-wikipedia-articles) | 41.2 | 39.3 | [Bengali News Articles (Soham Articles)](https://github.com/ai4bharat-indicnlp/indicnlp_corpus#publicly-available-classification-datasets) | 90.71 | 0.87 | [Notebook](https://github.com/goru001/nlp-for-bengali/blob/master/classification/Bengali_Classification_Model.ipynb) | [Bengali Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-bengali/master/language-model/embedding_projector_config.json) | [Bengali Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-bengali/master/language-model/embedding_projector_transformer_config.json) | +| Gujarati | [NLP for Gujarati](https://github.com/goru001/nlp-for-gujarati) | [Gujarati Wikipedia Articles](https://www.kaggle.com/disisbig/gujarati-wikipedia-articles) | 34.12 | 28.12 | [iNLTK Headlines Corpus - Gujarati](https://github.com/ai4bharat-indicnlp/indicnlp_corpus#publicly-available-classification-datasets) | 91.05 | 0.86 | [Notebook](https://github.com/goru001/nlp-for-gujarati/blob/master/classification/Gujarati_Classification_Model.ipynb) | [Gujarati Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-gujarati/master/language-model/embedding_projector_config.json) | [Gujarati Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-gujarati/master/language-model/embedding_projector_transformer_config.json) | +| Malayalam | [NLP for Malayalam](https://github.com/goru001/nlp-for-malyalam) | [Malayalam Wikipedia Articles](https://www.kaggle.com/disisbig/malayalam-wikipedia-articles) | 26.39 | 25.79 | [iNLTK Headlines Corpus - Malayalam](https://github.com/ai4bharat-indicnlp/indicnlp_corpus#publicly-available-classification-datasets) | 95.56 | 0.93 | [Notebook](https://github.com/goru001/nlp-for-malyalam/blob/master/classification/Malyalam_Classification_Model.ipynb) | [Malayalam Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-malyalam/master/language-model/embedding_projector_config.json) | [Malayalam Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-malyalam/master/language-model/embedding_projector_transformer_config.json) | +| Marathi | [NLP for Marathi](https://github.com/goru001/nlp-for-marathi) | [Marathi Wikipedia Articles](https://www.kaggle.com/disisbig/marathi-wikipedia-articles) | 18 | 17.42 | [iNLTK Headlines Corpus - Marathi](https://github.com/ai4bharat-indicnlp/indicnlp_corpus#publicly-available-classification-datasets) | 92.40 | 0.85 | [Notebook](https://github.com/goru001/nlp-for-marathi/blob/master/classification/Marathi_Classification_Model.ipynb) | [Marathi Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-marathi/master/language-model/embedding_projector_config.json) | [Marathi Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-marathi/master/language-model/embedding_projector_transformer_config.json) | +| Tamil | [NLP for Tamil](https://github.com/goru001/nlp-for-tamil) | [Tamil Wikipedia Articles](https://www.kaggle.com/disisbig/tamil-wikipedia-articles) | 19.80 | 17.22 | [iNLTK Headlines Corpus - Tamil](https://github.com/ai4bharat-indicnlp/indicnlp_corpus#publicly-available-classification-datasets) | 95.22 | 0.92 | [Notebook](https://github.com/goru001/nlp-for-tamil/blob/master/classification/Tamil_Classifier.ipynb) | [Tamil Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-tamil/master/language-model/embedding_projector_config.json) | [Tamil Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-tamil/master/language-model/embedding_projector_transformer_config.json) | +| Punjabi | [NLP for Punjabi](https://github.com/goru001/nlp-for-punjabi) | [Punjabi Wikipedia Articles](https://www.kaggle.com/disisbig/punjabi-wikipedia-articles) | 24.40 | 14.03 | [IndicNLP News Article Classification Dataset - Punjabi](https://github.com/ai4bharat-indicnlp/indicnlp_corpus#indicnlp-news-article-classification-dataset) | 97.12 | 0.96 | [Notebook](https://github.com/goru001/nlp-for-punjabi/blob/master/classification/Panjabi_Classification_Model.ipynb) | [Punjabi Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-punjabi/master/language-model/embedding_projector_config.json) | [Punjabi Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-punjabi/master/language-model/embedding_projector_transformer_config.json) | +| Kannada | [NLP for Kannada](https://github.com/goru001/nlp-for-kannada) | [Kannada Wikipedia Articles](https://www.kaggle.com/disisbig/kannada-wikipedia-articles) | 70.10 | 61.97 | [IndicNLP News Article Classification Dataset - Kannada](https://github.com/ai4bharat-indicnlp/indicnlp_corpus#indicnlp-news-article-classification-dataset) | 98.87 | 0.98 | [Notebook](https://github.com/goru001/nlp-for-kannada/blob/master/classification/Kannada_Classification_Model.ipynb) | [Kannada Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-kannada/master/language-model/embedding_projector_config.json) | [Kannada Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-kannada/master/language-model/embedding_projector_transformer_config.json) | +| Oriya | [NLP for Oriya](https://github.com/goru001/nlp-for-odia) | [Oriya Wikipedia Articles](https://www.kaggle.com/disisbig/odia-wikipedia-articles) | 26.57 | 26.81 | [IndicNLP News Article Classification Dataset - Oriya](https://github.com/ai4bharat-indicnlp/indicnlp_corpus#indicnlp-news-article-classification-dataset) | 98.83 | 0.98 | [Notebook](https://github.com/goru001/nlp-for-odia/blob/master/classification/Oriya_Classification_Model.ipynb) | [Oriya Embeddings Projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-odia/master/language-model/embedding_projector_config.json) | [Oriya Embeddings Projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-odia/master/language-model/embedding_projector_transformer_config.json) | +| Sanskrit | [NLP for Sanskrit](https://github.com/goru001/nlp-for-sanskrit) | [Sanskrit Wikipedia Articles](https://www.kaggle.com/disisbig/sanskrit-wikipedia-articles) | ~6 | ~3 | [Sanskrit Shlokas Dataset](https://www.kaggle.com/disisbig/sanskrit-shlokas-dataset) | 84.3 (valid set) | | | [Sanskrit Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-sanskrit/master/language-model/embedding_projector_config.json) | [Sanskrit Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-sanskrit/master/language-model/embedding_projector_transformer_config.json) | +| Nepali | [NLP for Nepali](https://github.com/goru001/nlp-for-nepali) | [Nepali Wikipedia Articles](https://www.kaggle.com/disisbig/nepali-wikipedia-articles) | 31.5 | 29.3 | [Nepali News Dataset](https://www.kaggle.com/disisbig/nepali-news-dataset) | 98.5 (valid set) | | | [Nepali Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-nepali/master/language-model/embedding_projector_config.json) | [Nepali Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-nepali/master/language-model/embedding_projector_transformer_config.json) | +| Urdu | [NLP for Urdu](https://github.com/anuragshas/nlp-for-urdu) | [Urdu Wikipedia Articles](https://www.kaggle.com/disisbig/urdu-wikipedia-articles) | 13.19 | 12.55 | [Urdu News Dataset](https://www.kaggle.com/disisbig/urdu-news-dataset) | 95.28 (valid set) | | | [Urdu Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/anuragshas/nlp-for-urdu/master/language-model/embedding_projector_config.json) | [Urdu Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/anuragshas/nlp-for-urdu/master/language-model/embedding_projector_transformer_config.json) | +| Tanglish | [NLP for Tanglish](https://github.com/goru001/nlp-for-tanglish) | [Synthetic Tanglish Dataset](https://drive.google.com/drive/folders/1M4Sx_clF0iP1y-JG3OhfacFKTDoHXCR1?usp=sharing) | 37.50 | - | Dravidian Codemix HASOC @ FIRE 2020

Dravidian Codemix Sentiment Analysis @ FIRE 2020 | F1 Score: 0.88

F1 Score: 0.62 | - | [Notebook](https://github.com/goru001/nlp-for-tanglish/blob/master/classification/classification_model_hasoc.ipynb)

[Notebook](https://github.com/goru001/nlp-for-tanglish/blob/master/classification/classification_model_dc_fire.ipynb) | [Tanglish Embeddings Projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-tanglish/master/language-model/embedding_projector_config.json) | - | +| Manglish | [NLP for Manglish](https://github.com/goru001/nlp-for-manglish) | [Synthetic Manglish Dataset](https://drive.google.com/drive/folders/1M4Sx_clF0iP1y-JG3OhfacFKTDoHXCR1?usp=sharing) | 45.84 | - | Dravidian Codemix HASOC @ FIRE 2020

Dravidian Codemix Sentiment Analysis @ FIRE 2020 | F1 Score: 0.74

F1 Score: 0.69 | - | [Notebook](https://github.com/goru001/nlp-for-manglish/blob/master/classification/classification_model_hasoc.ipynb)

[Notebook](https://github.com/goru001/nlp-for-manglish/blob/master/classification/classification_model_dc_fire.ipynb) | [Manglish Embeddings Projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-manglish/master/language-model/embedding_projector_config_latin_script.json) | - | +| Hinglish | [NLP for Hinglish](https://github.com/goru001/nlp-for-hinglish) | [Synthetic Hinglish Dataset](https://www.dropbox.com/sh/as5fg8jsrljt6k7/AADnSLlSNJPeAndFycJGurOUa?dl=0) | 86.48 | - | - | - | - | - | [Hinglish Embeddings Projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-hinglish/main/language_model/embedding_projector_config.json) | - | Note: English model has been directly taken from [fast.ai](https://github.com/fastai/fastai) #### Effect of using Transfer Learning + Paraphrases from iNLTK @@ -90,8 +102,7 @@ If you wish for a particular functionality in iNLTK - Start by checking/raising #### ..and being worked upon `Shout out if you want to help :)` -* Add [Telugu](https://github.com/goru001/inltk/issues/1) -and [Maithili](https://github.com/goru001/inltk/issues/10) support +* Add [Maithili](https://github.com/goru001/inltk/issues/10) support #### ..and NOT being worked upon