Using Fasttext to detect Language


In [1]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[?25l[K     |████▊                           | 10 kB 29.5 MB/s eta 0:00:01[K     |█████████▌                      | 20 kB 30.2 MB/s eta 0:00:01[K     |██████████████▎                 | 30 kB 12.7 MB/s eta 0:00:01[K     |███████████████████             | 40 kB 10.2 MB/s eta 0:00:01[K     |███████████████████████▉        | 51 kB 10.2 MB/s eta 0:00:01[K     |████████████████████████████▋   | 61 kB 10.2 MB/s eta 0:00:01[K     |████████████████████████████████| 68 kB 4.4 MB/s 
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.7.1-py2.py3-none-any.whl (200 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3093670 sha256=690552ebe49f7f321adae6f3e9982538edd7c4d2bd55024d5a725dc61fc68fcf
  Stored in directory: /root/.cache/pip/wheels/4e/ca/bf/b020d2be95f7641801a

In [2]:
!wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin

--2021-09-26 16:10:28--  https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.75.142, 104.22.74.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 131266198 (125M) [application/octet-stream]
Saving to: ‘lid.176.bin’


2021-09-26 16:10:42 (9.93 MB/s) - ‘lid.176.bin’ saved [131266198/131266198]



In [3]:
import fasttext
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
fasttext.load_model("/content/lid.176.bin")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.




<fasttext.FastText._FastText at 0x7f0ba472cd10>

In [4]:
# Tokenize from nltk
# word_tokenize("This is AMMI, je suis le professeur!")

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
class LanguageIdentification:

    def __init__(self):
        pretrained_lang_model = "/content/lid.176.bin"
        self.model = fasttext.load_model(pretrained_lang_model)

    def predict_lang(self, text):
        predictions = self.model.predict(text)    # returns the language with highest probability
        return predictions

language = LanguageIdentification()



In [7]:
lid = LanguageIdentification()
def getLang(lid, word):
  # Token level identification in Fastttext
  lang = language.predict_lang(word)  # Returns lang label (e.g. label_en) and probability
  label = lang[0][0][-2::] # returns the last two chars like en or es
  return label



In [8]:
text = "Esta"
getLang(lid, text)

'es'

In [9]:
from tqdm import tqdm

# Two lists to hold language specified in file and another for predicted languages
original_list = []
pred_list = []
lid = LanguageIdentification()
with open('/content/drive/MyDrive/lid_spaeng/train.conll') as f:
  # num for number in enumerate and line for each line read
  lines = f.readlines()
  for num, line in tqdm(enumerate(lines), total=len(lines)):
    # Lines that start with #, sent enums and hashtags
    if (line[0] == '#'):
      continue
    elif (line == '\n'):  # skip separator lines too
      continue
    else:
      word, lang = line.split() # split lines to word and language
      if ((lang == 'other') or (lang == 'ambiguous') or (lang == 'unk') or (lang == 'mixed') or (lang == 'fw') or (lang == 'ne')): # Skip tokens with other language tags too
        continue
      else:
        pred_lang = getLang(lid, word) # Use Fasttext to predict language of word and append it to a list
        pred_list.append(pred_lang)
        original_list.append(lang)  # A list to hold the languages as specified in the file

100%|██████████| 295281/295281 [00:02<00:00, 106761.94it/s]


In [10]:
# Two lists to hold language specified in file and another for predicted languages
devoriginal_list = []
devpred_list = []
lid = LanguageIdentification()

with open('/content/drive/MyDrive/lid_spaeng/dev.conll') as dev:
  # num for number in enumerate and line for each line read
  lines = dev.readlines()
  for num, line in tqdm(enumerate(lines), total=len(lines)):
    # Lines that start with #, sent enums and hashtags
    if (line[0] == '#'):
      continue
    elif (line == '\n'):  # skip separator lines too
      continue
    else:
      word, lang = line.split() # split lines to word and language
      if ((lang == 'other') or (lang == 'ambiguous') or (lang == 'unk') or (lang == 'mixed') or (lang == 'fw') or (lang == 'ne')): # Skip tokens with other language tags too: # Skip tokens with other language tags too
        continue
      else:
        pred_lang = getLang(lid, word) # Use Fasttext to predict language of word and append it to a list
        devpred_list.append(pred_lang)
        devoriginal_list.append(lang)  # A list to hold the languages as specified in the file

100%|██████████| 47055/47055 [00:00<00:00, 109247.28it/s]


In [11]:
print(len(pred_list))
print(len(original_list))

print(len(devpred_list))
print(len(devoriginal_list))

189265
189265
31405
31405


In [12]:
def compare(list1, list2):
  if (len(list1) != len(list2)):
    print("Lists are not the same length")

  list_len = len(list1)
  correct = 0

  for i in range(list_len):
    if (list1[i] == list2[i]):
      correct += 1
  
  diff = list_len - correct
  print("Correct: ", correct)
  print("Diff: ", diff)
  print(float(correct)/float(list_len))  

In [13]:
# Compare lists for train
compare(pred_list, original_list)

Correct:  121021
Diff:  68244
0.6394262013578844


In [14]:
# Compare lists for dev
compare(devpred_list, devoriginal_list)

Correct:  21127
Diff:  10278
0.6727272727272727


In [15]:
from sklearn.metrics import f1_score
import numpy as np

f1_train = f1_score(np.array(original_list), np.array(pred_list), average='micro')
print(f1_train)

0.6394262013578844


In [16]:
print(pred_list)
print(original_list)

['en', 'en', 'fr', 'fr', 'fr', 'sv', 'fr', 'sv', 'en', 'uk', 'en', 'en', 'en', 'en', 'en', 'it', 'en', 'pt', 'es', 'gl', 'es', 'lt', 'es', 'eo', 'es', 'gl', 'km', 'es', 'es', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'de', 'en', 'en', 'fr', 'th', 'en', 'en', 'en', 'eb', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'fr', 'es', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'es', 'ja', 'uk', 'en', 'en', 'en', 'en', 'br', 'en', 'en', 'en', 'en', 'zh', 'en', 'en', 'en', 'es', 'ja', 'uk', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'ls', 'en', 'en', 'pt', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'pl', 'en', 'en', 'en', 'en', 'pt', 'en', 'fr', 'it', 'es', 'en', 'en', 'en', 'en', 'it', 'it', 'en', 'it', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'eo', 'es', 'en', 'es', 'pl', 'es', 'pt', 'es', 'es', 'es', 'de', 'en', 'en', 'en', 'en', 'en', 'es', 'en', 'en', 'en', 'en', 'en', 'en', 'es', 'en', 'en', 'es', 'es', 'en', 'es', 'es', 'gl', 'fr', 'es', 'es', 'en

In [17]:
print(devpred_list)
print(devoriginal_list)

['en', 'pt', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'de', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'es', 'en', 'gl', 'it', 'eo', 'de', 'es', 'it', 'fr', 'en', 'pt', 'es', 'km', 'bk', 'it', 'fr', 'fr', 'en', 'en', 'es', 'si', 'es', 'pt', 'es', 'ro', 'pt', 'es', 'pt', 'es', 'pt', 'es', 'es', 'pt', 'fr', 'es', 'es', 'es', 'en', 'es', 'en', 'en', 'es', 'fr', 'en', 'en', 'en', 'de', 'en', 'de', 'ia', 'es', 'es', 'en', 'fr', 'es', 'es', 'en', 'nl', 'fr', 'pt', 'es', 'es', 'es', 'es', 'es', 'bo', 'es', 'es', 'es', 'es', 'es', 'pt', 'es', 'es', 'gl', 'fr', 'es', 'it', 'gl', 'es', 'en', 'es', 'ro', 'es', 'pt', 'en', 'fi', 'es', 'es', 'pt', 'it', 'es', 'fr', 'es', 'es', 'es', 'pt', 'es', 'es', 'km', 'es', 'it', 'ia', 'lt', 'pt', 'es', 'it', 'eo', 'es', 'ia', 'es', 'it', 'es', 'es', 'es', 'es', 'es', 'eo', 'fr', 'es', 'eo', 'fr', 'pt', 'it', 'it', 'id', 'id', 'it', 'es', 'sw', 'es', 'pt', 'es', 'es', 'es', 'bo', 'es', 'es', 'bo', 'eo', 'pt', 'gl', 'pt', 'es', 'es', 'bo', 'gl', 'es', 'fr', 'pt

In [18]:
def compare(lang_class, pred_list, label_list):
  '''
  Args: Class of language, prediction list and true label list
  Returns tp, fp and fn which will be used to compute Precision and Recall 
  '''
  if (len(pred_list) != len(label_list)):
    print("Lists are not the same length")

  list_len = len(pred_list)
  correct = 0
  fn = 0

  for i in range(list_len):
    if (pred_list[i] == label_list[i]):
      correct += 1
      
  tp = correct
  fp = list_len - correct

  for i in range(list_len):
    if ((pred_list[i] != lang_class) and (lang_class == label_list[i])):
      fn += 1  

  return tp, fp, fn

# Compute Precision
def precision(tp, fp):
  return (tp)/(tp + fp)

# Compute Recall
def recall(tp, fn):
  return (tp) / (tp + fn)

def f1_score_lang(precision_num, recall_num):
  return 2*((precision_num * recall_num)/(precision_num + recall_num)) 

In [19]:
# English Class
tp_en, fp_en, fn_en = compare('en', pred_list, original_list)
precision_en = precision(tp_en, fp_en)
recall_en = recall(tp_en, fn_en)

f1_en = f1_score_lang(precision_en, recall_en)
print(f1_en)

0.7420526765978189


In [20]:
# Spanish Class

tp_es, fp_es, fn_es = compare('es', pred_list, original_list)
precision_es = precision(tp_es, fp_es)
recall_es = recall(tp_es, fn_es)

f1_es = f1_score_lang(precision_es, recall_es)
print(f1_es)

0.6674498189649706
