Using fastText to Detect Language


In [4]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[K     |████████████████████████████████| 68 kB 2.3 MB/s 
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.7.1-py2.py3-none-any.whl (200 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3092527 sha256=ddeba3aba6e6916e44d46bca9862ed87a33fb12e8f9d383f34230294b42706df
  Stored in directory: /root/.cache/pip/wheels/4e/ca/bf/b020d2be95f7641801a6597a29c8f4f19e38f9c02a345bab9b
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.7.1


In [5]:
!wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin

--2021-08-31 22:06:38--  https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 172.67.9.4, 104.22.75.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 131266198 (125M) [application/octet-stream]
Saving to: ‘lid.176.bin’


2021-08-31 22:06:44 (20.9 MB/s) - ‘lid.176.bin’ saved [131266198/131266198]



In [6]:
import fasttext
import nltk
# Download the 'punkt' tokenizer package (tokenizer data used by nltk's word_tokenize).
nltk.download('punkt')
from nltk.tokenize import word_tokenize
# Loads the downloaded model file; the result is not assigned to a name,
# so it only appears as this cell's output.
fasttext.load_model("/content/lid.176.bin")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.




<fasttext.FastText._FastText at 0x7fda87dfedd0>

In [5]:
# Tokenize from nltk
# word_tokenize("This is AMMI, je suis le professeur!")

In [7]:
from google.colab import drive
# Mount Google Drive so the dataset files under /content/drive/MyDrive can be opened later.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
class LanguageIdentification:
    """Thin wrapper around a pretrained fastText language-identification model."""

    def __init__(self, model_path="/content/lid.176.bin"):
        """Load the fastText model from disk.

        Args:
            model_path: Path to the pretrained model file. Defaults to the
                location this notebook has always loaded it from.
        """
        self.model = fasttext.load_model(model_path)

    def predict_lang(self, text):
        """Predict the most probable language for ``text``.

        Args:
            text: A string, or a list of strings, to classify.

        Returns:
            Whatever ``self.model.predict`` returns for the input; callers in
            this notebook index it as ``result[0][0]`` to get the top label.
        """
        # fastText's predict() raises ValueError on input containing '\n'
        # (it processes one line at a time), so flatten newlines to spaces first.
        if isinstance(text, str):
            cleaned = text.replace("\n", " ")
        else:
            cleaned = [entry.replace("\n", " ") for entry in text]
        predictions = self.model.predict(cleaned)
        return predictions

# Notebook-level identifier instance; constructing it loads the fastText model.
language = LanguageIdentification()



In [11]:
# Identifier instance that later cells pass to getLang.
lid = LanguageIdentification()
def getLang(lid, word):
  """Return the fastText language code predicted for a single token.

  Args:
    lid: Object exposing ``predict_lang(text)`` whose result can be indexed
      as ``result[0][0]`` to obtain the top label (e.g. ``__label__en``).
    word: The token to classify.

  Returns:
    The language code with the ``__label__`` prefix removed, e.g. ``'en'``
    or ``'es'``.
  """
  # Token-level identification in fastText. Query the identifier that was
  # passed in rather than a notebook-level global.
  lang = lid.predict_lang(word)  # Returns lang label (e.g. __label__en) and probability
  top_label = lang[0][0]
  prefix = "__label__"
  # Strip the prefix instead of slicing the last two characters, so that
  # three-letter codes are returned intact rather than truncated.
  label = top_label[len(prefix):] if top_label.startswith(prefix) else top_label
  return label

In [11]:
# Single-token check: a Spanish word should come back as 'es'.
text = "Esta"
getLang(lid, text)



'es'

In [12]:
from tqdm import tqdm

# Gold tags that do not name a single concrete language; tokens carrying them are left out.
SKIP_TAGS = {'other', 'ambiguous', 'unk', 'mixed', 'fw', 'ne'}

# Parallel lists: the language tag given in the file and the fastText prediction for the same token.
original_list = []
pred_list = []
lid = LanguageIdentification()
# Explicit UTF-8 so accented tokens decode the same way regardless of the platform's default locale.
with open('/content/drive/MyDrive/lid_spaeng/train.conll', encoding='utf-8') as f:
  lines = f.readlines()  # read everything up front so tqdm can report a total

for line in tqdm(lines):
  # Skip lines that start with '#' (sentence enumerations and hashtags) and
  # separator lines, including ones that contain only whitespace.
  if line[0] == '#' or not line.strip():
    continue
  word, lang = line.split()  # split line into word and language tag
  if lang in SKIP_TAGS:
    continue
  # Use fastText to predict the language of the word; keep gold and prediction aligned.
  pred_list.append(getLang(lid, word))
  original_list.append(lang)

100%|██████████| 295281/295281 [00:03<00:00, 89224.82it/s]


In [13]:
# Gold tags that do not name a single concrete language; tokens carrying them are left out.
SKIP_TAGS = {'other', 'ambiguous', 'unk', 'mixed', 'fw', 'ne'}

# Parallel lists: the language tag given in the file and the fastText prediction for the same token.
devoriginal_list = []
devpred_list = []
lid = LanguageIdentification()

# Explicit UTF-8 so accented tokens decode the same way regardless of the platform's default locale.
with open('/content/drive/MyDrive/lid_spaeng/dev.conll', encoding='utf-8') as dev:
  lines = dev.readlines()  # read everything up front so tqdm can report a total

for line in tqdm(lines):
  # Skip lines that start with '#' (sentence enumerations and hashtags) and
  # separator lines, including ones that contain only whitespace.
  if line[0] == '#' or not line.strip():
    continue
  word, lang = line.split()  # split line into word and language tag
  if lang in SKIP_TAGS:
    continue
  # Use fastText to predict the language of the word; keep gold and prediction aligned.
  devpred_list.append(getLang(lid, word))
  devoriginal_list.append(lang)

100%|██████████| 47055/47055 [00:00<00:00, 114054.30it/s]


In [14]:
# Sanity check: each prediction list should be exactly as long as its gold list
# (train pair first, then dev pair), one length per output line.
print(
    len(pred_list),
    len(original_list),
    len(devpred_list),
    len(devoriginal_list),
    sep='\n',
)

189265
189265
31405
31405


In [15]:
def compare(list1, list2):
  """Print how many positions of two equally long lists hold equal values.

  Prints the number of matching positions, the number of mismatches and the
  fraction of matches (accuracy), one per line.

  Args:
    list1: First sequence of labels (e.g. predictions).
    list2: Second sequence of labels (e.g. gold tags).

  Returns:
    None; all results are printed. If the lengths differ, only the warning
    is printed, because a position-wise comparison would be meaningless
    (and indexing would fail when list2 is the shorter one).
  """
  if len(list1) != len(list2):
    print("Lists are not the same length")
    return

  list_len = len(list1)
  correct = sum(1 for left, right in zip(list1, list2) if left == right)

  diff = list_len - correct
  print("Correct: ", correct)
  print("Diff: ", diff)
  # Empty input has no tokens to score; report 0.0 instead of dividing by zero.
  print(correct / list_len if list_len else 0.0)

In [16]:
# Token-level match count and accuracy of the fastText predictions against the file's tags (train split)
compare(pred_list, original_list)

Correct:  121021
Diff:  68244
0.6394262013578844


In [17]:
# Token-level match count and accuracy of the fastText predictions against the file's tags (dev split)
compare(devpred_list, devoriginal_list)

Correct:  21127
Diff:  10278
0.6727272727272727


In [22]:
from sklearn.metrics import f1_score
import numpy as np

# Micro-averaged F1 of the fastText predictions against the file's tags on the train split.
f1_train = f1_score(
    y_true=np.array(original_list),
    y_pred=np.array(pred_list),
    average='micro',
)
print(f1_train)

0.6394262013578844
