In [15]:
!pip install langid



In [16]:
from langid import langid
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
text = "I am here"
lang = langid.classify(text)
print(lang)

('en', -6.440452575683594)


In [18]:
langid.rank(text)

[('en', -6.440452575683594),
 ('de', -7.602324962615967),
 ('lb', -12.375750064849854),
 ('rw', -12.908110618591309),
 ('ro', -12.978541374206543),
 ('es', -13.397968292236328),
 ('cy', -13.39807653427124),
 ('it', -14.155198097229004),
 ('br', -14.201417922973633),
 ('da', -14.257980346679688),
 ('sq', -14.292641162872314),
 ('pt', -14.425897121429443),
 ('zh', -14.7821626663208),
 ('nl', -14.963902950286865),
 ('fr', -15.118206024169922),
 ('la', -15.191981315612793),
 ('ga', -15.509129524230957),
 ('id', -15.718591690063477),
 ('ku', -16.035820484161377),
 ('ms', -16.112281799316406),
 ('ar', -16.47485589981079),
 ('se', -16.542413115501404),
 ('ht', -16.566608428955078),
 ('af', -16.632877826690674),
 ('lo', -16.72516632080078),
 ('eo', -16.803476810455322),
 ('qu', -16.862179279327393),
 ('et', -17.037346363067627),
 ('is', -17.109008312225342),
 ('tl', -17.202204942703247),
 ('fo', -17.218571662902832),
 ('xh', -17.247032642364502),
 ('sw', -17.37736177444458),
 ('ca', -17.471621

In [19]:
def getLang(s):
  lang = langid.classify(s)  # Returns lang label (e.g. 'en', -4.3590) and probability. Higher is best rank!
  label = lang[0] # returns just the part to label the language
  return label

getLang("Je suis")  # English!?!

'en'

In [20]:
text = "This is AMMI, je suis le professeur!"
tokens = word_tokenize(text)
l = []

for i in range(len(tokens)):
  lang_rank = langid.classify(tokens[i])
  label = lang_rank[0]
  l.append(label)

print(l)

['en', 'en', 'en', 'en', 'en', 'en', 'en', 'fr', 'en']


In [21]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
from tqdm import tqdm

# Two lists to hold language specified in file and another for predicted languages
original_list = []
pred_list = []

with open('/content/drive/MyDrive/lid_spaeng/train.conll') as f:
  # num for number in enumerate and line for each line read
  lines = f.readlines()
  for num, line in tqdm(enumerate(lines), total=len(lines)):
    # Lines that start with #, sent enums and hashtags
    if (line[0] == '#'):
      continue
    elif (line == '\n'):  # skip separator lines too
      continue
    else:
      word, lang = line.split() # split lines to word and language
      if ((lang == 'other') or (lang == 'ambiguous') or (lang == 'unk') or (lang == 'mixed') or (lang == 'fw') or (lang == 'ne')): # Skip tokens with other language tags too
        continue
      else:
        pred_lang = getLang(word) # Use getLang to predict language of word and append it to a list
        pred_list.append(pred_lang)
        original_list.append(lang)  # A list to hold the languages as specified in the file

100%|██████████| 295281/295281 [03:26<00:00, 1429.60it/s]


In [23]:
# Two lists to hold language specified in file and another for predicted languages
devoriginal_list = []
devpred_list = []

with open('/content/drive/MyDrive/lid_spaeng/dev.conll') as dev:
  # num for number in enumerate and line for each line read
  lines = dev.readlines()
  for num, line in tqdm(enumerate(lines), total=len(lines)):
    # Lines that start with #, sent enums and hashtags
    if (line[0] == '#'):
      continue
    elif (line == '\n'):  # skip separator lines too
      continue
    else:
      word, lang = line.split() # split lines to word and language
      if ((lang == 'other') or (lang == 'ambiguous') or (lang == 'unk') or (lang == 'mixed') or (lang == 'fw') or (lang == 'ne')): # Skip tokens with other language tags too: # Skip tokens with other language tags too
        continue
      else:
        pred_lang = getLang(word) # Use getLang to predict language of word and append it to a list
        devpred_list.append(pred_lang)
        devoriginal_list.append(lang)  # A list to hold the languages as specified in the file

100%|██████████| 47055/47055 [00:31<00:00, 1511.45it/s]


In [24]:
print(len(pred_list))
print(len(original_list))

print(len(devpred_list))
print(len(devoriginal_list))

189265
189265
31405
31405


In [25]:
from sklearn.metrics import f1_score
import numpy as np

f1_train = f1_score(np.array(original_list), np.array(pred_list), average='micro')
print(f1_train)

0.48773941299236523


In [26]:
def compare(lang_class, pred_list, label_list):
  '''
  Args: Class of language, prediction list and true label list
  Returns tp, fp and fn which will be used to compute Precision and Recall 
  '''
  if (len(pred_list) != len(label_list)):
    print("Lists are not the same length")

  list_len = len(pred_list)
  correct = 0
  fn = 0

  for i in range(list_len):
    if (pred_list[i] == label_list[i]):
      correct += 1
      
  tp = correct
  fp = list_len - correct

  for i in range(list_len):
    if ((pred_list[i] != lang_class) and (lang_class == label_list[i])):
      fn += 1  

  return tp, fp, fn

# Compute Precision
def precision(tp, fp):
  return (tp)/(tp + fp)

# Compute Recall
def recall(tp, fn):
  return (tp) / (tp + fn)

def f1_score_lang(precision_num, recall_num):
  return 2*((precision_num * recall_num)/(precision_num + recall_num)) 

In [27]:
# English Class
tp_en, fp_en, fn_en = compare('en', pred_list, original_list)
precision_en = precision(tp_en, fp_en)
recall_en = recall(tp_en, fn_en)

f1_en = f1_score_lang(precision_en, recall_en)
print(f1_en)

0.6453669654217761


In [28]:
# Spanish Class

tp_es, fp_es, fn_es = compare('es', pred_list, original_list)
precision_es = precision(tp_es, fp_es)
recall_es = recall(tp_es, fn_es)

f1_es = f1_score_lang(precision_es, recall_es)
print(f1_es)

0.4936061449452051
