<img src="https://github.com/UBC-NLP/afrolid/raw/main/images/afrolid_logo.jpg">

AfroLID is a neural language identification (LID) toolkit for 517 African languages and varieties. It exploits a multi-domain web dataset manually curated from across 14 language families utilizing five orthographic systems. AfroLID is described in this paper: 
[**AfroLID: A Neural Language Identification Tool for African Languages**](https://arxiv.org/abs/2210.11744).


# LID with AfroLID

In [1]:
!pip install -U git+https://github.com/UBC-NLP/afrolid.git --q

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.4/125.4 KB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m750.6/750.6 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.0/11.0 MB[0m [31m94.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 KB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 KB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.8/123.8 KB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m271.8/271.8 KB[0m [31m24.9 MB/s[0m eta

In [2]:
! wget https://demos.dlnlp.ai/afrolid/afrolid_model.tar.gz
!tar -xf afrolid_model.tar.gz

--2023-03-02 08:22:52--  https://demos.dlnlp.ai/afrolid/afrolid_model.tar.gz
Resolving demos.dlnlp.ai (demos.dlnlp.ai)... 74.208.236.113, 2607:f1c0:100f:f000::264
Connecting to demos.dlnlp.ai (demos.dlnlp.ai)|74.208.236.113|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2277022086 (2.1G) [application/gzip]
Saving to: ‘afrolid_model.tar.gz’


2023-03-02 08:23:53 (36.0 MB/s) - ‘afrolid_model.tar.gz’ saved [2277022086/2277022086]



In [3]:
import os, sys
import logging
from afrolid.main import classifier

In [4]:
# Configure root logging with a timestamped "time | level | logger | message" format.
# The level is read from the LOGLEVEL environment variable and defaults to INFO.
logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=os.environ.get("LOGLEVEL", "INFO").upper(),
    force=True, # Resets any previous configuration
)
# Named logger that is passed to the AfroLID classifier below.
logger = logging.getLogger("afrolid")


In [5]:
# Build the AfroLID classifier, pointing it at the model directory on disk.
# NOTE(review): presumably this is the directory extracted from afrolid_model.tar.gz;
# the absolute /content path assumes a Colab runtime — confirm before running elsewhere.
cl = classifier(logger, model_path="/content/afrolid_model")

2023-03-02 08:24:40 | INFO | afrolid | Initalizing AfroLID's task and model.


| [input] dictionary: 64001 types
| [label] dictionary: 528 types


In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Locations of the tweet JSON files on the mounted Google Drive, one per dataset.
# NOTE(review): despite the *_dir suffix these are file paths, not directories.
SA_dir = r'/content/drive/MyDrive/MIT/MIT 807 mini-dissertation/Data/SA-tweets.json'
KEN_dir = r'/content/drive/MyDrive/MIT/MIT 807 mini-dissertation/Data/KEN-tweets.json'
TZ_dir = r'/content/drive/MyDrive/MIT/MIT 807 mini-dissertation/Data/TZ-tweets.json' 

In [8]:
import pandas as pd
import json
# Select which dataset to analyse in the rest of the notebook.
# NOTE(review): `dir` shadows the Python builtin of the same name; the name is
# kept here only so that any cell reading it continues to work.
dir = TZ_dir

# open json files
# The file holds a JSON object; each value is handed to pd.read_json to build
# one DataFrame per key. The tweets contain emoji and other non-ASCII text, so
# decode explicitly as UTF-8 instead of relying on the platform default encoding.
with open(dir, 'r', encoding='utf-8') as f:
    dfs = {k: pd.read_json(v) for k, v in json.load(f).items()}

2023-03-02 08:25:16 | INFO | numexpr.utils | NumExpr defaulting to 2 threads.


In [9]:
from tqdm import tqdm
tqdm.pandas()

# Stack the per-key frames into a single table and expose the originating
# dict key as a regular 'key' column.
df = (
    pd.concat(dfs, keys=dfs.keys())
    .reset_index(level=1, drop=True)   # drop each frame's own row index
    .reset_index()                     # move the dict key out of the index
    .rename(columns={'index': 'key'})
)
df

Unnamed: 0,key,Datetime,Tweet Id,Text,Username,Location
0,daladala,2023-02-08 07:49:38,1623227542218985473,"Dar Es Salaam watu wana hasira sana, ukimgusa ...",fadhilikangusi,"Dar Es Salaam, Tanzania"
1,daladala,2023-01-31 13:00:19,1620406624908345344,Muonekano wa Kituo Kipya cha Daladala cha Kiny...,raphyrodrick,"Dar es Salaam, Tanzania"
2,daladala,2023-01-27 06:19:14,1618856135645356034,Lakini kumpisha mtu mzima kwenye seat ya dalad...,DejohB,"Dar es Salaam, Tanzania"
3,daladala,2022-09-24 18:21:32,1573739424382423043,Kuna huyu mtu ana hadithia hapa eti anamwaka m...,AsiahSalum,"Dar es Salaam, Tanzania"
4,daladala,2022-09-19 09:35:17,1571795051818192898,Hongera sana @SuluhuSamia kwa Kupata Siti nzu...,ExMayorUbungo,Dar es salaam
...,...,...,...,...,...,...
993,bajaj,2019-03-01 04:42:02,1101341804618690560,Tanzania Trade Fair 2019 ... participation of ...,beenakoshy,
994,bajaj,2019-03-01 04:37:31,1101340668473929728,Bajaj Electricals at the Tanzania trade fair 2...,beenakoshy,
995,bajaj,2018-12-21 07:38:29,1076019057890074624,"In Dar Es Salaam, The Business and Busiest Cit...",mtaliitravels,"Dar es Salaam, Tanzania"
996,bajaj,2018-07-18 14:49:14,1019594953532600320,Safari bora 🚗 huanza unapoendeshwa kwa Bajaj u...,MondoRide255,"Dar es Salaam, Tanzania"


In [10]:
def get_afrolid_prediction(text):
  """Return AfroLID's top language prediction for ``text``.

  Args:
    text: The string to classify.

  Returns:
    A 4-tuple ``(iso_code, score, name, script)`` taken from the first entry
    of the classifier's result. If the classifier returns nothing, a tuple of
    four ``None`` values is returned so that callers unpacking the results
    with ``zip(*...)`` still receive a 4-tuple for every row.
  """
  predictions = cl.classify(text, max_outputs=1)
  if not predictions:
    return None, None, None, None
  # Only one output is requested, so the first key is the top prediction.
  lang = next(iter(predictions))
  info = predictions[lang]
  return lang, info['score'], info['name'], info['script']

In [11]:
import nltk
nltk.download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

True

In [12]:
from nltk.tokenize import ToktokTokenizer
import re

# clean data and remove punctuation characters
# Tokenizer used by clean_punct to split each text into tokens.
token = ToktokTokenizer()
# Characters removed from every token. This is string.punctuation without '-',
# so hyphens are preserved.
punct = '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~'

def clean_punct(text):
    """Tokenize ``text`` and strip punctuation characters from every token.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned tokens joined by single spaces. Tokens that consist only
        of punctuation are dropped instead of being joined as empty strings,
        which previously left doubled spaces in the output.
    """
    # Character class matching any single character listed in the module-level `punct`.
    punct_re = re.compile('[%s]' % re.escape(punct))
    stripped = (punct_re.sub('', w) for w in token.tokenize(text))
    # Skip tokens that became empty after the substitution.
    return ' '.join(w for w in stripped if w)
  
# Replace every entry of the Text column with its punctuation-free version.
df['Text'] = df['Text'].apply(clean_punct)
df


Unnamed: 0,key,Datetime,Tweet Id,Text,Username,Location
0,daladala,2023-02-08 07:49:38,1623227542218985473,Dar Es Salaam watu wana hasira sana ukimgusa ...,fadhilikangusi,"Dar Es Salaam, Tanzania"
1,daladala,2023-01-31 13:00:19,1620406624908345344,Muonekano wa Kituo Kipya cha Daladala cha Kiny...,raphyrodrick,"Dar es Salaam, Tanzania"
2,daladala,2023-01-27 06:19:14,1618856135645356034,Lakini kumpisha mtu mzima kwenye seat ya dalad...,DejohB,"Dar es Salaam, Tanzania"
3,daladala,2022-09-24 18:21:32,1573739424382423043,Kuna huyu mtu ana hadithia hapa eti anamwaka m...,AsiahSalum,"Dar es Salaam, Tanzania"
4,daladala,2022-09-19 09:35:17,1571795051818192898,Hongera sana SuluhuSamia kwa Kupata Siti nzuri...,ExMayorUbungo,Dar es salaam
...,...,...,...,...,...,...
993,bajaj,2019-03-01 04:42:02,1101341804618690560,Tanzania Trade Fair 2019 participation of Baj...,beenakoshy,
994,bajaj,2019-03-01 04:37:31,1101340668473929728,Bajaj Electricals at the Tanzania trade fair 2...,beenakoshy,
995,bajaj,2018-12-21 07:38:29,1076019057890074624,In Dar Es Salaam The Business and Busiest Cit...,mtaliitravels,"Dar es Salaam, Tanzania"
996,bajaj,2018-07-18 14:49:14,1019594953532600320,Safari bora 🚗 huanza unapoendeshwa kwa Bajaj u...,MondoRide255,"Dar es Salaam, Tanzania"


## Get LID using AfroLID

In [13]:
# Run AfroLID on every row of the Text column (with a tqdm progress bar) and
# unpack the per-row 4-tuples into four new columns: ISO code, score, name, script.
df['predict_iso_afrolid'], df['predict_score_afrolid'], df['predict_name_afrolid'], df['predict_script_afrolid'] = zip(*df['Text'].progress_apply(get_afrolid_prediction))
df

  0%|          | 0/998 [00:00<?, ?it/s]2023-03-02 08:25:22 | INFO | afrolid | Input text: Dar Es Salaam watu wana hasira sana  ukimgusa kidogo kwenye daladala anakupa bonge la tusi 
  0%|          | 2/998 [00:00<04:36,  3.61it/s]2023-03-02 08:25:23 | INFO | afrolid | Input text: Muonekano wa Kituo Kipya cha Daladala cha Kinyerezi pamoja na Barabara ya Lami  KM 71  Manispaa ya Ilala Kituo hiki kina uwezo wa kupokea Daladala 90 kwa wakati mmoja Ujenzi umetekelezwa na ortamisemitz kupitia Mradi wa DMDP Jijini Dar es Salaam httpstcos8xjQXsnLx
  0%|          | 3/998 [00:01<08:14,  2.01it/s]2023-03-02 08:25:24 | INFO | afrolid | Input text: Lakini kumpisha mtu mzima kwenye seat ya daladala sio part ya Maadili Mazuri kwa Upande wa Dar es salaam
  0%|          | 4/998 [00:02<10:15,  1.61it/s]2023-03-02 08:25:25 | INFO | afrolid | Input text: Kuna huyu mtu ana hadithia hapa eti anamwaka mzima hajawahi kukaa kwenye daladala Dar es salaam  always huwa anasimama tu🙌😂
  1%|          | 5/998 [00:02<

Unnamed: 0,key,Datetime,Tweet Id,Text,Username,Location,predict_iso_afrolid,predict_score_afrolid,predict_name_afrolid,predict_script_afrolid
0,daladala,2023-02-08 07:49:38,1623227542218985473,Dar Es Salaam watu wana hasira sana ukimgusa ...,fadhilikangusi,"Dar Es Salaam, Tanzania",swh,99.92,Swahili,Latin
1,daladala,2023-01-31 13:00:19,1620406624908345344,Muonekano wa Kituo Kipya cha Daladala cha Kiny...,raphyrodrick,"Dar es Salaam, Tanzania",swh,100.00,Swahili,Latin
2,daladala,2023-01-27 06:19:14,1618856135645356034,Lakini kumpisha mtu mzima kwenye seat ya dalad...,DejohB,"Dar es Salaam, Tanzania",swh,99.97,Swahili,Latin
3,daladala,2022-09-24 18:21:32,1573739424382423043,Kuna huyu mtu ana hadithia hapa eti anamwaka m...,AsiahSalum,"Dar es Salaam, Tanzania",swh,100.00,Swahili,Latin
4,daladala,2022-09-19 09:35:17,1571795051818192898,Hongera sana SuluhuSamia kwa Kupata Siti nzuri...,ExMayorUbungo,Dar es salaam,swh,100.00,Swahili,Latin
...,...,...,...,...,...,...,...,...,...,...
993,bajaj,2019-03-01 04:42:02,1101341804618690560,Tanzania Trade Fair 2019 participation of Baj...,beenakoshy,,swh,98.88,Swahili,Latin
994,bajaj,2019-03-01 04:37:31,1101340668473929728,Bajaj Electricals at the Tanzania trade fair 2...,beenakoshy,,swh,99.54,Swahili,Latin
995,bajaj,2018-12-21 07:38:29,1076019057890074624,In Dar Es Salaam The Business and Busiest Cit...,mtaliitravels,"Dar es Salaam, Tanzania",swh,85.81,Swahili,Latin
996,bajaj,2018-07-18 14:49:14,1019594953532600320,Safari bora 🚗 huanza unapoendeshwa kwa Bajaj u...,MondoRide255,"Dar es Salaam, Tanzania",swh,100.00,Swahili,Latin


In [14]:
df['predict_name_afrolid'].unique()

array(['Swahili', 'Somali', 'Hausa', 'Kinyarwanda', 'Tsonga', 'Wolof',
       'Bambara', 'Luganda', 'Isixhosa', 'Edo', 'Oromo, Borana-Arsi-Guji',
       'Gamo', 'Isizulu'], dtype=object)

# LID using CLD3

In [15]:
!pip install pycld3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycld3
  Downloading pycld3-0.22-cp38-cp38-manylinux1_x86_64.whl (13.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m81.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycld3
Successfully installed pycld3-0.22


In [16]:
import cld3

def get_cld3_prediction(text):
  """Return CLD3's language prediction for ``text``.

  Args:
    text: The string to classify.

  Returns:
    A 4-tuple built from items 0, 1, 3 and 2 of the CLD3 result, in that
    order. The caller stores them as ISO code, score, proportion and
    is_reliable. If CLD3 returns no result (for example ``None``), a tuple of
    four ``None`` values is returned so that callers unpacking the results
    with ``zip(*...)`` still receive a 4-tuple for every row.
  """
  prediction = cld3.get_language(text)
  if not prediction:
    return None, None, None, None
  # Items 2 and 3 are deliberately swapped to match the caller's column order.
  return prediction[0], prediction[1], prediction[3], prediction[2]

In [18]:
# Run CLD3 on every row of the Text column (with a tqdm progress bar) and
# unpack the per-row 4-tuples into four new columns: ISO code, score, proportion, is_reliable.
df['predict_iso_cld3'], df['predict_score_cld3'], df['predict_proportion_cld3'], df['predict_is_reliable_cld3'] = zip(*df['Text'].progress_apply(get_cld3_prediction))
df

100%|██████████| 998/998 [00:00<00:00, 2266.91it/s]


Unnamed: 0,key,Datetime,Tweet Id,Text,Username,Location,predict_iso_afrolid,predict_score_afrolid,predict_name_afrolid,predict_script_afrolid,predict_iso_cld3,predict_score_cld3,predict_proportion_cld3,predict_is_reliable_cld3
0,daladala,2023-02-08 07:49:38,1623227542218985473,Dar Es Salaam watu wana hasira sana ukimgusa ...,fadhilikangusi,"Dar Es Salaam, Tanzania",swh,99.92,Swahili,Latin,sw,0.999916,1.0,True
1,daladala,2023-01-31 13:00:19,1620406624908345344,Muonekano wa Kituo Kipya cha Daladala cha Kiny...,raphyrodrick,"Dar es Salaam, Tanzania",swh,100.00,Swahili,Latin,sw,0.999891,1.0,True
2,daladala,2023-01-27 06:19:14,1618856135645356034,Lakini kumpisha mtu mzima kwenye seat ya dalad...,DejohB,"Dar es Salaam, Tanzania",swh,99.97,Swahili,Latin,sw,0.999998,1.0,True
3,daladala,2022-09-24 18:21:32,1573739424382423043,Kuna huyu mtu ana hadithia hapa eti anamwaka m...,AsiahSalum,"Dar es Salaam, Tanzania",swh,100.00,Swahili,Latin,sw,0.999682,1.0,True
4,daladala,2022-09-19 09:35:17,1571795051818192898,Hongera sana SuluhuSamia kwa Kupata Siti nzuri...,ExMayorUbungo,Dar es salaam,swh,100.00,Swahili,Latin,sw,0.999171,1.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
993,bajaj,2019-03-01 04:42:02,1101341804618690560,Tanzania Trade Fair 2019 participation of Baj...,beenakoshy,,swh,98.88,Swahili,Latin,mt,0.307086,1.0,False
994,bajaj,2019-03-01 04:37:31,1101340668473929728,Bajaj Electricals at the Tanzania trade fair 2...,beenakoshy,,swh,99.54,Swahili,Latin,en,0.381382,1.0,False
995,bajaj,2018-12-21 07:38:29,1076019057890074624,In Dar Es Salaam The Business and Busiest Cit...,mtaliitravels,"Dar es Salaam, Tanzania",swh,85.81,Swahili,Latin,en,0.999088,1.0,True
996,bajaj,2018-07-18 14:49:14,1019594953532600320,Safari bora 🚗 huanza unapoendeshwa kwa Bajaj u...,MondoRide255,"Dar es Salaam, Tanzania",swh,100.00,Swahili,Latin,sw,0.995121,1.0,True


In [19]:
df['predict_iso_cld3'].unique()

array(['sw', 'af', 'en', 'so', 'uz', 'pl', 'ms', 'yo', 'de', 'es',
       'el-Latn', 'ny', 'fr', 'zh-Latn', 'jv', 'ht', 'ig', 'lb', 'nl',
       'hmn', 'mt', 'su', 'cy', 'lt', 'id', 'fil', 'zh', 'la', 'lv', 'zu',
       'mg', 'gl', 'tr', 'eu', 'fy', 'ha', 'sn', 'gd', 'ceb', 'it',
       'ru-Latn', 'sv', 'hi-Latn', 'az', 'ga', 'et', 'ja', 'sr'],
      dtype=object)

# LID using Franc

In [20]:
!pip install pyfranc

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyfranc
  Downloading pyfranc-0.1.1-py3-none-any.whl (262 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m262.9/262.9 KB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyfranc
Successfully installed pyfranc-0.1.1


In [21]:
from pyfranc import franc

In [24]:
def get_franc_prediction(text):
  """Return Franc's top language candidate for ``text``.

  Args:
    text: The string to classify.

  Returns:
    A 2-tuple holding the first two items of the first candidate returned by
    ``franc.lang_detect``; the caller stores them as ISO code and score. If
    no candidates are returned, ``(None, None)`` is returned so that callers
    unpacking the results with ``zip(*...)`` still receive a 2-tuple for
    every row.
  """
  predictions = franc.lang_detect(text)
  if not predictions:
    return None, None
  top = predictions[0]
  return top[0], top[1]

In [25]:
# Run Franc on every row of the Text column (with a tqdm progress bar) and
# unpack the per-row 2-tuples into two new columns: ISO code and score.
df['predict_iso_franc'], df['predict_score_franc']= zip(*df['Text'].progress_apply(get_franc_prediction))
df

100%|██████████| 998/998 [00:32<00:00, 30.64it/s]


Unnamed: 0,key,Datetime,Tweet Id,Text,Username,Location,predict_iso_afrolid,predict_score_afrolid,predict_name_afrolid,predict_script_afrolid,predict_iso_cld3,predict_score_cld3,predict_proportion_cld3,predict_is_reliable_cld3,predict_iso_franc,predict_score_franc
0,daladala,2023-02-08 07:49:38,1623227542218985473,Dar Es Salaam watu wana hasira sana ukimgusa ...,fadhilikangusi,"Dar Es Salaam, Tanzania",swh,99.92,Swahili,Latin,sw,0.999916,1.0,True,swh,1.0
1,daladala,2023-01-31 13:00:19,1620406624908345344,Muonekano wa Kituo Kipya cha Daladala cha Kiny...,raphyrodrick,"Dar es Salaam, Tanzania",swh,100.00,Swahili,Latin,sw,0.999891,1.0,True,swh,1.0
2,daladala,2023-01-27 06:19:14,1618856135645356034,Lakini kumpisha mtu mzima kwenye seat ya dalad...,DejohB,"Dar es Salaam, Tanzania",swh,99.97,Swahili,Latin,sw,0.999998,1.0,True,swh,1.0
3,daladala,2022-09-24 18:21:32,1573739424382423043,Kuna huyu mtu ana hadithia hapa eti anamwaka m...,AsiahSalum,"Dar es Salaam, Tanzania",swh,100.00,Swahili,Latin,sw,0.999682,1.0,True,swh,1.0
4,daladala,2022-09-19 09:35:17,1571795051818192898,Hongera sana SuluhuSamia kwa Kupata Siti nzuri...,ExMayorUbungo,Dar es salaam,swh,100.00,Swahili,Latin,sw,0.999171,1.0,True,swh,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
993,bajaj,2019-03-01 04:42:02,1101341804618690560,Tanzania Trade Fair 2019 participation of Baj...,beenakoshy,,swh,98.88,Swahili,Latin,mt,0.307086,1.0,False,roh,1.0
994,bajaj,2019-03-01 04:37:31,1101340668473929728,Bajaj Electricals at the Tanzania trade fair 2...,beenakoshy,,swh,99.54,Swahili,Latin,en,0.381382,1.0,False,sco,1.0
995,bajaj,2018-12-21 07:38:29,1076019057890074624,In Dar Es Salaam The Business and Busiest Cit...,mtaliitravels,"Dar es Salaam, Tanzania",swh,85.81,Swahili,Latin,en,0.999088,1.0,True,eng,1.0
996,bajaj,2018-07-18 14:49:14,1019594953532600320,Safari bora 🚗 huanza unapoendeshwa kwa Bajaj u...,MondoRide255,"Dar es Salaam, Tanzania",swh,100.00,Swahili,Latin,sw,0.995121,1.0,True,swh,1.0


In [28]:
df['predict_iso_franc'].unique()

array(['swh', 'sco', 'roh', 'zdj', 'dag', 'som', 'oci', 'pam', 'cjk',
       'swb', 'lin', 'eng', 'ddn', 'hau', 'loz', 'spa', 'kng', 'nym',
       'crh', 'ina', 'nob', 'kqs', 'bum', 'afr', 'kha', 'cat', 'por',
       'jav', 'pcm', 'ace', 'fra', 'mxi', 'sun', 'min', 'wol', 'dan',
       'bem', 'tet', 'cbu', 'hlt', 'pau', 'dyo', 'src', 'wwa', 'fuf',
       'plt', 'ido', 'uzn', 'glg', 'zlm', 'swe', 'bis', 'snk', 'nba',
       'tdt', 'lld', 'ind', 'xsm', 'bcl', 'mad', 'ceb', 'deu', 'ilo',
       'cof', 'hil', 'suk', 'not', 'ktu', 'lit', 'bug', 'srr', 'tur'],
      dtype=object)

# Sanity check

In [27]:
df.head(3)

Unnamed: 0,key,Datetime,Tweet Id,Text,Username,Location,predict_iso_afrolid,predict_score_afrolid,predict_name_afrolid,predict_script_afrolid,predict_iso_cld3,predict_score_cld3,predict_proportion_cld3,predict_is_reliable_cld3,predict_iso_franc,predict_score_franc
0,daladala,2023-02-08 07:49:38,1623227542218985473,Dar Es Salaam watu wana hasira sana ukimgusa ...,fadhilikangusi,"Dar Es Salaam, Tanzania",swh,99.92,Swahili,Latin,sw,0.999916,1.0,True,swh,1.0
1,daladala,2023-01-31 13:00:19,1620406624908345344,Muonekano wa Kituo Kipya cha Daladala cha Kiny...,raphyrodrick,"Dar es Salaam, Tanzania",swh,100.0,Swahili,Latin,sw,0.999891,1.0,True,swh,1.0
2,daladala,2023-01-27 06:19:14,1618856135645356034,Lakini kumpisha mtu mzima kwenye seat ya dalad...,DejohB,"Dar es Salaam, Tanzania",swh,99.97,Swahili,Latin,sw,0.999998,1.0,True,swh,1.0


In [None]:
# decode the ISO codes to language names for cld3 and franc




### Check that the languages ID'd are spoken within East Africa

**ISO 639-1 codes for languages spoken in East Africa (EA)**\
English - 'en'\
Swahili - 'sw'\
Ganda - 'lg'\
Kirundi - 'rn'\
French - 'fr'\
Somali - 'so'\
Arabic - 'ar'\
Amharic - 'am'\
Tigrinya - 'ti'\
Kinyarwanda - 'rw'


In [None]:
# check that the languages ID'd are spoken within East Africa

# Two-letter language codes considered East African for this check.
# NOTE(review): the predict_iso_afrolid and predict_iso_franc columns hold
# three-letter codes (e.g. 'swh'), so they will not match this list directly —
# confirm that a code-mapping step is planned before filtering.
EA_languages = ['en', 'sw', 'lg', 'rn', 'fr', 'so', 'ar', 'am', 'ti', 'rw']