## Prepare new datasets and explore languages jan 23
Use new labelled dev, and group languages by family

In [1]:
%reload_ext autoreload
%autoreload 2
import pandas as pd
import os, json
from pathlib import Path
from src.pp_data import *
from zipfile import ZipFile
from collections import defaultdict

In [17]:
# Explore zipfiles
LANGUAGES = ['dz', 'am', 'yo', 'twi', 'pcm', 'pt', 'ma', 'ha', 'ig', 'sw', 'ts', 'kr', 'multilingual']
lang_groups = {} # 2-letter key, group as valus
DEV_ONLY = ["or", "tg"]
train_f = "datasets/train_final"
dev_f = "datasets/dev_final"
train_paths = Path(train_f).iterdir()
dev_paths = Path(dev_f).iterdir()
for tp in dev_paths:
    language = tp.name.split("_")[0]
    if not language in LANGUAGES:
        print(language)

lg_df = pd.read_csv("datasets/data_23jan23/languages.csv", sep=";")
reg_lgs = lg_df["2-letter"].to_list()
print([l for l in LANGUAGES if l not in reg_lgs])
for i, row in lg_df.iterrows():
    lang_groups[row["2-letter"]] = row["Class-short"]
print(lang_groups)


or
tg
['multilingual']
{'ts': 'Volta-Congo', 'twi': 'Volta-Congo', 'kr': 'Volta-Congo', 'or': 'Afro-Asiatic-misc', 'tg': 'Afro-Asiatic-Semitic', 'ha': 'Afro-Asiatic-misc', 'yo': 'Volta-Congo', 'ig': 'Volta-Congo', 'pcm': 'Creole', 'am': 'Afro-Asiatic-Semitic', 'dz': 'Afro-Asiatic-Semitic', 'ma': 'Afro-Asiatic-Semitic', 'sw': 'Volta-Congo', 'pt': 'Creole'}


{'ts': 'Volta-Congo', 'twi': 'Volta-Congo', 'kr': 'Volta-Congo', 'or': 'Afro-Asiatic-misc', 'tg': 'Afro-Asiatic-Semitic', 'ha': 'Afro-Asiatic-misc', 'yo': 'Volta-Congo', 'ig': 'Volta-Congo', 'pcm': 'Creole', 'am': 'Afro-Asiatic-Semitic', 'dz': 'Afro-Asiatic-Semitic', 'ma': 'Afro-Asiatic-Semitic', 'sw': 'Volta-Congo', 'pt': 'Creole'}

In [14]:
# revert lang_groups

group_langs = defaultdict(list)
for lang, group in lang_groups.items():
    group_langs[group].append(lang)

with open("datasets/group_langs.json", "w", encoding="utf-8") as wf:
    json.dump(dict(group_langs), wf, ensure_ascii=False)
dict(group_langs)

{'Volta-Congo': ['ts', 'twi', 'kr', 'yo', 'ig', 'sw'],
 'Afro-Asiatic-misc': ['or', 'ha'],
 'Afro-Asiatic-Semitic': ['tg', 'am', 'dz', 'ma'],
 'Creole': ['pcm', 'pt']}

{'Volta-Congo': ['ts', 'twi', 'kr', 'yo', 'ig', 'sw'],
 'Afro-Asiatic-misc': ['or', 'ha'],
 'Afro-Asiatic-Semitic': ['tg', 'am', 'dz', 'ma'],
 'Creole': ['pcm', 'pt']}

### Language group training data
create one train set per group

In [2]:
# Gather train split length
DEV_ONLY = ["or", "tg"]
group_langs = {'Volta-Congo': ['ts', 'twi', 'kr', 'yo', 'ig', 'sw'],
 'Afro-Asiatic-Semitic': [ 'am', 'dz', 'ma'],
 'Creole': ['pcm', 'pt']} # Removed those without training data

lang_info = {}
for group, langs in group_langs.items():
    print(langs)
    group_dfs = []

    # train
    for lang in langs:
        df = pd.DataFrame()
        try:
            df = pd.read_csv(f"datasets/train_final/{lang}_train.tsv", sep="\t")
        except: 
            print("Training data not found:", group, lang)
        group_dfs.append(df)
        print(group, lang, len(df))
        lang_info[ lang]= len(df)
    group_df = pd.concat(group_dfs, ignore_index=True)
    df = group_df.sample(frac=1)
    df.to_csv(f"datasets/group_final/{group}_train.tsv", sep="\t", index=False)

   #  dev
    group_dfs = []
    for lang in langs:
        df = pd.DataFrame()
        try:
            df = pd.read_csv(f"datasets/dev_final/{lang}_dev_gold_label.tsv", sep="\t")
        except: 
            print("Training data not found:", group, lang)
        group_dfs.append(df)
        print(group, lang, len(df))
        lang_info[ lang]= len(df)
    group_df = pd.concat(group_dfs, ignore_index=True)
    df = group_df.sample(frac=1)
    df.to_csv(f"datasets/group_final/{group}_dev.tsv", sep="\t", index=False)

# lang_info


['ts', 'twi', 'kr', 'yo', 'ig', 'sw']
Volta-Congo ts 804
Volta-Congo twi 3481
Volta-Congo kr 3302
Volta-Congo yo 8522
Volta-Congo ig 10192
Volta-Congo sw 1810
Volta-Congo ts 203
Volta-Congo twi 388
Volta-Congo kr 827
Volta-Congo yo 2090
Volta-Congo ig 1841
Volta-Congo sw 453
['am', 'dz', 'ma']
Afro-Asiatic-Semitic am 5984
Afro-Asiatic-Semitic dz 1651
Afro-Asiatic-Semitic ma 5583
Afro-Asiatic-Semitic am 1497
Afro-Asiatic-Semitic dz 414
Afro-Asiatic-Semitic ma 1215
['pcm', 'pt']
Creole pcm 5121
Creole pt 3063
Creole pcm 1281
Creole pt 767


{'ts': 203,
 'twi': 388,
 'kr': 827,
 'yo': 2090,
 'ig': 1841,
 'sw': 453,
 'am': 1497,
 'dz': 414,
 'ma': 1215,
 'pcm': 1281,
 'pt': 767}

In [18]:
lg_df

Unnamed: 0,Language,2-letter,Ethnologue,Classification,Class-short
0,Xitsonga,ts,tso,"Niger-Congo, Atlantic-Congo, Volta-Congo, Benu...",Volta-Congo
1,Twi,twi,aka,"Niger-Congo, Atlantic-Congo, Volta-Congo, Kwa,...",Volta-Congo
2,Kinyarwanda,kr,kin,"Niger-Congo, Atlantic-Congo, Volta-Congo, Benu...",Volta-Congo
3,Oromo,or,orm,"Afro-Asiatic, Cushitic, East, Oromo",Afro-Asiatic-misc
4,Tigrinya,tg,tig,"Afro-Asiatic, Semitic, South, Ethiopian, North",Afro-Asiatic-Semitic
5,Hausa,ha,hau,"Afro-Asiatic, Chadic, West, A, A.1",Afro-Asiatic-misc
6,Yoruba,yo,yor,"Niger-Congo, Atlantic-Congo, Volta-Congo, Benu...",Volta-Congo
7,Igbo,ig,na,"Niger-Congo, Atlantic-Congo, Volta-Congo, Benu...",Volta-Congo
8,Nigerian_Pidgin,pcm,na,English Creole,Creole
9,Amharic,am,na,"Afro-Asiatic, Semitic, South, Ethiopian, South...",Afro-Asiatic-Semitic


In [19]:

lg_df = lg_df.set_index("2-letter", drop=True)
lg_df["train"] = None
lang_info
for key, val in lang_info.items():
    lg_df.loc[key, "train"] = val
lg_df.drop(["Ethnologue", "Class-short"], axis=1).sort_index().to_latex("results/lang_train.txt")
lg_df.drop([ "Class-short"], axis=1).sort_index().to_csv("datasets/lang_train.csv")

  lg_df.drop(["Ethnologue", "Class-short"], axis=1).sort_index().to_latex("results/lang_train.txt")


Unnamed: 0_level_0,Language,Ethnologue,Classification,Class-short
2-letter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ts,Xitsonga,tso,"Niger-Congo, Atlantic-Congo, Volta-Congo, Benu...",Volta-Congo
twi,Twi,aka,"Niger-Congo, Atlantic-Congo, Volta-Congo, Kwa,...",Volta-Congo
kr,Kinyarwanda,kin,"Niger-Congo, Atlantic-Congo, Volta-Congo, Benu...",Volta-Congo
or,Oromo,orm,"Afro-Asiatic, Cushitic, East, Oromo",Afro-Asiatic-misc
tg,Tigrinya,tig,"Afro-Asiatic, Semitic, South, Ethiopian, North",Afro-Asiatic-Semitic
ha,Hausa,hau,"Afro-Asiatic, Chadic, West, A, A.1",Afro-Asiatic-misc
yo,Yoruba,yor,"Niger-Congo, Atlantic-Congo, Volta-Congo, Benu...",Volta-Congo
ig,Igbo,na,"Niger-Congo, Atlantic-Congo, Volta-Congo, Benu...",Volta-Congo
pcm,Nigerian_Pidgin,na,English Creole,Creole
am,Amharic,na,"Afro-Asiatic, Semitic, South, Ethiopian, South...",Afro-Asiatic-Semitic
