## Plan

1. Read words and their language labels from the data files. Assume the words have already been preprocessed and cleaned.
2. Merge the word lists and resolve any inconsistencies in language labelling among the different corpora.
3. Store the final list of words and their language labels in a 2-column CSV file.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Load Arabic word lists from files

In [2]:
# Each source corpus is described once as (CSV file name, short corpus tag).
# The two parallel lists used by the loading loop are derived from these pairs,
# so a file name and its tag always stay at the same position.
_corpora = [
    ('TUN_Social_Media_words.csv', 'TUN_SM'),
    ('OSAC_words.csv', 'OSAC'),
    ('AWN_words.csv', 'AWN'),
]
corpus_filename = [pair[0] for pair in _corpora]
corpus_label = [pair[1] for pair in _corpora]

In [3]:
#?pd.read_csv

In [4]:
# Read every corpus file (no header row; columns are word, language label),
# tag its rows with the corpus they came from, and stack everything into `df`.
#
# The per-file frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# re-copies all previously loaded rows on every iteration.
frames = []
total_rows = 0
for filename, label in zip(corpus_filename, corpus_label):
    df_temp = pd.read_csv('../../generated_data/'+filename,header=None, names=['word','lang_label'])
    df_temp['corpus_label'] = label
    frames.append(df_temp)
    total_rows += len(df_temp)
    # Progress report: shape of this file, then shape of the stacked table so far.
    print(df_temp.shape,(total_rows, df_temp.shape[1]))

# Each file keeps its own row index, so index values repeat across corpora.
df = pd.concat(frames)[['word','lang_label','corpus_label']]
df.head()

(2916, 3) (2916, 3)
(40927, 3) (43843, 3)
(12432, 3) (56275, 3)


Unnamed: 0,word,lang_label,corpus_label
0,اب,MSA,TUN_SM
1,ابداع,MSA,TUN_SM
2,ابن,MSA,TUN_SM
3,ابوها,MSA,TUN_SM
4,اتصالات,MSA,TUN_SM


In [73]:
# Number of rows contributed by each corpus.
df['corpus_label'].value_counts()

OSAC      40927
AWN       12432
TUN_SM     2916
Name: corpus_label, dtype: int64

In [74]:
# Number of rows per language label, before duplicates across corpora are merged.
df['lang_label'].value_counts()

MSA    55488
TN       787
Name: lang_label, dtype: int64

In [75]:
# Number of distinct words across all corpora.
df['word'].nunique()

47682

## Merge/integrate word lists
We have word lists from 3 different sources.

A word that occurs in more than one corpus must end up with a single, consistent language label.

In [76]:
# Peek at the first five rows of the stacked table before merging.
df.head(n=5)

Unnamed: 0,word,lang_label,corpus_label
0,اب,MSA,TUN_SM
1,ابداع,MSA,TUN_SM
2,ابن,MSA,TUN_SM
3,ابوها,MSA,TUN_SM
4,اتصالات,MSA,TUN_SM


In [92]:
def _join_labels(values):
    """Return the group's values as one comma-separated string, in row order."""
    return ','.join(values)


# One row per distinct word: the corpora it appears in and the language labels
# those corpora gave it, each collapsed into a comma-separated string.
u = df.groupby('word').agg({'corpus_label': _join_labels, 'lang_label': _join_labels})
u.shape

(47682, 2)

In [93]:
# First five merged rows (indexed by word).
u.head(n=5)

Unnamed: 0_level_0,corpus_label,lang_label
word,Unnamed: 1_level_1,Unnamed: 2_level_1
آئ,OSAC,MSA
آب,"OSAC,AWN","MSA,MSA"
آباء,OSAC,MSA
آبائه,OSAC,MSA
آبائهم,OSAC,MSA


In [94]:
# How many words occur in each combination of corpora.
u['corpus_label'].value_counts()

OSAC               33167
OSAC,AWN            5907
AWN                 5692
TUN_SM,OSAC         1061
TUN_SM              1022
TUN_SM,OSAC,AWN      792
TUN_SM,AWN            41
Name: corpus_label, dtype: int64

In [95]:
# How many words carry each combination of language labels.
u['lang_label'].value_counts()

MSA            39225
MSA,MSA         6912
MSA,MSA,MSA      758
TN               656
TN,MSA            97
TN,MSA,MSA        34
Name: lang_label, dtype: int64

In [81]:
# Cross-tabulate corpus combinations against label combinations to see which
# corpora are involved when a word has mixed labels.
pd.crosstab(index=u['corpus_label'], columns=u['lang_label'])

lang_label,MSA,"MSA,MSA","MSA,MSA,MSA",TN,"TN,MSA","TN,MSA,MSA"
corpus_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AWN,5692,0,0,0,0,0
OSAC,33167,0,0,0,0,0
"OSAC,AWN",0,5907,0,0,0,0
TUN_SM,366,0,0,656,0,0
"TUN_SM,AWN",0,29,0,0,12,0
"TUN_SM,OSAC",0,976,0,0,85,0
"TUN_SM,OSAC,AWN",0,0,758,0,0,34


In [96]:
# Flag the words whose joined label string contains "TN,MSA", i.e. words tagged
# TN by one corpus and MSA by another; print how many there are and show a few.
idx = u['lang_label'].str.contains("TN,MSA")
print(idx.sum())
u.loc[idx].head(n=5)

131


Unnamed: 0_level_0,corpus_label,lang_label
word,Unnamed: 1_level_1,Unnamed: 2_level_1
اش,"TUN_SM,OSAC","TN,MSA"
اك,"TUN_SM,OSAC","TN,MSA"
الحبس,"TUN_SM,OSAC,AWN","TN,MSA,MSA"
الحجامة,"TUN_SM,OSAC","TN,MSA"
الخ,"TUN_SM,OSAC","TN,MSA"


In [97]:
# Work on an independent copy so the merged table `u` keeps the raw joined labels.
df_clean = u.copy(deep=True)
df_clean['lang_label'].value_counts()

MSA            39225
MSA,MSA         6912
MSA,MSA,MSA      758
TN               656
TN,MSA            97
TN,MSA,MSA        34
Name: lang_label, dtype: int64

In [98]:
# Resolve conflicting labels in favour of Tunisian: a word that carries several
# labels, at least one of which is "TN", is relabelled "TN".
#
# "TN" is matched as a whole comma-separated token anywhere in the joined
# string, so the outcome does not depend on the order in which the corpora were
# stacked. A literal "TN,MSA" substring test would miss "MSA,TN", and the
# following MSA step would then relabel that word as MSA.
idx = (
    df_clean['lang_label'].str.contains(r'(?:^|,)TN(?:,|$)')
    & df_clean['lang_label'].str.contains(',', regex=False)  # multi-label rows only
)
df_clean.loc[idx,'lang_label'] = "TN"
df_clean.lang_label.value_counts()

MSA            39225
MSA,MSA         6912
TN               787
MSA,MSA,MSA      758
Name: lang_label, dtype: int64

In [99]:
# Collapse every label string that mentions MSA ("MSA", "MSA,MSA", ...) into a
# single "MSA" label, then show the resulting label distribution.
idx = df_clean['lang_label'].str.contains("MSA")
df_clean.loc[idx, 'lang_label'] = "MSA"
df_clean['lang_label'].value_counts()

MSA    46895
TN       787
Name: lang_label, dtype: int64

In [102]:
# First five rows of the table after label resolution.
df_clean.head(n=5)

Unnamed: 0_level_0,corpus_label,lang_label
word,Unnamed: 1_level_1,Unnamed: 2_level_1
آئ,OSAC,MSA
آب,"OSAC,AWN",MSA
آباء,OSAC,MSA
آبائه,OSAC,MSA
آبائهم,OSAC,MSA


In [101]:
# Sanity check: df_clean is a copy of the per-word table, so it should still
# have one row per distinct word and two columns (corpus_label, lang_label).
df_clean.shape

(47682, 2)

### Store words and their language labels in a csv file

In [103]:
# Write the final word list as a header-less, two-column CSV: the index (word)
# followed by its resolved language label.
filename = '../../generated_data/Arabic_words_corpus_final.csv'
final_labels = df_clean.loc[:, 'lang_label']
final_labels.to_csv(filename, index=True, header=False, encoding='utf8')