In [25]:
import os
import pandas as pd
from tqdm.auto import tqdm

In [26]:
# Directories for the chaii data and the chaii-trans data.
data_dir = '../data/chaii/'
data_dir_trans = '../data/chaii-trans/'

# chaii: training CSV.
train_df = pd.read_csv(os.path.join(data_dir, 'train.csv'))

# chaii-trans: translated and transliterated training CSVs.
train_translated_df, train_transliterated_df = (
    pd.read_csv(os.path.join(data_dir_trans, csv_name))
    for csv_name in ('train_translated.csv', 'train_transliterated.csv')
)

In [27]:
# Number of splits; the per-split CSVs read below are suffixed _k{i} for i in range(k).
k = 5

In [28]:
def reformat_lang_chaii(lang):
    """Normalise a language label to a short tag.

    Labels longer than three characters are cut down to their first two
    characters; labels of three characters or fewer are returned with a
    trailing '^' appended.

    NOTE(review): presumably long labels are full language names and short
    ones are language codes -- confirm against the CSV contents.
    """
    is_long_label = len(lang) > 3
    return lang[:2] if is_long_label else lang + '^'

# Entire Data

In [60]:
# Load the translated training set and normalise its language labels.
df = pd.read_csv(os.path.join(data_dir_trans, 'train_translated.csv'))
df['language'] = df['language'].apply(reformat_lang_chaii)
print(df.shape)

# Map each example id to the language of its row flagged is_original.
originals = df.loc[df['is_original'].eq(True), ['id', 'language']]
id_to_source_lang = dict(zip(originals['id'], originals['language']))

# Tag every row with the language of its original; the explicit dict lookup
# raises KeyError for an id that has no original row instead of yielding NaN.
df['source_language'] = df['id'].apply(lambda example_id: id_to_source_lang[example_id])

# Row counts with one row per source language and one column per row language.
result = df.groupby(['language', 'source_language']).size().unstack(level=1).fillna(0).T
result

(3728, 7)


language,bn^,en^,hi,hi^,ml^,mr^,ta,ta^,te^
source_language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
hi,220.0,383.0,746.0,289.0,230.0,218.0,0.0,221.0,247.0
ta,70.0,182.0,0.0,133.0,110.0,78.0,368.0,117.0,116.0


In [61]:
# Divide every language column by its total so each column sums to 1.
result.div(result.sum(axis=0), axis='columns')

language,bn^,en^,hi,hi^,ml^,mr^,ta,ta^,te^
source_language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
hi,0.758621,0.677876,1.0,0.684834,0.676471,0.736486,0.0,0.653846,0.680441
ta,0.241379,0.322124,0.0,0.315166,0.323529,0.263514,1.0,0.346154,0.319559


In [62]:
# Load the transliterated training set; its language label is derived from
# the 'tgt' column and normalised in one step.
df = pd.read_csv(os.path.join(data_dir_trans, 'train_transliterated.csv'))
df['language'] = df['tgt'].apply(reformat_lang_chaii)
print(df.shape)

# Map each example id to the language of its row flagged is_original.
originals = df.loc[df['is_original'].eq(True), ['id', 'language']]
id_to_source_lang = dict(zip(originals['id'], originals['language']))

# Tag every row with the language of its original; the explicit dict lookup
# raises KeyError for an id that has no original row instead of yielding NaN.
df['source_language'] = df['id'].apply(lambda example_id: id_to_source_lang[example_id])

# Row counts with one row per source language and one column per row language.
result = df.groupby(['language', 'source_language']).size().unstack(level=1).fillna(0).T
result

(7703, 9)


language,bn^,en^,hi,hi^,ml^,mr^,ta,ta^,te^
source_language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
hi,644.0,708.0,746.0,708.0,644.0,644.0,0.0,644.0,644.0
ta,263.0,316.0,0.0,263.0,263.0,263.0,368.0,322.0,263.0


In [63]:
# Divide every language column by its total so each column sums to 1.
result.div(result.sum(axis=0), axis='columns')

language,bn^,en^,hi,hi^,ml^,mr^,ta,ta^,te^
source_language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
hi,0.710033,0.691406,1.0,0.729145,0.710033,0.710033,0.0,0.666667,0.710033
ta,0.289967,0.308594,0.0,0.270855,0.289967,0.289967,1.0,0.333333,0.289967


# Train Data

In [29]:
# Count rows per is_original value in every training split.
fold_counts = []
for i in tqdm(range(k)):
    fold_df = pd.read_csv(os.path.join(data_dir_trans, f'train_translated_train_k{i}.csv'))
    fold_counts.append(fold_df.groupby(['is_original']).size())

# Concatenate once after the loop (one column per split), then label the
# columns with the split index.
result = pd.concat(fold_counts, axis=1)
result.columns = range(k)
print(result)

  0%|          | 0/5 [00:00<?, ?it/s]

                0     1     2     3     4
is_original                              
False        2170  2114  2109  2087  2154
True          908   906   906   907   905


In [30]:
# Count rows per (is_original, language) pair in every training split.
fold_counts = []
for i in tqdm(range(k)):
    fold_df = pd.read_csv(os.path.join(data_dir_trans, f'train_translated_train_k{i}.csv'))
    fold_df['language'] = fold_df['language'].apply(reformat_lang_chaii)
    fold_counts.append(fold_df.groupby(['is_original', 'language']).size())

# Concatenate once after the loop (one column per split), then label the
# columns with the split index.
result = pd.concat(fold_counts, axis=1)
result.columns = range(k)
print(result)

  0%|          | 0/5 [00:00<?, ?it/s]

                        0    1    2    3    4
is_original language                         
False       bn^       240  232  231  228  240
            en^       467  456  461  452  462
            hi^       348  348  345  334  347
            ml^       280  273  273  277  282
            mr^       247  243  236  236  249
            ta^       283  274  267  271  274
            te^       305  288  296  289  300
True        hi        609  606  607  607  606
            ta        299  300  299  300  299


# Test Data

In [31]:
# Count rows per language in every test split.
fold_counts = []
for i in tqdm(range(k)):
    fold_df = pd.read_csv(os.path.join(data_dir, f'train_test_k{i}.csv'))
    fold_df['language'] = fold_df['language'].apply(reformat_lang_chaii)
    fold_counts.append(fold_df.groupby(['language']).size())

# Concatenate once after the loop (one column per split), then label the
# columns with the split index.
result = pd.concat(fold_counts, axis=1)
result.columns = range(k)
print(result)

  0%|          | 0/5 [00:00<?, ?it/s]

           0   1   2   3   4
language                    
hi        67  67  67  67  67
ta        33  33  33  33  33


In [32]:
# Find the number of example ids shared by each pair of test splits.
# Every split file is read exactly once; the pairwise comparison then works
# on the in-memory id sets.
test_ids = [
    set(pd.read_csv(os.path.join(data_dir, f'train_test_k{i}.csv'))['id'])
    for i in tqdm(range(k))
]

# Outer keys become columns and inner keys become rows; the diagonal holds
# each split's own number of distinct ids.
result = pd.DataFrame({
    i: {j: len(test_ids[i] & test_ids[j]) for j in range(k)}
    for i in range(k)
})
result.columns = range(k)
print(result)

  0%|          | 0/5 [00:00<?, ?it/s]

     0    1    2    3    4
0  100    4    7    9    8
1    4  100    8   10   12
2    7    8  100    6   10
3    9   10    6  100    6
4    8   12   10    6  100
