In [None]:
# ! git clone https://github.com/global-asp/global-asp.git
# ! git clone https://github.com/global-asp/asp-source.git
# ! git clone https://github.com/global-asp/lcb-source
# ! git clone https://github.com/global-asp/pb-source
# ! git clone https://github.com/global-asp/sbc-source
# ! git clone https://github.com/global-asp/gasp-mexico
# ! git clone https://github.com/global-asp/global-pb
# ! git clone https://github.com/global-asp/global-lcb
# ! git clone https://github.com/global-asp/sbjm-source
# ! git clone https://github.com/global-asp/sbug-source
# ! git clone https://github.com/global-asp/sbno-source
# ! git clone https://github.com/global-asp/sbk-source
# ! git clone https://github.com/global-asp/sbuk-source
# ! git clone https://github.com/global-asp/lida-source
# ! git clone https://github.com/global-asp/global-lida
# ! git clone https://github.com/global-asp/gasp-alternates
# ! git clone https://github.com/global-asp/asp-new

In [None]:
import os
import pandas as pd
import re
from tqdm import tqdm
from GlotScript import sp

In [None]:
# process each .md file
def _parse_metadata(metadata_block, file_name, language_folder):
    """Turn the trailing metadata section of a story file into a dict.

    The section is split on '\\n* ' (one bullet per entry) and each entry is
    split into key and value at the first ': '. Surrounding whitespace and '*'
    characters are stripped from both parts.

    Entries without a ': ' separator are reported on stdout and skipped.
    `file_name` and `language_folder` are only used in that report.
    """
    metadata = {}
    for line in metadata_block.split('\n* '):
        # partition() splits at the FIRST ': ' only, so values that themselves
        # contain ': ' are kept whole instead of being discarded.
        key, separator, value = line.partition(': ')
        if not separator:
            print(f"Skipping malformed metadata line {line!r} in {language_folder}/{file_name}")
            continue
        metadata[key.strip().strip('*').strip()] = value.strip().strip('*').strip()
    return metadata


def process(main_directory_path):
    """Collect the text rows of every story file below `main_directory_path`.

    Each sub-directory is treated as one language folder. Every '.md' file in
    it (except 'README.md') is split on '##' markers followed by newlines; the
    text before the first marker is dropped. Empty sections and sections whose
    script, as reported by `sp`, is 'Zyyy', 'Zzzz' or 'Zinh' are removed. The
    last remaining section is parsed as metadata, all earlier ones are rows.

    Args:
        main_directory_path: directory containing one sub-directory per language.

    Returns:
        A DataFrame with one line per row and the columns 'Row', 'Row Number',
        one column per metadata key, 'File Name' and 'Language Folder'. When
        nothing could be read, an empty DataFrame with the four fixed columns
        is returned.
    """
    dfs = []

    # sorted() makes the row order reproducible; os.listdir order is arbitrary.
    for language_folder in sorted(os.listdir(main_directory_path)):
        language_folder_path = os.path.join(main_directory_path, language_folder)
        if not os.path.isdir(language_folder_path):
            continue

        language_dfs = []
        for file_name in sorted(os.listdir(language_folder_path)):
            if not file_name.endswith('.md') or file_name == 'README.md':
                continue

            with open(os.path.join(language_folder_path, file_name), 'r', encoding='utf-8') as file:
                text = file.read()

            # Extract content between ##
            sections = [s.strip() for s in re.split(r'##\n+', text.strip())[1:]]
            sections = [s for s in sections if s and sp(s)[0] not in ['Zyyy', 'Zzzz', 'Zinh']]

            if not sections:
                # Nothing usable: there is no metadata section to index into.
                print(f"Skipping {language_folder}/{file_name}: no usable '##' sections")
                continue

            metadata = _parse_metadata(sections[-1], file_name, language_folder)
            contents = sections[:-1]

            df = pd.DataFrame({'Row': contents, 'Row Number': range(len(contents))})
            for key, value in metadata.items():
                df[key] = value

            # Add a unique identifier (file name)
            df['File Name'] = file_name
            language_dfs.append(df)

        if not language_dfs:
            # pd.concat raises on an empty list, so report and move on instead.
            print(f"No story files found in {language_folder} ({main_directory_path})")
            continue

        language_final_df = pd.concat(language_dfs, ignore_index=True)
        language_final_df['Language Folder'] = language_folder
        dfs.append(language_final_df)

    if not dfs:
        # Keep the fixed columns so the result still round-trips through CSV.
        return pd.DataFrame(columns=['Row', 'Row Number', 'File Name', 'Language Folder'])

    return pd.concat(dfs, ignore_index=True)

In [None]:
# Process one cloned repository and write its rows to a CSV named after it
def save(key):
    """Run `process` on the './<key>/' folder and store the result as '<key>.csv'.

    A 'Source' column holding 'global-asp/<key>' is added before writing, and
    the CSV is written to the current working directory without the index.
    """
    repo_frame = process(f'./{key}/')
    repo_frame['Source'] = f'global-asp/{key}'
    repo_frame.to_csv(f'{key}.csv', index=False)

In [None]:
# Local folder names of the repositories to process, one entry per repository
download_list = [
    'global-asp',
    'asp-source',
    'lcb-source',
    'pb-source',
    'sbc-source',
    'gasp-mexico',
    'global-pb',
    'global-lcb',
    'sbjm-source',
    'sbug-source',
    'sbno-source',
    'sbk-source',
    'sbuk-source',
    'lida-source',
    'global-lida',
    'gasp-alternates',
    'asp-new',
]

In [None]:
# Write one CSV per repository listed in download_list
for repo_key in download_list:
    save(repo_key)

In [None]:
# merge
# os and pandas are already imported at the top; repeated so the cell runs standalone
import os
import pandas as pd

# Get the current directory
current_dir = os.getcwd()

# Read exactly the CSVs that save() writes ('<key>.csv'), in download_list order.
# Globbing every *.csv in the directory would also merge unrelated files or an
# exported result from an earlier run, and its order depends on os.listdir.
expected_files = [f'{key}.csv' for key in download_list]
csv_files = [file for file in expected_files if os.path.isfile(os.path.join(current_dir, file))]

missing_files = [file for file in expected_files if file not in csv_files]
if missing_files:
    print(f"Skipping missing CSV files: {missing_files}")

# Read each CSV into a dataframe
dfs = [pd.read_csv(os.path.join(current_dir, file)) for file in csv_files]

# Concatenate all the dataframes
concatenated_df = pd.concat(dfs, ignore_index=True)

In [None]:
# post process and normalize
def translation(row):
    """Return the translator credit stored in `row`, or "" when there is none.

    The credit can appear under three different metadata keys. They are
    checked in priority order 'Translation', 'Translator', 'Translated By',
    and the first value that is not missing (per `pd.isna`) is returned.

    Args:
        row: a mapping-like row (e.g. a pandas Series from `DataFrame.apply`
            with `axis=1`) that supports `.get`.

    Returns:
        The first non-missing value, or an empty string.
    """
    for column in ('Translation', 'Translator', 'Translated By'):
        # .get() rather than [] so a column that no input file provided is
        # treated as missing instead of raising KeyError.
        value = row.get(column)
        if not pd.isna(value):
            return value
    return ""

# Collapse the possible translator columns into a single 'Translation' column
concatenated_df['Translation'] = concatenated_df.apply(translation, axis=1)

# Only strings are cleaned: rows from files that lacked the field hold NaN
# (a float), and calling str methods on it would raise AttributeError.
concatenated_df['License'] = concatenated_df['License'].apply(
    lambda x: x.strip('[').strip(']').strip() if isinstance(x, str) else x
)
# Keep only the first line of the language value
concatenated_df['Language'] = concatenated_df['Language'].apply(
    lambda x: x.split('\n')[0].strip() if isinstance(x, str) else x
)

# NOTE: concatenated_df is rebound below with renamed columns, so this cell
# cannot be re-run without first re-running the merge cell.
concatenated_df = concatenated_df[['Row', 'Row Number', 'License', 'Text', 'Translation', 'Language', 'File Name', 'Source']]
# Rename columns
concatenated_df = concatenated_df.rename(columns={'Text': 'Text By', 'Row': 'Text', 'Row Number': 'Text Number', 'Translation': 'Translation By'})

In [None]:
# map iso codes
import pycountry

def iso_639_1_to_3(iso_639_1_code):
    """Map a language tag to a three-letter language code.

    Only the part before the first '-' is used (so a region or script suffix
    is ignored). A three-character code is returned unchanged; anything else
    is looked up in pycountry as an ISO 639-1 (alpha_2) code.

    Args:
        iso_639_1_code: language tag such as 'en', 'en-GB' or 'swa'. Non-string
            values (e.g. NaN for a missing language) are accepted.

    Returns:
        The three-letter code, or None when the input is not a string or no
        matching language is found.
    """
    # Missing metadata reaches this function as NaN/None, which has no .split
    if not isinstance(iso_639_1_code, str):
        return None

    # strip() so padded codes are not misjudged by the length check below
    iso_639_1_code = iso_639_1_code.split('-')[0].strip()
    if len(iso_639_1_code) == 3:
        return iso_639_1_code

    try:
        language = pycountry.languages.get(alpha_2=iso_639_1_code)
    except LookupError:
        # NOTE(review): older pycountry releases appear to raise KeyError
        # (a LookupError) for unknown codes rather than returning None - confirm.
        return None

    # get() yields None for unknown codes; getattr covers that case too
    return getattr(language, 'alpha_3', None)

# Example usage: sanity-check the mapping on a single code
iso_639_1_code = 'en'  # Example ISO 639-1 code for English
iso_639_3_code = iso_639_1_to_3(iso_639_1_code)

# A falsy result (None) means no three-letter code was found
print(
    f"ISO 639-3 code for {iso_639_1_code} is {iso_639_3_code}"
    if iso_639_3_code
    else f"No ISO 639-3 code found for {iso_639_1_code}"
)

# Derive the 'ISO639-3' column from each row's 'Language' value
concatenated_df['ISO639-3'] = concatenated_df['Language'].apply(iso_639_1_to_3)

In [None]:
# clean script level
# Label every text with the first element of sp()'s result for it
concatenated_df['Script'] = concatenated_df['Text'].apply(lambda text: sp(text)[0])

# Count rows per (ISO639-3, Script) pair, smallest groups first
pair_sizes = concatenated_df.groupby(['ISO639-3', 'Script']).size()
grouped = pair_sizes.reset_index(name='Count').sort_values('Count')

# Keep pairs with at least 5 rows; merging on the pair keeps only rows that
# belong to a retained pair and attaches the pair's 'Count'
filtered_groups = grouped.loc[grouped['Count'] >= 5]
result_df = concatenated_df.merge(filtered_groups, on=['ISO639-3', 'Script'])

In [None]:
# Show the filtered result as this cell's output
result_df