In [43]:
import pycountry

def iso_two_to_three_conversion(code):
    """Look up a language code with pycountry and return its alpha-3 form.

    Args:
        code: A language code string. A two-character value is looked up
            via the ``alpha_2`` field, a three-character value via the
            ``alpha_3`` field.

    Returns:
        The ``alpha_3`` attribute of the matching pycountry language, or
        ``None`` when the code has any other length or the lookup yields
        nothing.
    """
    # Map the code length to the pycountry field used for the lookup.
    lookup_field = {2: 'alpha_2', 3: 'alpha_3'}.get(len(code))
    if lookup_field is None:
        return None

    match = pycountry.languages.get(**{lookup_field: code})
    return match.alpha_3 if match else None

def extract_iso_code_from_bcp(bcp_identifier):
    """Convert the language part of a language tag to an alpha-3 code.

    The language part is everything before the first separator. Both ``-``
    (the BCP-47 subtag separator) and ``_`` (the separator this function
    originally handled) are accepted, so ``xx-Yyyy`` and ``xx_Yyyy`` are
    treated alike. A tag without a separator is used as-is.

    Args:
        bcp_identifier: The language tag string to convert.

    Returns:
        The result of ``iso_two_to_three_conversion`` for the language
        part: an alpha-3 code, or ``None`` if the code is not known.

    Raises:
        ValueError: If the language part is not two or three characters
            long.
    """
    # Normalise the separator first so a single split covers both styles.
    language = bcp_identifier.replace('-', '_').split('_', 1)[0]

    if not 2 <= len(language) <= 3:
        # The separator-less path used to reach `return language` with
        # `language` never assigned, which raised UnboundLocalError; it now
        # reports the problem the same way as the separator path.
        raise ValueError(f"Invalid ISO code: {bcp_identifier}")

    return iso_two_to_three_conversion(language)


In [44]:
import csv
import json
import numpy as np

def process_tsv_to_json(tsv_file, output_json, column='BCP-47', skip_rows=1):
    """Convert one column of language tags in a TSV file to a JSON list.

    Each value in ``column`` is passed to ``extract_iso_code_from_bcp``.
    When that call raises, the raw value is kept instead, so the output has
    one entry per data row (before ``skip_rows`` is applied).

    Args:
        tsv_file: Path of the tab-separated input file. Its first line is
            used as the header row.
        output_json: Path of the JSON file to write (overwritten if it
            exists).
        column: Header name of the column holding the language tags.
        skip_rows: Number of leading data rows to drop from the result.
            The default of 1 keeps the original behaviour of discarding
            the first converted entry.
            NOTE(review): presumably the first data row of the input is
            not a language (e.g. a totals row) -- confirm against the file.

    Raises:
        ValueError: If ``skip_rows`` is negative.
        KeyError: If a row has no ``column`` field.
    """
    if skip_rows < 0:
        raise ValueError(f"skip_rows must be non-negative, got {skip_rows}")

    converted = []

    # newline='' is what the csv module expects so that it can handle line
    # endings (including ones embedded in quoted fields) itself.
    with open(tsv_file, 'r', encoding='utf-8', newline='') as tsv_handle:
        # DictReader consumes the header line on its own; every `row`
        # yielded here is already a data row.
        for row in csv.DictReader(tsv_handle, delimiter='\t'):
            raw_code = row[column]
            try:
                converted.append(extract_iso_code_from_bcp(raw_code))
            except Exception:
                # Deliberate best-effort: a tag that cannot be converted is
                # passed through unchanged rather than aborting the run.
                converted.append(raw_code)

    converted = converted[skip_rows:]

    with open(output_json, 'w', encoding='utf-8') as json_handle:
        json.dump(converted, json_handle, ensure_ascii=False, indent=4)

In [45]:
# Convert the language tags in madlad.tsv and write the result as a JSON list.
# NOTE(review): "aplha" in the output file name looks like a misspelling of
# "alpha"; it is left unchanged here because other cells may read this path.
process_tsv_to_json(tsv_file='madlad.tsv', output_json='madlad_aplha_3.json')