In [1]:
# Import dependencies
import csv

In [2]:
# Count the data rows of the TSV export; the first row is the header and is
# therefore subtracted from the tally.
filename = './data/output_wiktionary.tsv'
line_count = 0

with open(filename, mode='r', encoding='utf-8') as file:
    reader = csv.reader(file, delimiter='\t')

    # Walk the reader once, counting every parsed row (header included).
    rows_seen = 0
    for _ in reader:
        rows_seen += 1

    # Drop the header row from the result.
    line_count = rows_seen - 1

print("Number of lines:", line_count)

Number of lines: 10714


In [3]:
def count_tokens_per_language(file_path: str) -> dict:
    """
    Count the non-missing tokens in each language column of a TSV file.

    The first row is read as the header; each header cell names one column.
    Every later cell whose text is not the literal string 'None' counts as
    one token for the header cell it is paired with. Cells are paired with
    header cells positionally, so a row shorter than the header contributes
    only the cells it has, and cells beyond the header width are ignored.
    A column that never holds a token does not appear in the result.

    Param:
        file_path (str): Path to the input TSV file.

    Returns:
        dict: A dictionary containing language-wise token counts, with
        'Total' representing the overall count. An empty file yields
        {'Total': 0}.
    """
    token_counts = {}

    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires for file objects passed to csv.reader.
    with open(file_path, 'r', encoding='utf-8', newline='') as tsv_file:
        reader = csv.reader(tsv_file, delimiter='\t')

        # Default to an empty header so a completely empty file produces
        # zero counts instead of raising StopIteration.
        header = next(reader, [])

        for row in reader:
            for language, token in zip(header, row):
                if token != 'None':
                    token_counts[language] = token_counts.get(language, 0) + 1

    # The overall total is the sum of the per-language counts; it is stored
    # last, under the reserved key 'Total'.
    token_counts['Total'] = sum(token_counts.values())

    return token_counts

file_path = './data/output_wiktionary.tsv'

# Per-language token counts; the returned mapping also carries a 'Total' entry.
results = count_tokens_per_language(file_path)

# Order the (name, count) pairs from largest to smallest count. The 'Total'
# entry is ranked together with the languages.
sorted_results = list(results.items())
sorted_results.sort(key=lambda pair: pair[1], reverse=True)

# Print one "name = count" line per entry, with thousands separators.
for language, count in sorted_results:
    count_str = format(count, ",")
    print(f"{language} = {count_str}")



Total = 100,549
Indonesian = 10,714
English = 10,714
Finnish = 9,911
Portuguese = 8,972
Mandarin Chinese = 8,278
Polish = 8,172
Greek = 7,398
Spanish = 7,341
Japanese = 7,148
Arabic = 6,422
Serbo-Croatian = 6,025
Thai = 5,179
Slovene = 4,275
