# Simple dictionary preparation for every language that is supported by Google Translate API
Provide file with 50k words from: https://github.com/hermitdave/FrequencyWords (or in corresponding format) and go to [Running code](#running-code) section to create own dictionary

## Imports & config

In [34]:
import spacy
import asyncio
import nest_asyncio
from googletrans import Translator
from nltk.corpus import wordnet
import nltk
from tqdm import tqdm
from typing import List, Tuple, OrderedDict
from collections import OrderedDict
from collections import Counter
import os

nest_asyncio.apply()

nlp = spacy.load("pl_core_news_sm") # run manually: python -m spacy download pl_core_news_sm

def ensure_nltk_resource(resource, download_name=None):
    try:
        nltk.data.find(resource)
    except LookupError:
        nltk.download(download_name or resource.split('/')[-1])

ensure_nltk_resource('corpora/wordnet')
ensure_nltk_resource('corpora/omw-1.4', 'omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/grzegorzpozorski/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/grzegorzpozorski/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [35]:
translator_batch_size = 32
semaphore_max_concurrency = 48 # to control the number of concurrent requests to Google Translate API
translator_list_operation_max_concurrency = 32

translator = Translator(service_urls=['translate.googleapis.com'], list_operation_max_concurrency=translator_list_operation_max_concurrency)

## Functions

#### Functions for reading and preprocessing the input file

In [44]:
def read_file_to_memory(input_path) -> List[Tuple[str, int]]:
    # we expect to process files with 50k lines
    with open(input_path, 'r', encoding='utf-8') as input_file:
        input_file.seek(0)
        lines = [line.strip() for line in input_file]
        read_lines = []
        for line in lines:
            parts = line.split()
            if len(parts) != 2:
                continue
            word, occurrence = parts
            read_lines.append((word, occurrence))

    return read_lines


def deduplicate_lines(lines_to_process):
    unique = OrderedDict()
    for lemma, occurrence in lines_to_process:
        if lemma not in unique:
            unique[lemma] = occurrence
    return list(unique.items())


def get_valid_lemmatized_nouns(words_with_occurrences, minimal_word_length=4, maximal_word_length=8) -> List[Tuple[str, int]]:
    batch_size = 64
    valid_nouns_with_occurrences = []

    docs = nlp.pipe([word for word, _ in words_with_occurrences], batch_size=batch_size)
    for (word, occurrence), doc in tqdm(zip(words_with_occurrences, docs), total=len(words_with_occurrences), desc="Processing lines"):
        lemma = doc[0].lemma_
        if minimal_word_length <= len(lemma) <= maximal_word_length:
            if doc[0].pos_ == "NOUN":
                valid_nouns_with_occurrences.append((lemma, occurrence))

    print(f"Valid lemmatized nouns count: {len(valid_nouns_with_occurrences)}")
    deduplicated_valid_nouns_with_occurrences = deduplicate_lines(valid_nouns_with_occurrences)
    print(f"Deduplicated, valid, lemmatized nouns count: {len(deduplicated_valid_nouns_with_occurrences)}")

    return deduplicated_valid_nouns_with_occurrences


#### Functions for categorizing nouns

In [45]:
def get_english_noun_category(english_noun):
    synsets = wordnet.synsets(english_noun, pos=wordnet.NOUN)
    if not synsets:
        return "unknown"
    hypernyms = synsets[0].hypernyms()
    if hypernyms:
        return hypernyms[0].lemma_names()[0]
    return "general_noun"


async def categorize_nouns_batch(batch, semaphore, original_dictionary_language):
    language_for_categorization = 'en'

    async with semaphore:
        lemmas = [lemma for lemma, _ in batch]
        english_translations = await translator.translate(lemmas, src=original_dictionary_language, dest=language_for_categorization)
        
        english_words = [t.text.lower() for t in english_translations]
        categories = [get_english_noun_category(w) for w in english_words]
        
        valid_indices = [idx for idx, cat in enumerate(categories) if cat not in ("unknown", "general_noun")]
        
        results = []
        mismatch_count = 0
        unknown_general_category_count = 0
        
        if not valid_indices:
            unknown_general_category_count += len(batch)
            return results, mismatch_count, unknown_general_category_count
        
        valid_english_words = [english_words[idx] for idx in valid_indices]

        original_language_translations = await translator.translate(valid_english_words, src=language_for_categorization, dest=original_dictionary_language)
        original_language_translations = [t.text.lower() for t in original_language_translations]

        for j, idx in enumerate(valid_indices):
            lemma, occurrence = batch[idx]
            if original_language_translations[j] != lemma:
                mismatch_count += 1
                continue

            results.append(f"{lemma} {occurrence} {categories[idx]}\n")

        return results, mismatch_count, unknown_general_category_count


async def categorize_nouns(lines_to_process, output_path, original_dictionary_language):
    semaphore = asyncio.Semaphore(semaphore_max_concurrency)
    batches = [lines_to_process[i:i+translator_batch_size] for i in range(0, len(lines_to_process), translator_batch_size)]
    tasks = [categorize_nouns_batch(batch, semaphore, original_dictionary_language) for batch in batches]

    mismatch_count = 0
    unknown_general_category_count = 0

    with open(output_path, 'w', encoding='utf-8') as outfile:
        for fut in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Processing batches"):
            results, mismatches, unknowns = await fut
            mismatch_count += mismatches
            unknown_general_category_count += unknowns
            outfile.writelines(results)

    print(f"Processed: {mismatch_count} mismatches, {unknown_general_category_count} unknown/general categories")

#### Functions for checking categories

In [46]:
def group_by_category(input_path):
    category_dict = {}

    with open(input_path, 'r', encoding='utf-8') as infile:
        for line in infile:
            parts = line.strip().split()
            if len(parts) < 3:
                continue
            word, count, category = parts[0], parts[1], parts[2]
            if category not in category_dict:
                category_dict[category] = []
            category_dict[category].append((word, count))

    return category_dict

def get_top_categories(category_dict, min_words=10, should_print=True):
    ordered_category_dict = dict(sorted(category_dict.items(), key=lambda item: len(item[1]), reverse=True))
    top_categories_dict = {}
    cnt = 0
    for category, words in ordered_category_dict.items():
        if len(words) < min_words:
            continue
        if should_print:
            print(f"Category: {category}, Words count: {len(words)}")
        top_categories_dict[category] = words
        cnt += 1

    if should_print:
        print(f"\nCategories with {min_words}+ words count: {cnt}")

    return top_categories_dict


def get_nouns_lengths_sorted(input_path):
    counts = []
    with open(input_path, 'r', encoding='utf-8') as infile:
        for line in infile:
            parts = line.strip().split()
            if len(parts) < 3:
                continue
            word, count, category = parts[0], parts[1], parts[2]
            counts.append(len(word))


    counter = Counter(counts)
    sorted_counter = dict(sorted(counter.items(), key=lambda item: item[0]))

    return sorted_counter


async def save_top_categories(top_categories_dict, original_dictionary_language, output_path):
    with open(output_path, 'w', encoding='utf-8') as outfile:
        for category, entry in top_categories_dict.items():
            if "_" in category or "-" in category:
                category = category.replace("_", " ")
            category_translation = await translator.translate(category, src='en', dest=original_dictionary_language)
            original_language_category = category_translation.text.lower()
            for word, count in entry:
                outfile.write(f"{word} {count} {original_language_category}\n")

## Running code

In [59]:
original_dictionary_language = 'pl'
directory = f"data/{original_dictionary_language}"

input_path = f"{directory}/{original_dictionary_language}_50k.txt"
assert os.path.exists(input_path), f"File {input_path} does not exist. Please download the file from: https://github.com/hermitdave/FrequencyWords"

output_path_categorized_dict = f"{directory}/{original_dictionary_language}_output_categorized.txt"
output_path_top_categories = f"{directory}/{original_dictionary_language}_top_categories.txt"

words_with_occurrences = read_file_to_memory(input_path)
nouns_with_occurrences = get_valid_lemmatized_nouns(words_with_occurrences)
# processing time: ~36s

Processing lines: 100%|██████████| 50000/50000 [00:33<00:00, 1475.05it/s]

Valid lemmatized nouns count: 17067
Deduplicated, valid, lemmatized nouns count: 10606





In [60]:
asyncio.run(categorize_nouns(nouns_with_occurrences, output_path_categorized_dict, original_dictionary_language))
# processing time for 13k records: ~60s

Processing batches: 100%|██████████| 332/332 [00:46<00:00,  7.13it/s]

Processed: 4563 mismatches, 0 unknown/general categories





In [61]:
category_dict = group_by_category(output_path_categorized_dict)
top_categories_dict = get_top_categories(category_dict, min_words=10, should_print=True)
asyncio.run(save_top_categories(top_categories_dict, original_dictionary_language, output_path_top_categories))

Category: person, Words count: 25
Category: activity, Words count: 14
Category: structure, Words count: 13
Category: container, Words count: 11
Category: property, Words count: 11
Category: dish, Words count: 11
Category: Gregorian_calendar_month, Words count: 10
Category: woman, Words count: 10
Category: state, Words count: 10
Category: area, Words count: 10

Categories with 10+ words count: 10


In [62]:
counter = get_nouns_lengths_sorted(output_path_categorized_dict)
print("Counts of words by length:")
for length, count in counter.items():
    print(f"Length {length}: {count} words")

Counts of words by length:
Length 4: 339 words
Length 5: 569 words
Length 6: 584 words
Length 7: 546 words
Length 8: 463 words


## Notes
- concurrency equal 48 is set after several attempts to find the best value
- we can have more words in dict, if we classify words into multiple categories (i.e. 'nail' can be 'nail' and 'finger_nail')
- there is not specific error handling