Sources:
- https://github.com/dwyl/english-words/blob/master/words_dictionary.json
- https://dumps.wikimedia.org/enwiktionary
- https://gcide.gnu.org.ua/download
- https://wordnet.princeton.edu/download
- http://wordlist.aspell.net/ aka SCOWL (Spell Checker Oriented Word Lists)

In [183]:
import os
import re
import json
from typing import List, Optional
import pandas as pd


import IPython.display

import xmltodict
from lxml import etree

### GCIDE

In [102]:
def read_gcide_data(file_path):
    # Read XML data
    with open(file_path, "r") as file:
        xml_data = file.read()
    
    # Wrap with root
    wrapped_xml_data = f"<root>{xml_data}</root>"
    
    # Parse ignoring entity errors
    parser = etree.XMLParser(recover=True)
    tree = etree.fromstring(wrapped_xml_data.encode(), parser=parser)
    
    # Convert to dictionary if needed
    parsed_data = xmltodict.parse(etree.tostring(tree))
    
    d_out = {}
    
    for item in parsed_data['root']['p']:
        if item is None:
            continue
        ent = item.get('ent')
        if isinstance(ent, list):
            ent = ent[0]
        if isinstance(ent, dict):
            ent = list(ent.keys())[0]
        if ent:  # Check if 'ent' exists in the current entry
            cleaned_entry = {}
            for k, v in item.items():
                if k != 'ent':  # Exclude 'ent' itself
                    # Handle unhashable types by converting them to strings
                    if isinstance(v, (list, dict)):
                        # Convert lists/dicts to string
                        cleaned_entry[k] = str(v)
                    else:
                        cleaned_entry[k] = v
            ent = str.lower(ent)
            d_out[ent] = cleaned_entry
    return d_out

In [128]:
# Specify the directory path

def get_filelist(directory_path: str, except_list: Optional[List[str]] = None) -> List[str]:
    # Get all files with extensions in the specified directory
    files_with_extensions = [f for f in os.listdir(directory_path) if os.path.isfile(os.path.join(directory_path, f)) and '.' in f]
    # files_with_extensions = [os.path.abspath(os.path.join(directory_path, f)) for f in os.listdir(directory_path)
                             # if os.path.isfile(os.path.join(directory_path, f)) and '.' in f]
    if except_list is not None:
        fl = set(files_with_extensions) - set(except_list)
    else:
        fl = files_with_extensions
    fl = [os.path.abspath(os.path.join(directory_path, f)) for f in fl]
    return fl

In [103]:
dir_path = "gcide_xml-0.53/"
except_list = ['gcide_authorities.xml', 'gcide_abbreviations.xml', 'gcide.xml']
fl = get_filelist(dir_path, except_list)
d_gcide = {}
for f in fl:
    d_gcide.update(read_gcide_data(f))

In [104]:
len(d_gcide.keys())

108182

In [119]:
d_gcide['r']

{'br': '[None, None]',
 'hw': 'R',
 'pr': '(r)',
 'def': "{'xex': ['semivowel', 'liquid', 'Guide to Pronunciation'], '#text': 'R, the eighteenth letter of the English alphabet, is a vocal consonant.  It is sometimes called a , and a . See ,  178, 179, and 250-254.'}",
 'xex': 'R',
 'rj': "{'au': 'B. Jonson.'}",
 'source': '1913 Webster',
 '#text': ".   is the dog's letter and hurreth in the sound.  \n[]"}

### Wordnet

In [171]:
import re

def read_wordnet_dict(file_path):
    with open(file_path, "r", encoding="ISO-8859-1") as file:  # Added encoding to handle potential issues
        wordnet_data = file.read()

    # Define regex patterns for each part of speech:
    # 1. Adjective (a)
    adj_pattern = r"(\d{8})\s+\d{2}\s+a\s+\d{2}\s+([a-zA-Z_]+)\s+\d{1,2}\s+[\w=\s\+!]*\s+\|\s+(.*)"
    # 2. Adverb (r)
    adv_pattern = r"(\d{8})\s+\d{2}\s+r\s+\d{2}\s+([a-zA-Z_]+)\s+\d{1,2}\s+[\w=\s\+!]*\s+\|\s+(.*)"
    # 3. Noun (n)
    noun_pattern = r"(\d{8})\s+\d{2}\s+n\s+\d{2}\s+([a-zA-Z_]+)\s+\d{1,2}\s+[\w=\s\+!]*\s+\|\s+(.*)"
    # 4. Verb (v)
    verb_pattern = r"(\d{8})\s+\d{2}\s+v\s+\d{2}\s+([a-zA-Z_]+)\s+\d{1,2}\s+[\w=\s\+!]*\s+\|\s+(.*)"
    
    wn = {}

    # Process the data line by line and match using each pattern
    for line in wordnet_data.splitlines():
        # Check each pattern one by one
        match_adj = re.match(adj_pattern, line)
        match_adv = re.match(adv_pattern, line)
        match_noun = re.match(noun_pattern, line)
        match_verb = re.match(verb_pattern, line)

        # If any of the patterns match, process the word and its definition
        if match_adj:
            word = match_adj.group(2)
            definition = match_adj.group(3)
            wn[word] = {'def': definition, 'pos': 'adjective', 'source_file': file_path.split('/')[-1]}
        elif match_adv:
            word = match_adv.group(2)
            definition = match_adv.group(3)
            wn[word] = {'def': definition, 'pos': 'adverb', 'source_file': file_path.split('/')[-1]}
        elif match_noun:
            word = match_noun.group(2)
            definition = match_noun.group(3)
            wn[word] = {'def': definition, 'pos': 'noun', 'source_file': file_path.split('/')[-1]}
        elif match_verb:
            word = match_verb.group(2)
            definition = match_verb.group(3)
            wn[word] = {'def': definition, 'pos': 'verb', 'source_file': file_path.split('/')[-1]}

    return wn

In [172]:
dir_path = "wordnet"
except_list = ['index.adj', 'index.adv', 'index.noun', 'index.sense', 'index.verb']
fl = get_filelist(dir_path, except_list)
d_wn = {}
for f in fl:
    d_wn.update(read_wordnet_dict(f))
len(d_wn.keys())

1573

### SCOWL

In [153]:
def read_scowl_dict(file_path):
    with open(file_path, "r", encoding="ISO-8859-1") as file:
        scowl_data = file.read()
    words = scowl_data.splitlines()
    
    # Dictionary to store base form words
    base_words_dict = {}
    
    # Loop over each word in the list
    for word in words:
        # Remove possessive 's' or any non-alphabetic characters at the end of the word
        base_word = re.sub(r"'s?$|[^a-zA-Z]", "", word)
        
        # Add the base word to the dictionary with value 1
        base_words_dict[base_word] = 1
    
    # Display the final dictionary
    return base_words_dict

In [155]:
dir_path = "scowl"
except_list = ['README']
fl = get_filelist(dir_path)
d_scowl = {}
for f in fl:
    d_scowl.update(read_scowl_dict(f))

### From excel files

In [213]:
# Read the Excel/CSV file
def read_excel_dict(file_path):
    df = pd.read_excel(file_path)  # Use pd.read_csv for CSV files
    
    # Initialize the dictionary
    data_dict = {}
    
    # Iterate through each row to build the dictionary
    for _, row in df.iterrows():
        # Ensure the main_word is a string
        main_word = str(row['Word']) if pd.notna(row['Word']) else 'Unknown'
    
        # Handle POS with valid checks for NaN or invalid values
        pos = row['POS'] if pd.notna(row['POS']) and row['POS'] != '#N/A' else None
    
        # Extract synonyms and filter out blanks or NaNs
        synonyms = [
            row.get(f'Syn0{i}', None) 
            for i in range(1, 6)
            if pd.notna(row.get(f'Syn0{i}', None)) and row[f'Syn0{i}'] != '#N/A'
        ]
        
        # Populate the dictionary
        data_dict[main_word] = {
            'pos': pos,
            'syn': synonyms
        }
    return data_dict

In [214]:
# MyWordNet
file_path = 'excel/MyWordNet.xlsx' 
d_mywnet = read_excel_dict(file_path=file_path)

In [215]:
list(d_mywordnet.keys())[-5:]

['zoo', 'zoological', 'zoologist', 'zoology', 'zoom']

In [216]:
# MyWordNet
file_path = 'excel/Thesaurus_a-z.xlsx'
d_thesaz = read_excel_dict(file_path=file_path)

In [217]:
print(list(d_thesaz.keys())[:50])

["'s gravenhage", "'tween decks", '0.22', '.22-calibre', '.22 caliber', '.22 calibre', '.38-caliber', '.38-calibre', '.38 caliber', '.38 calibre', '.45-caliber', '.45-calibre', '.45 caliber', '.45 calibre', '0', '1', '1-dodecanol', '1-hitter', '10', '10-membered', '100', '1000', '10000', '100000', '1000000', '1000000000', '1000000000000', '1000th', '100th', '101', '101st', '105', '105th', '10th', '11', '11-plus', '110', '110th', '115', '115th', '2021-11-11 00:00:00', '11th', '12', '12-tone music', '12-tone system', '120', '120th', '125', '125th', '12th']


## From englist-words word dict

In [188]:
! pwd

/Users/deepaksingh/Desktop/experiments/wordguru


In [220]:
with open('english-words/words_dict.json', 'r') as f:
    raw = json.load(f)

print(f"raw original: {len(raw.keys())}\n")

wordlists = {
    "d_wn": d_wn,
    "d_scowl": d_scowl,
    "d_gcide": d_gcide,
    "d_mywnet": d_mywordnet,
    "d_thesaz": d_thesaz,
}

for name, wl in wordlists.items():
    print(f"words in {name}: {len(wl.keys())}")
    raw.update(wl)
    print(f"raw after adding {name}: {len(raw.keys())}\n")

raw original: 370100

words in d_wn: 1573
raw after adding d_wn: 370399

words in d_scowl: 526912
raw after adding d_scowl: 543023

words in d_gcide: 108182
raw after adding d_gcide: 559056

words in d_mywnet: 30259
raw after adding d_mywnet: 563831

words in d_thesaz: 145789
raw after adding d_thesaz: 631016



In [221]:
raw_just_words = {k:1 for k,v in raw.items()}
with open('raw_words.json', 'w') as f:
    json.dump(raw_just_words, f, indent=4)