In [None]:
import requests
from tqdm import tqdm

# URL of the file (direct link to raw content)
file_url = "https://huggingface.co/datasets/lbourdois/panlex/resolve/main/panlex.csv"

# Local filename to save the file
local_filename = "panlex.csv"

# Stream the body instead of loading it into memory at once.
# - The `with` block releases the connection even if the status check or a
#   disk write fails part-way through.
# - timeout=(connect, read) in seconds stops a stalled server from hanging
#   the cell forever (requests waits indefinitely when no timeout is given).
with requests.get(file_url, stream=True, timeout=(10, 60)) as response:
    response.raise_for_status()

    # Total size in bytes from the Content-Length header. When the header is
    # absent (or zero) use None so tqdm shows an open-ended counter rather
    # than a bar with a bogus total of 0.
    total_size = int(response.headers.get("content-length", 0)) or None

    # Download and save the file with a progress bar
    with open(local_filename, "wb") as file, tqdm(
        desc="Downloading",
        total=total_size,
        unit="B",
        unit_scale=True,
        unit_divisor=1024,
    ) as progress:
        for chunk in response.iter_content(chunk_size=8192):
            if not chunk:
                # Nothing to write for an empty chunk.
                continue
            file.write(chunk)
            progress.update(len(chunk))

print(f"File downloaded successfully as: {local_filename}")


In [None]:
import pandas as pd

# Read the semicolon-delimited file 'panlex.csv' into its own variable so
# that re-running this cell always starts from the unfiltered data.
# NOTE(review): read_csv's default NA parsing turns literal cells such as
# "NA", "null" or "nan" into missing values — confirm whether those words
# are expected to survive in 'vocab'.
panlex_raw = pd.read_csv('panlex.csv', sep=';')

# Keep only the rows whose 'english_name_var' is exactly 'English'.
english_rows = panlex_raw[panlex_raw['english_name_var'] == 'English']

# Flag entries that contain a space (multi-word entries).
# - regex=False: the pattern is a literal space, no regex needed.
# - na=True: a missing 'vocab' is flagged too, so it is dropped below;
#   without `na` the mask would contain NaN and `~` would raise TypeError.
has_space = english_rows['vocab'].str.contains(' ', regex=False, na=True).astype(bool)

# .copy() makes `df` an independent frame: a later cell assigns a new column
# to `df`, which on a slice would trigger SettingWithCopyWarning.
df = english_rows[~has_space].copy()

In [None]:
import pandas as pd
from transformers import AutoTokenizer
from tqdm import tqdm

# Load the tokenizer published under the "meta-llama/Llama-3.1-8B" identifier.
# NOTE(review): access to this repository may require an authenticated
# Hugging Face account — confirm before running on a fresh machine.
model_id = "meta-llama/Llama-3.1-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)

def filter_tokens(word):
    """Return `word` if the tokenizer splits it into exactly one token.

    Uses the module-level `tokenizer` object's `tokenize` method.

    Args:
        word: The text to tokenize.

    Returns:
        `word` unchanged when tokenization yields exactly one token,
        otherwise None (zero tokens or more than one token).
    """
    token_count = len(tokenizer.tokenize(word))
    return word if token_count == 1 else None

# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()

# Run filter_tokens over every 'vocab' entry (with a progress bar); entries
# that are not a single token come back as None.
single_token_words = df['vocab'].progress_apply(filter_tokens)
df['filtered'] = single_token_words

# Keep only the rows whose 'filtered' value is present.
df_filtered = df[df['filtered'].notna()]

In [None]:
# Strings to exclude from the word list: the decimal strings "0" through
# "100" plus a handful of bracket and punctuation characters.
special_chars = [str(i) for i in range(0, 101)] + ['{', '}', '(', ')', '[', ']', '-', '+', '.', ',', ';', '"']

keywords_list = {}

# 'ENG': unique 'vocab' values from df_filtered, minus the excluded strings.
# sorted() gives a stable order; list(set(...)) ordering can differ between
# kernel sessions because string hashing is randomized per process.
keywords_list['ENG'] = sorted(set(df_filtered['vocab'].values) - set(special_chars))

# 'ENGPURE': 'ENG' minus whatever is stored under 'all'.
# keywords_list is created empty a few lines above, so a plain
# keywords_list['all'] lookup raises KeyError here; fall back to an empty
# collection so the cell runs.
# NOTE(review): 'all' is presumably meant to hold keywords collected
# elsewhere — populate keywords_list['all'] before this line if so,
# otherwise 'ENGPURE' is identical to 'ENG'.
keywords_list['ENGPURE'] = sorted(set(keywords_list['ENG']) - set(keywords_list.get('all', ())))