In [1]:
import pandas as pd
from glob import glob

In [None]:
dfs = []
for fname in glob("/Users/francois.weber/Downloads/xa*"):
    print(len(dfs) + 1)

    df = pd.read_json(fname, lines=True)
    
    # only keep FR
    df = df.query("lang_code == 'fr'")
    
    # exclude categories 
    exclude_pos_titles = [
        "Locution nominale", # ex: barbe à papa, bande dessinée 
        "Nom propre",
        "Locution interjective", # ex: joyeux Noël 
        "Locution-phrase", # ex: je t'aime
        "Locution prépositive", 
        "Locution verbale", # exemple : en avoir plein le dos 
        "Locution adjectivale", # exemple : en avoir plein le dos 
        "Proverbe",
        "Forme de locution nominale", 
        "Particule", 
        "Variante par contrainte typographique", # ex: coeur de métier
        "Locution conjonctive", # ex: vu que
        "Pronom possessif", # ex: le mien
        "Forme de locution adjectivale", # ex: cousu de fil blanc
        "Adjectif numéral", # un, deux 
        "Forme de locution conjonctive", # ex: bien qu'
        "Prénom",
        "Nom de famille",
        "Forme de locution verbale", #ex: cru bon
        "Symbole",
        "Suffixe",
        "Forme de variante par contrainte typographique"
        "Préfixe"
    ]

    for exclude_pos_title in exclude_pos_titles:
        df = df[~df.pos_title.str.startswith(exclude_pos_title)]

    # categories that should be flagged as optional 
    not_really_words_pos_titles = [
        "Locution adverbiale",
        "Locution conjonctive",
        "Locution pronominale",
        "Pronom possessif",
        "Pronom indéfini",
        "Variante par contrainte typographique",
        "Forme de locution nominale",
        "Adjectif indéfini",
        "Particule",
        "Locution adverbiale 1",
        "Locution adverbiale 2",
        "Locution adverbiale 3",
        "Onomatopée",
        "Postposition",
    ]

    df["optional_category"] = df.pos_title.isin(not_really_words_pos_titles)
    
    # Exclude derivatives of verbs 
    def is_conjugated_verb(word: pd.Series) -> bool:
        return word.pos_title == "Forme de verbe"

    df["is_conjugated_verb"] = df.apply(is_conjugated_verb, axis=1)
    df = df[~df.is_conjugated_verb]
    
    # deal with derived forms of other words
    def is_form_of_other_word(word: pd.Series) -> bool:
        for sense in word.senses:
            if (forms_of:=sense.get("form_of", None)) is not None:
                for form_of in forms_of:
                    if "word" in form_of:
                        return True 
        return False

    df["is_form_of_other_word"] = df.apply(is_form_of_other_word, axis=1)
    df = df[~df.is_form_of_other_word]
    
    # Deal with specialmulti-word expressions 
    def word_has_spaces(word: pd.Series) -> bool:
        return " " in word.word

    df["word_has_spaces"] = df.apply(word_has_spaces, axis=1)

    dissallow_with_spaces_pos_titles = [
        "Adjectif indéfini", # ex: n'importe quel
        "Pronom indéfini", # ex: les deux
        "Pronom interrogatif", # ex: ce qui
        "Pronom relatif", # ex: ce qui
        "Adverbe interrogatif", # ex: comment ça
        "Forme d’article partitif", # ex: de la
    ]

    for pos_title in dissallow_with_spaces_pos_titles:
        df = df[~(df.word_has_spaces & (df.pos_title == pos_title))]
        
    # deal with gentillés
    def is_gentile(word: pd.Series) -> bool:
        descr = str(word.categories) + str(word.senses)
        return "gentilé" in descr.lower()

    df["is_gentile"] = df.apply(is_gentile, axis=1)
    df = df[~df.is_gentile]
    
    # deal with proper nouns 
        
    dfs.append(df)

1
2
3
4
5
6
7
8


In [13]:
df = pd.concat(dfs).reset_index(drop=True)[["word", "pos", "pos_title", "senses", "tags", "optional_category"]]

# re process smaller dataset

In [14]:
from typing import List


def extract_definitions(senses: pd.Series) -> List[str]:
    defs = []
    for sense in senses:
        if "glosses" in sense:
            defs.extend(sense["glosses"])
    return defs


In [15]:
df["definitions"] = df.pop("senses").apply(extract_definitions)

In [16]:
df

Unnamed: 0,word,pos,pos_title,tags,optional_category,definitions
0,ré-augmenter,verb,Verbe,"[intransitive, transitive]",False,[Augmenter à nouveau.]
1,arrière-grand-mamie,noun,Nom commun,[feminine],False,[La mère de l’un des arrière-grands-parents.]
2,V1,noun,Nom commun,"[feminine, masculine]",False,"[Premier missile de croisière, développé par l..."
3,pouff,noun,Nom commun,[masculine],False,"[Sorte de gros siège mou, sans bois apparent, ..."
4,pffu,intj,Interjection,,False,[Onomatopée qui traduit une forme de désintérêt.]
...,...,...,...,...,...,...
276757,fortunal,noun,Nom commun,[masculine],False,"[Tempête, coup de mer. Encore employé au XVIII..."
276758,palmule,noun,Nom commun,[feminine],False,"[Organe élargi en éventail, sur le corps du ta..."
276759,aka-bo,noun,Nom commun,"[masculine, singular-only]",False,[Langue de la famille grand adamanais parlée a...
276760,aka-kede,noun,Nom commun,"[masculine, singular-only]",False,[Langue de la famille grand adamanais parlée a...


In [17]:
df.to_parquet("/Users/francois.weber/perso/tmp/dico-fr.parquet")