## 1 Data Extraction And Cleaning


In [1]:
import regex
import pandas as pd
import spacy


In [2]:
def extract_raw_matches(file_source: str, encoding: str = "utf-8"):
    """Extract the sequentially numbered entries from a raw text file.

    An entry starts at the beginning of a line with a 1-4 digit number and a
    space (not followed by ``|``) and extends, across at most eight complete
    lines plus a partial one, up to the next ``<digits> |`` marker.

    Args:
        file_source: Path of the text file to read.
        encoding: Encoding used to decode the file. Given explicitly so the
            result does not depend on the platform's default encoding.

    Returns:
        A list of ``findall`` tuples ``(number_prefix, body, last_full_line)``
        whose numbers count up 1, 2, 3, ... without gaps. ``number_prefix``
        keeps its trailing space (``int()`` tolerates it).
    """
    entry_pattern = r"(^\d{1,4} (?!\|))((.*\n){0,8}?.*?(?=\d+ \|))"
    with open(file_source, "r", encoding=encoding) as data:
        text = data.read()

    # The very first hit is always discarded (presumably a false positive
    # ahead of entry 1 -- TODO confirm). Slicing instead of pop(0) keeps this
    # from raising IndexError when the file contains no match at all.
    candidates = regex.findall(entry_pattern, text, regex.MULTILINE)[1:]

    # Keep only the hits that continue the running 1, 2, 3, ... sequence;
    # any other number is treated as a stray match and skipped.
    filtered_matches = []
    expected_number = 1
    for candidate in candidates:
        if int(candidate[0]) == expected_number:
            filtered_matches.append(candidate)
            expected_number += 1

    return filtered_matches


# Read the raw text file and keep only the sequentially numbered entries.
matches = extract_raw_matches("source.txt")


In [3]:
def parse_raw_matches_into_dict(matches):
    """Convert raw match tuples into one dictionary per entry.

    Args:
        matches: Iterable of tuples whose first item is the entry number as
            text and whose second item is the raw entry body.

    Returns:
        A list of dicts with two keys: ``frequency_idx`` (the entry number as
        ``int``) and ``raw_match`` (the body with every ``|`` and every run
        of digits removed).
    """
    return [
        {
            "frequency_idx": int(match[0]),
            # Raw string: "\d" in a normal literal is an invalid escape.
            "raw_match": regex.sub(r"\d+", "", match[1].replace("|", "")),
        }
        for match in matches
    ]


# NOTE: despite its name this is a list of dicts, one per extracted entry.
match_dict = parse_raw_matches_into_dict(matches)


In [5]:
# Part-of-speech abbreviations looked for in an entry, written as regex
# fragments (literal parentheses are escaped).
_POS_CODES = (
    "adj",
    "adji",
    r"adji\(pl\)",
    "adv",
    "conj",
    "det",
    "intj",
    "n",
    "nf",
    "nm",
    "nadj",
    "prep",
    "pro",
    "v",
    "nmi",
    "nfi",
    "nmpl",
    "nfpl",
    r"adj\(f\)",
    r"nadj\(f\)",
    r"nm\(pl\)",
    r"nf\(pl\)",
    r"adj\(pl\)",
    r"nadj\(pl\)",
    "nmfi",
    "adjf",
    "nadjpl",
)


def parse_raw_sentence(dictionary):
    """Split an entry's raw text into word, POS codes, gloss and example.

    The text is cut at the span covered by the part-of-speech codes: what
    precedes it is the French word, the rest of that line is the English
    gloss, and the following lines (joined without separator) hold the
    example, split into French and English at the first en dash, or at the
    first hyphen when there is no en dash.

    Args:
        dictionary: Entry dict with a ``raw_match`` string. It is updated in
            place with the keys ``french_word``, ``pos_codes``,
            ``word_english``, ``sentence_french`` and ``sentence_english``.

    Returns:
        The same ``dictionary`` object.
    """
    text = dictionary["raw_match"].replace("æ", "œ").replace("’", "'")

    # Span from the earliest start to the latest end over the first
    # occurrence of every code (a code must follow a space and be followed
    # by a comma or a space).
    # NOTE(review): the search covers the whole text, example included, so a
    # code-like token inside the example would widen the span -- confirm.
    pos_start = None
    pos_end = None
    for code in _POS_CODES:
        found = regex.search(f" {code}[, ]", text)
        if found is None:
            continue
        start, end = found.span()
        if pos_start is None or start < pos_start:
            pos_start = start
        if pos_end is None or end > pos_end:
            pos_end = end

    if pos_start is None:
        # No POS code found: cut after the first word. Without any space the
        # whole text is the word (find() returns -1, which used as a slice
        # bound would chop off the last character).
        first_space = text.find(" ")
        if first_space == -1:
            first_space = len(text)
        pos_start = pos_end = first_space

    dictionary["french_word"] = text[:pos_start].strip()
    dictionary["pos_codes"] = text[pos_start:pos_end].strip()

    # First line after the codes is the gloss; the remaining lines, with the
    # line breaks removed, form the bilingual example.
    gloss, _, example = text[pos_end:].partition("\n")
    dictionary["word_english"] = gloss
    example = example.replace("\n", "")

    # partition() keeps everything after the first separator untouched, so
    # further dashes inside the translation are preserved as written.
    separator = "–" if "–" in example else "-"
    sentence_french, _, sentence_english = example.partition(separator)
    dictionary["sentence_french"] = sentence_french.strip()
    dictionary["sentence_english"] = sentence_english.strip()
    return dictionary


In [6]:
# Parse every entry in place and record the ones that raise, so a single
# malformed entry does not abort the whole run.
failures = []
for entry in match_dict:  # not named `dict`: that would rebind the builtin
    try:
        parse_raw_sentence(entry)
    except Exception as e:  # deliberate best-effort: collect, don't crash
        failures.append({"frequency_idx": entry["frequency_idx"], "exception": str(e)})


## 2 Replace Target Word In Example Sentence


In [7]:
# Load the spaCy pipeline "fr_core_news_lg" once; it is reused for every
# lemma and similarity lookup in delete_lemma_from_example.
nlp = spacy.load("fr_core_news_lg")


In [8]:
def replace_word_in_sentence(word, sentence):
    """Replace every whole-word occurrence of *word* in *sentence* by ``___``.

    An occurrence counts only when it is neither preceded nor followed by a
    word character, so ``le`` is not blanked inside ``les`` while ``homme``
    is still found in ``l'homme``.

    Args:
        word: Literal text to blank out; regex metacharacters are escaped.
        sentence: Sentence to search.

    Returns:
        The sentence with each occurrence replaced, or the sentence unchanged
        when *word* is empty or does not occur as a whole word.
    """
    if not word:
        # An empty pattern would match at every boundary position.
        return sentence
    # escape(): the caller may pass arbitrary token text such as "(" or ".".
    word_regex = r"(?<!\w)" + regex.escape(word) + r"(?!\w)"
    return regex.sub(word_regex, "___", sentence)


def delete_lemma_from_example(word, sentence):
    """Blank out the form of *word* that appears in *sentence*.

    The sentence tokens are ranked by similarity to the first token of
    *word*. The best-ranked token whose lemma or text equals *word* is
    replaced. If none matches, the most similar token is replaced when
    *word* is a single token; otherwise *word* itself is searched for
    literally.

    Args:
        word: Dictionary form of the target word.
        sentence: Example sentence containing (a form of) the word.

    Returns:
        ``(matched, new_sentence)`` where ``matched`` is ``True`` only when a
        token's lemma or text equalled *word*. For a blank *word* or
        *sentence* the result is ``(False, sentence)``.
    """
    # Guard against blank input: the [0] lookups below would presumably find
    # no token to index and raise IndexError.
    if not word.strip() or not sentence.strip():
        return False, sentence

    doc = nlp(sentence)
    word_doc = nlp(word)
    input_word_token = word_doc[0]
    tokens = [
        {
            "original": token.text,
            "lemma": token.lemma_,
            "similarity": token.similarity(input_word_token),
        }
        for token in doc
    ]
    # Most similar first, so that among several matching tokens the closest
    # one is chosen and tokens[0] is the best fallback candidate.
    tokens.sort(key=lambda x: x["similarity"], reverse=True)
    for token in tokens:
        if word in (token["lemma"], token["original"]):
            return True, replace_word_in_sentence(token["original"], sentence)

    if len(word_doc) == 1:
        return False, replace_word_in_sentence(tokens[0]["original"], sentence)
    return False, replace_word_in_sentence(word, sentence)


In [9]:
# Blank the target word in every example sentence, collecting the entries
# where no lemma matched and the ones where nothing was replaced at all.
no_lemma_match = []
replace_failures = []
for item in match_dict:
    # Entries whose parsing raised (recorded in `failures`) may lack these
    # keys; skip them instead of aborting the loop with a KeyError.
    if "french_word" not in item or "sentence_french" not in item:
        continue

    word = item["french_word"]
    sentence = item["sentence_french"]
    success, result = delete_lemma_from_example(word, sentence)

    item["sentence_french_deleted"] = result

    if not success:
        no_lemma_match.append(item)
    # The placeholder is "___", so no underscore means no replacement.
    if "_" not in result:
        replace_failures.append(item)


  tokens = [{"original": token.text, "lemma": token.lemma_, 'similarity': token.similarity(input_word_token)} for token in doc]


In [10]:
# Give every entry a "[sound:...]" tag naming an mp3 file numbered by its
# frequency index (presumably consumed by the flashcard tool -- confirm).
for item in match_dict:
    item.update(pronunciation=f"[sound:french_audio_{item['frequency_idx']}.mp3]")


In [None]:
# Build the export table without the intermediate "raw_match" column and
# write it as "&"-separated text without the index column.
export_df = pd.DataFrame(match_dict).drop(columns="raw_match")
export_df.to_csv("basic_french_flashcards.csv", sep="&", index=False)
