## 1 Data Extraction And Cleaning

In [3]:
import regex
import pandas as pd
import spacy

In [4]:
def extract_raw_matches(file_source: str):
    regex_pattern = '(^\d{1,4} (?!\|))((.*\n){0,8}?.*?(?=\d+ \|))'
    with open(file_source, 'r') as data:
        string = data.read()

    unfiltered_matches = regex.findall(regex_pattern, string, regex.MULTILINE)
    unfiltered_matches.pop(0)

    filtered_matches = []
    next_match = 1
    for match in unfiltered_matches:
        if int(match[0]) == next_match:
            filtered_matches.append(match)
            next_match +=1

    return filtered_matches

matches = extract_raw_matches('source.txt')

In [5]:
def parse_raw_matches_into_dict(matches):
    result = []
    for match in matches:
        result.append({'frequency_idx': int(match[0]), "raw_match": regex.sub('\d+', "", match[1].replace("|", ""))})
    return result
match_dict = parse_raw_matches_into_dict(matches)

In [6]:
def parse_raw_sentence(dictionary):
    str_to_parse = dictionary['raw_match'].replace('æ','œ').replace('’', "'")
    part_of_speach_codes = {'adj', 'adji','adji\(pl\)', 'adv', 'conj', 'det', 'intj', 'n', 'nf', 'nm', 'nadj', 'prep', 'pro', 'v', 'nmi', 'nfi', 'nmpl', 'nfpl', 'adj\(f\)', 'nadj\(f\)', 'nm\(pl\)', 'nf\(pl\)', 'adj\(pl\)', 'nadj\(pl\)', 'nmfi', "adjf", "nadjpl"  }
    
    min = None
    max = None
    for i in part_of_speach_codes:
        part_of_speach_regex =f' {i}[, ]'
        match =  regex.search(part_of_speach_regex, str_to_parse,regex.MULTILINE)
        if match:
            start, end = match.span()
            if min is None or start < min:
                min = start
            if max is None or end > max:
                max = end
    
    if min is None and max is None:
        end_of_first_word = str_to_parse.find(' ')
        min, max = end_of_first_word, end_of_first_word
            
    dictionary['french_word'] = str_to_parse[:min].strip()
    dictionary['pos_codes'] = str_to_parse[min:max].strip()
    dictionary['word_english'] = str_to_parse[max:].split('\n')[0]
    split_sentence = str_to_parse[max:].replace(dictionary['word_english'], "", 1).replace('\n', "").split('–')
    if len(split_sentence) == 1:
        split_sentence =  str_to_parse[max:].replace(dictionary['word_english'], "", 1).replace('\n', "").split('-')
    sentence_french = split_sentence[0].strip()
    sentence_english = "-".join(split_sentence[1:]).strip()
    dictionary['sentence_french'], dictionary["sentence_english"] = sentence_french, sentence_english
    return dictionary

In [7]:
failures = []
for dict in match_dict:
    try:
        parse_raw_sentence(dict)
    except Exception as e:
        failures.append({'frequency_idx': dict['frequency_idx'], 'exception': str(e)})

## 2 Replace Target Word In Example Sentence

In [8]:
nlp = spacy.load('fr_core_news_lg')

In [9]:
def replace_word_in_sentence(word, sentence):
    word_regex = "(?<=(\.|,|\?| |^|'|-))" + word + "(?=(\.|,|\?| |$||\!|-))"
    new_sentence = regex.sub(word_regex, "___", sentence)
    return new_sentence

def delete_lemma_from_example(word, sentence):
    doc = nlp(sentence)
    word_doc = nlp(word)
    input_word_token = word_doc[0]
    tokens = [{"original": token.text, "lemma": token.lemma_, 'similarity': token.similarity(input_word_token)} for token in doc]
    tokens.sort(key=lambda x: x["similarity"],reverse=True)
    for token in tokens:
        if token["lemma"] == word or token["original"] == word:
            return True, replace_word_in_sentence(token["original"], sentence)
    
    if len(word_doc) == 1:
        return False, replace_word_in_sentence(tokens[0]['original'], sentence)
    else:
        return False, replace_word_in_sentence(word, sentence)

In [10]:
no_lemma_match = []
replace_failures = []
for item in match_dict:
    word = item['french_word']
    sentence = item['sentence_french']
    success, result = delete_lemma_from_example(word, sentence)

    item['sentence_french_deleted'] = result
    
    if not success:
        no_lemma_match.append(item)
    if "_" not in result:
        replace_failures.append(item)

  tokens = [{"original": token.text, "lemma": token.lemma_, 'similarity': token.similarity(input_word_token)} for token in doc]


In [15]:
for item in match_dict:
    item['pronunciation'] = f"[sound:french_audio_{item['frequency_idx']}.mp3]"

In [22]:
export_df = pd.DataFrame(match_dict)
export_df.drop(inplace=True, columns='raw_match')
export_df.to_csv('basic_french_flashcards.csv', index=False, sep='&')

## 3 Download Audio Sound Files

In [9]:
from config import FORVO_API_KEY, SCRAPER_API_KEY
import asyncio
import aiohttp
import urllib.parse
import sysc
import pandas as pd
import os

In [10]:
async def get_sound_file(dictionary, session, api_key = FORVO_API_KEY,):
    word = urllib.parse.quote(dictionary["french_word"])
    freq_idx = dictionary["frequency_idx"]
    url =  f"https://apifree.forvo.com/key/{api_key}/format/json/action/standard-pronunciation/word/{word}/language/fr"
    file_name = f"french_audio_{freq_idx}.mp3"
    path = "mp3_files/"
    try:
        async with session.get(url) as response:
            json_response = await response.json()
            mp3_download_url = json_response["items"][0]["pathmp3"]
        async with session.get(mp3_download_url) as mp3_response:
            mp3_file = await mp3_response.read()
        if sys.getsizeof(mp3_file) < 100:
            raise ValueError('No File Downloaded')
        with open(f'{path}{file_name}', 'wb') as local_file:
            local_file.write(mp3_file)
            print(f'Downloaded {freq_idx}')
    except Exception as e:
        return (dictionary, str(e))
    


async def get_multiple_sound_files(match_dict):
    all_responses = []
    async with aiohttp.ClientSession() as session:
        chunked_match_dicts = [match_dict[i:i + 5] for i in range(0, len(match_dict), 5)]
        for chunked_match_dict in chunked_match_dicts:
            tasks = []
            tasks.append(asyncio.sleep(5))
            for item in chunked_match_dict:
                tasks.append(get_sound_file(item, session))
            responses = await asyncio.gather(*tasks)
            all_responses.extend([response for response in responses if response is not None])
    return all_responses

In [11]:
# try_again = await get_multiple_sound_files(match_dict[2130:2380])

In [12]:
def get_failed_downloads(dir):
    mp3_files = os.listdir('./mp3_files/')
    failed_downloads = [file for file in mp3_files if os.path.getsize(f'{dir}{file}') <= 200]
    failed_downloads_idx = [int(filename.split("_")[2].split('.')[0]) - 1 for filename in failed_downloads]
    failed_donwloads_dict_items = [match_dict[i] for i in failed_downloads_idx]
    return failed_donwloads_dict_items

In [13]:
failed_donwloads_dict_items = get_failed_downloads("./mp3_files/")

## 4 Download Verb Conjugations

### Extract Dataframe of Verbs

In [25]:
df = pd.DataFrame(match_dict)
is_verb = lambda x: True if regex.search('(?<=( |^))v(?=( |$))', x) else False
df_verbs = df[df.pos_codes.apply(is_verb)]

In [17]:
import bs4
from bs4 import BeautifulSoup

In [18]:
async def get_verb_conjugation(word, session):
    url = f'https://www.wordreference.com/conj/frverbs.aspx?v={word}'
    params = {'api_key': SCRAPER_API_KEY, 'url': url}
    endpoint = "http://api.scraperapi.com"
    async with session.get(endpoint, params = params) as response:
        if response.status != 200:
            print(response)
            raise ValueError('Invalid HTTP status')
        response_text =  await response.text()
    return response_text

In [19]:
## Returns True if conjugation table does not contain any items with the css class 'antiquated'

def find_antiquated_conjugation_table(table):
    iterable = table.descendants
    for i in iterable:
        if type(i) == bs4.element.Tag and 'class' in i.attrs and 'antiquated' in i['class']: 
            return False
    return True

In [116]:
# Parses HTML response, removes all tables with antiquated conjugations, returns list of records with all relevant conjugations

def process_conjugation_response(response_text):
    soup = BeautifulSoup(response_text, "html5lib")
    conjugation_tables = soup.find_all('table', 'neoConj')
    relevant_conjugation_tables = list(filter(find_antiquated_conjugation_table, conjugation_tables))
    parsed_conjugation_tables = [pd.read_html(str(table), flavor='html5lib')[0] for table in relevant_conjugation_tables]
    parsed_conjugation_dicts = [table.to_dict('records') for table in parsed_conjugation_tables]
    return parsed_conjugation_dicts
    

In [20]:
async with aiohttp.ClientSession() as session:
    response_text = await get_verb_conjugation('parler', session)
    parsed_conjugation_records = process_conjugation_response(response_text)


In [136]:
parsed_conjugation_records = process_conjugation_response(result)
print(parsed_conjugation_records)

[[{'présent': 'je', 'présent.1': 'parle'}, {'présent': 'tu', 'présent.1': 'parles'}, {'présent': 'il, elle, on', 'présent.1': 'parle'}, {'présent': 'nous', 'présent.1': 'parlons'}, {'présent': 'vous', 'présent.1': 'parlez'}, {'présent': 'ils, elles', 'présent.1': 'parlent'}], [{'imparfait': 'je', 'imparfait.1': 'parlais'}, {'imparfait': 'tu', 'imparfait.1': 'parlais'}, {'imparfait': 'il, elle, on', 'imparfait.1': 'parlait'}, {'imparfait': 'nous', 'imparfait.1': 'parlions'}, {'imparfait': 'vous', 'imparfait.1': 'parliez'}, {'imparfait': 'ils, elles', 'imparfait.1': 'parlaient'}], [{'passé simple': 'je', 'passé simple.1': 'parlai'}, {'passé simple': 'tu', 'passé simple.1': 'parlas'}, {'passé simple': 'il, elle, on', 'passé simple.1': 'parla'}, {'passé simple': 'nous', 'passé simple.1': 'parlâmes'}, {'passé simple': 'vous', 'passé simple.1': 'parlâtes'}, {'passé simple': 'ils, elles', 'passé simple.1': 'parlèrent'}], [{'futur simple': 'je', 'futur simple.1': 'parlerai'}, {'futur simple': 

In [144]:
def convert_to_result_dictionary(records, word):
    result = {'base_form': word}
    for record in records:
        for dict in record:
            dict_as_lst = list(dict.items())
            tense = dict_as_lst[0][0]
            person = dict_as_lst[0][1]
            conjugation = dict_as_lst[1][1]
            if not isinstance(tense, str) or not isinstance(person, str) or not isinstance(conjugation, str):
                continue
            format_key = lambda x: x.replace(',', '').replace(' ', '_')
            key = format_key(f'{tense}_{person}')
            result[key] = conjugation
    return result

convert_to_result_dictionary(parsed_conjugation_records, 'parler')

{'base_form': 'parler',
 'présent_je': 'parle',
 'présent_tu': 'parlerais',
 'présent_il_elle_on': 'parlerait',
 'présent_nous': 'parlerions',
 'présent_vous': 'parleriez',
 'présent_ils_elles': 'parleraient',
 'imparfait_je': 'parlais',
 'imparfait_tu': 'parlais',
 'imparfait_il_elle_on': 'parlait',
 'imparfait_nous': 'parlions',
 'imparfait_vous': 'parliez',
 'imparfait_ils_elles': 'parlaient',
 'passé_simple_je': 'parlai',
 'passé_simple_tu': 'parlas',
 'passé_simple_il_elle_on': 'parla',
 'passé_simple_nous': 'parlâmes',
 'passé_simple_vous': 'parlâtes',
 'passé_simple_ils_elles': 'parlèrent',
 'futur_simple_je': 'parlerai',
 'futur_simple_tu': 'parleras',
 'futur_simple_il_elle_on': 'parlera',
 'futur_simple_nous': 'parlerons',
 'futur_simple_vous': 'parlerez',
 'futur_simple_ils_elles': 'parleront',
 "passé_composé_j'": 'ai parlé',
 'passé_composé_tu': 'as parlé',
 'passé_composé_il_elle_on': 'a parlé',
 'passé_composé_nous': 'avons parlé',
 'passé_composé_vous': 'avez parlé',
 '