In [None]:
import asyncio
import io
import urllib.parse

import aiohttp
import bs4
import pandas as pd
import regex
from bs4 import BeautifulSoup

from config import SCRAPER_API_KEY

In [None]:
def is_verb(x):
    """Return True when ``x`` contains a standalone ``v`` token.

    A token counts as standalone when it is bounded on each side by a space
    or by the start/end of the string.
    """
    return bool(regex.search("(?<=( |^))v(?=( |$))", x))


# Load the "&"-separated flashcard file and replace missing codes with "".
df = pd.read_csv("basic_french_flashcards.csv", sep="&")
df["pos_codes"] = df["pos_codes"].fillna("")

# Keep only the rows whose codes include "v", trimmed to three columns,
# and expose them as a list of per-row dicts.
df_verbs = df.loc[df["pos_codes"].apply(is_verb)].reset_index()
df_verbs = df_verbs[["frequency_idx", "french_word", "word_english"]]
verb_records = df_verbs.to_dict(orient="records")


In [None]:
async def get_verb_conjugation(word, session):
    """Fetch the WordReference conjugation page for ``word`` through ScraperAPI.

    Args:
        word: Verb to look up. It is percent-encoded before being placed in
            the target URL, so accents, spaces, ``&`` or ``#`` are safe.
        session: Object exposing an aiohttp-style ``get(url)`` that returns an
            async context manager (e.g. ``aiohttp.ClientSession``).

    Returns:
        The response body decoded as text.

    Raises:
        ValueError: If the response status is anything other than 200.
    """
    # Encode twice on purpose: once for the verb inside the target URL, and
    # once for the whole target URL as the value of the proxy's ``url`` param.
    target_query = urllib.parse.urlencode({"v": word})
    url = f"https://www.wordreference.com/conj/frverbs.aspx?{target_query}"
    proxy_query = urllib.parse.urlencode({"api_key": SCRAPER_API_KEY, "url": url})
    endpoint = f"https://api.scraperapi.com?{proxy_query}"
    async with session.get(endpoint) as response:
        if response.status != 200:
            # Do not print or embed the response object: its repr contains the
            # request URL, which would leak the API key into notebook output.
            raise ValueError(
                f"Invalid HTTP status: {response.status} for verb {word!r}"
            )
        response_text = await response.text()
    return response_text


In [None]:
# Fetch one sample page ("atteindre") to develop the parsers against;
# `response_text` is read again by the parsing cell at the end of the notebook.
# Top-level `async with` / `await` relies on the notebook kernel's support for
# top-level await; this cell is not valid in a plain Python script.
async with aiohttp.ClientSession() as session:
    response_text = await get_verb_conjugation("atteindre", session)


In [None]:
def extract_base_word_forms(soup):
    """Extract the participles and the reflexive flag from a conjugation page.

    Reads the second ``<td>`` of the first ``<table id="conjtable">``, splits
    its serialized contents on ``<br/>``, and pairs the pieces positionally
    with: infinitif, participe présent, participe passé, forme pronominale.

    Args:
        soup: Parsed page exposing ``find_all`` (e.g. a ``BeautifulSoup``).

    Returns:
        A dict with the available ``"participe présent"`` and
        ``"participe passé"`` strings (whitespace-trimmed) and a boolean
        ``"has_reflexive"``, which is True only when the pronominal entry is
        non-empty after removing ``<b>`` tags, "–" and surrounding whitespace.

    Raises:
        IndexError: If the table or its second cell is not present.
    """
    word_forms_table = soup.find_all("table", id="conjtable")[0]
    forms_cell = word_forms_table.find_all("td")[1]
    # Iterating the cell yields its children; join their serialized forms.
    html_string = "".join(str(child) for child in forms_cell)
    # Strip each fragment individually so that whitespace left around a removed
    # "–" placeholder does not count as a pronominal form.
    word_forms = [
        fragment.replace("</b>", "").replace("<b>", "").replace("–", "").strip()
        for fragment in html_string.split("<br/>")
    ]
    word_form_types = [
        "infinitif",
        "participe présent",
        "participe passé",
        "forme pronominale",
    ]
    result = dict(zip(word_form_types, word_forms))
    result.pop("infinitif", None)
    # A cell with fewer than four entries has no pronominal form at all;
    # treat that as "not reflexive" instead of raising KeyError.
    result["has_reflexive"] = bool(result.pop("forme pronominale", ""))
    return result


In [None]:
def extract_conjugation_tables(soup):
    """Parse the impératif conjugation table from a conjugation page.

    Args:
        soup: Parsed page exposing ``find_all`` (e.g. a ``BeautifulSoup``).

    Returns:
        A list holding at most one ``DataFrame``: the element at position 15
        among the ``<table class="neoConj">`` matches, parsed with pandas. The
        list is empty when the page has fewer than 16 such tables.
    """
    conjugation_tables = soup.find_all("table", "neoConj")
    # Assumed positions of the tense groups among the neoConj tables
    # (TODO confirm against the live markup before returning more of them):
    #   0-3   indicatif
    #   4-7   formes composées
    #   8-10  subjonctif
    #   12-13 conditionnel
    #   15    impératif
    # Only the impératif table is returned, so only that one is parsed.
    impératif_tables = conjugation_tables[15:16]
    impératif = [
        # Wrap the markup in StringIO: passing literal HTML to read_html is
        # deprecated in recent pandas releases.
        pd.read_html(io.StringIO(str(table)), flavor="html5lib")[0]
        for table in impératif_tables
    ]
    return impératif


In [None]:
def process_conjugation_response(response_text):
    """Parse a conjugation page and run both extractors on it.

    Args:
        response_text: HTML of the page, as text.

    Returns:
        A ``(word_forms, conjugation_tables)`` tuple: the results of
        ``extract_base_word_forms`` and ``extract_conjugation_tables`` applied
        to the same parsed document.
    """
    parsed_page = BeautifulSoup(response_text, "html5lib")
    return (
        extract_base_word_forms(parsed_page),
        extract_conjugation_tables(parsed_page),
    )


In [None]:
# Parse the sample page fetched earlier and preview the first extracted table.
soup = BeautifulSoup(response_text, "html5lib")
result = extract_conjugation_tables(soup)
# Bare last expression: the notebook renders the DataFrame with its rich repr
# instead of the plain-text dump that print() produces.
result[0]