In [None]:
from config import FORVO_API_KEY
import asyncio
import aiohttp
import urllib.parse
import pandas as pd
import sys
import os

In [None]:
# Read the flashcard table; fields in this file are separated by "&".
df = pd.read_csv(
    "basic_french_flashcards.csv",
    sep="&",
)
# One dict per row, keyed by column name.
records = df.to_dict(orient="records")

In [None]:
async def get_sound_file(
    dictionary,
    session,
    api_key=FORVO_API_KEY,
):
    """Fetch the standard Forvo pronunciation of one word and save it as an mp3.

    Args:
        dictionary: Mapping with a "french_word" entry (the word to look up)
            and a "frequency_idx" entry (used to name the output file).
        session: Object whose ``get(url)`` is used as an async context manager
            for both HTTP requests (the caller passes an aiohttp session).
        api_key: Forvo API key inserted into the request URL.

    Returns:
        None when the file was written to ``mp3_files/french_audio_<idx>.mp3``;
        otherwise a ``(dictionary, error_message)`` tuple describing the
        failure, so the caller can collect and retry failed items.
    """
    word = urllib.parse.quote(dictionary["french_word"])
    freq_idx = dictionary["frequency_idx"]
    url = f"https://apifree.forvo.com/key/{api_key}/format/json/action/standard-pronunciation/word/{word}/language/fr"
    file_name = f"french_audio_{freq_idx}.mp3"
    path = "mp3_files/"
    try:
        async with session.get(url) as response:
            # Surface HTTP errors explicitly instead of failing later while
            # parsing an error body.
            response.raise_for_status()
            json_response = await response.json()
            mp3_download_url = json_response["items"][0]["pathmp3"]
        async with session.get(mp3_download_url) as mp3_response:
            # Without this check an error page could be saved as an ".mp3".
            mp3_response.raise_for_status()
            mp3_file = await mp3_response.read()
        # len() is the payload size in bytes; sys.getsizeof() would also count
        # the Python object header and so overstate the download size.
        if len(mp3_file) < 100:
            raise ValueError("No File Downloaded")
        # Make sure the target folder exists before writing into it.
        os.makedirs(path, exist_ok=True)
        with open(f"{path}{file_name}", "wb") as local_file:
            local_file.write(mp3_file)
            print(f"Downloaded {freq_idx}")
    except Exception as e:
        # Best-effort: report the failure to the caller rather than aborting
        # the whole batch.
        return (dictionary, str(e))


async def get_multiple_sound_files(match_dict, chunk_size=5, delay=5):
    """Download audio for many records in throttled, concurrent batches.

    Args:
        match_dict: Sequence of record dicts, each passed to
            ``get_sound_file``.
        chunk_size: Number of records downloaded concurrently per batch.
        delay: Minimum number of seconds each batch takes; a sleep of this
            length is gathered alongside the downloads to pace the requests.

    Returns:
        List of the non-None results of ``get_sound_file``, i.e. one
        ``(record, error_message)`` tuple per failed download.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    all_responses = []
    async with aiohttp.ClientSession() as session:
        for start in range(0, len(match_dict), chunk_size):
            batch = match_dict[start : start + chunk_size]
            # The sleep runs concurrently with the downloads, so a batch never
            # finishes sooner than `delay` seconds.
            tasks = [asyncio.sleep(delay)]
            tasks.extend(get_sound_file(item, session) for item in batch)
            responses = await asyncio.gather(*tasks)
            # Successful downloads and the sleep both yield None; keep only
            # the failure reports.
            all_responses.extend(
                response for response in responses if response is not None
            )
    return all_responses


In [None]:
# Size in bytes of the file on disk. sys.getsizeof() on the path would only
# measure the in-memory str object that holds the path text.
result = os.path.getsize('mp3_files/french_audio_1.mp3')
print(result)

In [None]:
# failed_downloads = await get_multiple_sound_files(records[3380:3500])

In [None]:
# retry = [item[0] for item in failed_downloads]
# failed_downloads2 = await get_multiple_sound_files(retry)

In [None]:
def check_for_missing_audio_filename(last_audio):
    """Check that mp3_files holds exactly french_audio_1 .. french_audio_<last_audio>.

    Only names of the form ``french_audio_<number>.mp3`` are considered; any
    other directory entry (for example ``.DS_Store``) is ignored, whether or
    not it is present.

    Args:
        last_audio: Highest expected audio index (inclusive).

    Returns:
        True when the sorted indices found on disk equal
        ``1, 2, ..., last_audio``; False otherwise.
    """
    prefix, suffix = "french_audio_", ".mp3"
    nums = []
    for name in os.listdir("mp3_files"):
        if not (name.startswith(prefix) and name.endswith(suffix)):
            continue
        stem = name[len(prefix):-len(suffix)]
        # Skip entries whose middle part is not a plain number.
        if stem.isdecimal():
            nums.append(int(stem))
    nums.sort()
    return nums == list(range(1, last_audio + 1))
# check_for_missing_audio_filename(3500)

In [None]:
def check_for_corrupt_downloads():
    """Tabulate the on-disk size of every file in the mp3_files folder.

    The ``.DS_Store`` entry is skipped if it exists. Sizes come from
    ``os.path.getsize`` (bytes on disk); ``sys.getsizeof`` on the path would
    only report the size of the path string object.

    Returns:
        A DataFrame with columns ``file_size`` (bytes) and ``file_name``, one
        row per file. The columns are present even when the folder is empty.
    """
    folder = "mp3_files"
    filesizes = [
        {
            "file_size": os.path.getsize(os.path.join(folder, name)),
            "file_name": name,
        }
        for name in os.listdir(folder)
        if name != ".DS_Store"
    ]
    # Explicit columns keep the frame's schema stable for an empty folder.
    return pd.DataFrame(filesizes, columns=["file_size", "file_name"])
    
# Build the per-file size table for the audio folder.
df = check_for_corrupt_downloads()
# Report the smallest recorded file_size value.
smallest_size = df["file_size"].min()
print(smallest_size)