In [1]:
# import needed libraries
import gzip  # unzip retrived files
import io
import json  # parse the user-agent list without executing it
import logging # log files
import os
import random  # to generate random user-agent
from datetime import datetime  # to get actual year and month
from typing import Optional

import pandas as pd
from dotenv import load_dotenv  # variables in .env
from lxml import etree # check README.md to solve install problem on Linux/macOS
from requests import get  # to retrieve files from web

# init log format
# Configure the root logger once for the whole notebook: records of level INFO
# and above are appended to "logging_omm.log" (UTF-8). Fields are separated by
# " , " so every record reads as: date , time , level , message.
# NOTE: the `encoding` argument of basicConfig requires Python 3.9 or newer.
logging.basicConfig(
    format="%(asctime)s , %(levelname)-8s , %(message)s",
    filename="logging_omm.log",
    datefmt="%Y-%m-%d , %H:%M:%S",
    encoding="utf-8",
    level=logging.INFO,
)

In [2]:
def load_stored_variables_in_env_file(VARENV:str):
    """Look up one environment variable after loading the .env file.

    Arguments:
        VARENV -- name of the environment variable to read

    Returns:
        the value stored for VARENV, or None when the variable is not set
    """
    # (re)load the .env content into the process environment on every call
    load_dotenv()
    stored_value = os.environ.get(VARENV)
    return stored_value

In [3]:
def check_and_output_a_two_digit_month(month_value):
    """Left-pad a single-character month with a zero.

    Arguments:
        month_value -- current month value (int or str)

    Returns:
        the string "0<month_value>" when month_value is one character long,
        otherwise month_value itself, unchanged (same type as received)
    """
    is_single_character = len(str(month_value)) == 1
    return f"0{month_value}" if is_single_character else month_value

In [4]:
def create_omm_file_url(url_to_complete,year_to_add,month_to_add):
    """Build the download URL of one monthly archive.

    The result follows the pattern
    https://donneespubliques.meteofrance.fr/donnees_libres/Txt/Synop/Archive/synop.<year><month>.csv.gz

    Arguments :
        url_to_complete -- base url, the archive name is appended to it as is
        year_to_add -- targeted year
        month_to_add -- targeted month (padded to two digits when needed)

    Return :
        completed_url -- recomposed url
    """
    padded_month = check_and_output_a_two_digit_month(month_to_add)
    archive_name = f"synop.{year_to_add}{padded_month}.csv.gz"
    return f"{url_to_complete}{archive_name}"

def create_omm_filename(url_to_complete,year_to_add,month_to_add):
    """Build the name of the decompressed monthly file: synop.<year><month>.csv

    Arguments :
        url_to_complete -- not used by this function
        year_to_add -- targeted year
        month_to_add -- targeted month (padded to two digits when needed)

    Return :
        completed_filename -- recomposed filename
    """
    padded_month = check_and_output_a_two_digit_month(month_to_add)
    return f"synop.{year_to_add}{padded_month}.csv"

In [5]:
def get_file(url_to_get,recomposed_filename,timeout=60):
    """retrieve a specific file from an url

    Arguments :
        url_to_get -- path to the file
        recomposed_filename -- targeted file, only used in the error log
        timeout -- seconds to wait for the server (connection and each chunk
            of data) before the request is aborted; it is not a limit on the
            total download time

    Return :
        website_response.content -- raw body of the response as bytes. The
        body is returned whatever the status code is: a failed request is
        only reported in the log.
    """
    # Without a timeout a stalled server would block the whole run forever.
    website_response = get(url_to_get, allow_redirects=True, timeout=timeout)
    if website_response.status_code == 200:
        logging.info(f"Successful Request with status code:, {website_response.status_code}")
    else:
        logging.error(f"Request failed for file {recomposed_filename} with status code:, {website_response.status_code}")
    return website_response.content

def record_file(recomposed_filename,wished_extension,website_response):
    """write downloaded content to data/<recomposed_filename>.<wished_extension>

    Arguments :
        recomposed_filename -- name of the file, without the final extension
        wished_extension -- extension appended after a dot (e.g. "gz")
        website_response -- bytes to write

    The destination directory is created when missing and an existing file
    is overwritten.
    """
    target_path = f"data/{recomposed_filename}.{wished_extension}"
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    # the context manager closes the file even if the write fails
    with open(target_path, 'wb') as target_file:
        target_file.write(website_response)

def gunzip_file(__file__):
    """Decompress data/<__file__>.gz into data/<__file__>, then delete the archive.

    Arguments :
        __file__ -- name of the decompressed file (the archive is expected
            under the same name with a ".gz" suffix). The parameter shadows
            the usual module-level __file__ name inside this function only.

    The whole archive is read in memory, decoded as UTF-8 and written back
    as UTF-8 text.
    """
    archive_path = f"data/{__file__}.gz"
    extracted_path = f"data/{__file__}"
    with open(archive_path, 'rb') as archive:
        with open(extracted_path, 'w', encoding='utf8') as extracted:
            decoded_text = gzip.decompress(archive.read()).decode('utf-8')
            extracted.write(decoded_text)
    # the archive is only removed once the decompressed copy has been written
    os.remove(archive_path)

In [6]:
def convert_csv_as_dataframe(__file__):
    """Load data/<__file__> into a DataFrame.

    Arguments :
        __file__ -- name of a semicolon-separated file stored in "data/",
            whose first row holds the column names

    Return :
        the content of the file as a pandas DataFrame
    """
    csv_path = f"data/{__file__}"
    return pd.read_csv(csv_path, sep=";", header=0)

In [7]:
def clean_data(__file__):
    """Load one monthly file and keep only the columns used downstream.

    Arguments :
        __file__ -- name of the csv file stored in "data/"

    Return :
        a new DataFrame with the columns ID (renamed from numer_sta), t, td,
        u, date (calendar date) and heure (time formatted as HH:MM)
    """
    raw_frame = convert_csv_as_dataframe(__file__)
    # work on a copy of the selected columns so pandas does not warn on assignment
    selected = raw_frame[["numer_sta","t","td","u","date"]].copy()
    # the source 'date' column holds a full timestamp written as YYYYmmddHHMMSS
    timestamps = pd.to_datetime(selected["date"], format="%Y%m%d%H%M%S")
    # split the timestamp into a calendar date and an hour label
    selected["date"] = timestamps.dt.date
    selected["heure"] = timestamps.dt.strftime('%H:%M')
    return selected.rename(columns={"numer_sta": "ID"})

In [8]:
def generate_random_user_agent_list(timeout=30) -> list:
    """download the list of the most common desktop user-agents

    The list is read from the JSON text area published on useragents.me.

    Parameters :
        timeout : seconds to wait for the website before the request is aborted
    Returns:
        user_agents : list of the parsed entries (presumably dicts holding a
        "ua" key -- verify against the website). An empty list is returned
        when the page cannot be fetched, the text area cannot be found or
        its content cannot be parsed.
    """
    url_user_agent = (
        "https://www.useragents.me/#most-common-desktop-useragents-json-csv"
    )

    # Fetch the website content; without a timeout the call could block forever
    response = get(url_user_agent, timeout=timeout)
    if response.status_code != 200:
        logging.error("Failed to fetch user agent list")
        return []

    # Parse the HTML content
    html = etree.HTML(response.text)

    # Extract user agent strings using XPath
    textarea_content = [] if html is None else html.xpath(
        '//*[@id="most-common-desktop-useragents-json-csv"]/div[1]/textarea/text()'
    )
    if not textarea_content:
        logging.error("Failed to extract user agent list")
        return []

    # Parse the JSON data.
    # SECURITY: this text comes from a third-party website, so it is decoded
    # with json.loads and never evaluated as Python code.
    try:
        user_agents = json.loads(textarea_content[0])
    except ValueError:
        logging.warning("Failed to parse user agent list")
        return []

    # anything other than a JSON array is unusable for the caller
    if not isinstance(user_agents, list):
        logging.warning("Failed to parse user agent list")
        return []

    return user_agents

In [9]:
def random_user_agent() -> dict:
    """select a random user-agent from the list

    Returns:
        headers : dict of the form {"user-agent": "<value>"}, ready to be
        passed as request headers. An empty dict is returned (and a warning
        is logged) when no usable user-agent is available, so the caller
        falls back to its default headers instead of crashing.
    """
    candidates = generate_random_user_agent_list() or []
    # keep only well-formed entries: dicts carrying a non-empty "ua" value
    usable_values = [
        entry["ua"]
        for entry in candidates
        if isinstance(entry, dict) and entry.get("ua")
    ]
    if not usable_values:
        logging.warning("No user agent available, using the default request headers")
        return {}
    return {"user-agent": random.choice(usable_values)}

def retrieve_data_txt(url_station,file_station,timeout=60) -> Optional[bytes]:
    """Download the station file located at <url_station><file_station>

    The request is sent with a randomly selected user-agent header.

    Parameters:
    url_station: base URL of the station file
    file_station: name of the file, appended to url_station as is
    timeout: seconds to wait for the server before the request is aborted

    Returns:
    data_txt: raw content of the response as bytes when the status code is
    200, None otherwise (the failure is written to the log)
    """
    user_agent = random_user_agent()
    website_response = get(
        f"{url_station}{file_station}", headers=user_agent, timeout=timeout
    )
    if website_response.status_code != 200:
        # the status code is part of the message itself: passing it as an
        # extra positional argument makes logging fail to format the record
        logging.error(
            f"Request failed with status code: {website_response.status_code}"
        )
        return None
    return website_response.content

In [10]:
def make_the_job(target_url,current_loop_year,current_loop_month,url_station,file_station):
    """Download, clean and store the data of one month.

    Steps: download the monthly archive into "data/", decompress it, keep
    the useful columns, download the station list, merge both tables on the
    "ID" column and write the result to data/omm/<monthly file name>.

    Arguments :
        target_url -- base url of the monthly archives
        current_loop_year -- year to process
        current_loop_month -- month to process
        url_station -- base url of the station file
        file_station -- name of the station file

    Nothing is returned: the outcome of the final write is reported in the log.
    """
    period_label = f"{current_loop_year}  {check_and_output_a_two_digit_month(current_loop_month)}"
    archive_url = create_omm_file_url(target_url,current_loop_year,current_loop_month)
    monthly_filename = create_omm_filename(target_url,current_loop_year,current_loop_month)
    record_file(monthly_filename,"gz",get_file(archive_url,monthly_filename))
    gunzip_file(monthly_filename)
    omm_data_loop = clean_data(monthly_filename)
    station_content = retrieve_data_txt(url_station,file_station)
    if not station_content:
        # no station list (empty or missing answer): nothing can be merged
        logging.error(f"Failed to record data : {period_label}")
        return
    liste_of_station = pd.read_csv(io.StringIO(station_content.decode('utf-8')),sep=";")
    final = pd.merge(omm_data_loop, liste_of_station, on="ID")
    try:
        # to_csv does not create missing directories
        os.makedirs("data/omm", exist_ok=True)
        final.to_csv(f"data/omm/{monthly_filename}",index = False)
    except Exception:
        # best effort: a failed write must not stop the processing of other
        # months, but KeyboardInterrupt / SystemExit are no longer swallowed
        logging.error(f"Failed to record data : {period_label}")
    else:
        logging.info(f"Successfully record data : {period_label}")

In [11]:
def main():
    """Process every month from the configured start up to the current month.

    Settings are read from the environment / .env file: STATION_URL,
    STATION_FILE, YEAR_LOOP_START, MONTH_LOOP_START and URL.
    The first year starts at MONTH_LOOP_START, the following years start in
    January, and the current year stops at the current month.

    Raises:
        TypeError -- when YEAR_LOOP_START or MONTH_LOOP_START is not defined
        ValueError -- when one of them is not an integer
    """
    # get the current date to retrieve from the initial file to the latest one
    current_datetime = datetime.now()
    year_loop_end = current_datetime.year
    month_loop_end = current_datetime.month
    # location of the stations dictionary
    url_station = load_stored_variables_in_env_file("STATION_URL")
    file_station = load_stored_variables_in_env_file("STATION_FILE")
    # load variables from .env
    year_loop_start = int(load_stored_variables_in_env_file("YEAR_LOOP_START"))
    month_loop_start = int(load_stored_variables_in_env_file("MONTH_LOOP_START"))
    target_url = load_stored_variables_in_env_file("URL")

    for current_loop_year in range(year_loop_start,year_loop_end+1):
        # only the first year is truncated at its beginning, and only the
        # last (current) year is truncated at its end
        first_month = month_loop_start if current_loop_year == year_loop_start else 1
        last_month = month_loop_end if current_loop_year == year_loop_end else 12
        for current_loop_month in range(first_month,last_month+1):
            make_the_job(target_url,current_loop_year,current_loop_month,url_station,file_station)
    logging.info(f"Task ended from {year_loop_start}{month_loop_start}")

In [12]:
# Entry point of the notebook: executing this cell runs main().
main()