In [2]:
# Version 2.5
import pandas as pd
import os 

def pandas_pure_etl(input_path: str) -> pd.DataFrame:
    """
    Load wind measurements from an Excel or CSV file into a cleaned DataFrame.

    Extract: the file at 'input_path' is read with pandas. Its columns are
    taken positionally and renamed, in order, to 'date_time', 'wind_speed',
    'gust_speed' and 'wind_direction'. In both formats the first row of the
    file is treated as a header row and is replaced by these names.

    Transform: rows containing any null value are dropped, the index is
    reset, and 'date_time' is converted to a datetime dtype.

    Load: the cleaned data is returned as a pandas DataFrame.

    Parameters:
        input_path (str): The file system path to the data file. The file must
            be either an Excel (.xlsx) or CSV (.csv) file; the extension check
            is case-insensitive.

    Raises:
        FileNotFoundError: If the file specified by 'input_path' does not exist.
        ValueError: If the file is neither an Excel file nor a CSV file, or if
            it cannot be read. The underlying reader error is attached as the
            exception's cause.

    Returns:
        pd.DataFrame: A DataFrame containing cleaned wind data with the
            columns: 'date_time', 'wind_speed', 'gust_speed', and
            'wind_direction'.
    """
    column_names = ["date_time", "wind_speed", "gust_speed", "wind_direction"]

    if not os.path.isfile(input_path):
        raise FileNotFoundError(f"The file does not exist: {input_path}")

    file_extension = os.path.splitext(input_path)[1].lower()
    if file_extension not in [".xlsx", ".csv"]:
        raise ValueError("File is not an Excel or CSV file.")

    try:
        if file_extension == ".xlsx":
            df = pd.read_excel(
                io=input_path,
                engine="openpyxl",
                names=column_names,
            )
        else:
            # read_csv has no 'io' keyword (that is read_excel's name for the
            # path argument), so the path is passed positionally.
            # header=0 makes the CSV branch replace the file's header row, the
            # same way read_excel does by default; without it, read_csv would
            # keep the header row as a data row once 'names' is supplied.
            df = pd.read_csv(
                input_path,
                header=0,
                names=column_names,
            )
    except Exception as e:
        # Any reader failure is reported as ValueError (documented contract);
        # chaining keeps the original traceback available for debugging.
        raise ValueError(f"Failed to read file: {e}") from e

    df = df.dropna().reset_index(drop=True)
    # Conversion happens outside the try block, so errors raised here for
    # unparseable timestamps propagate unchanged.
    df["date_time"] = pd.to_datetime(df["date_time"])

    return df

In [19]:
def diurnal_variation(dataframe: pd.DataFrame, year: int = None, month: int = None, day: int = None) -> pd.DataFrame:
    """
    Calculate the hourly average wind speed, optionally restricted to a year, month, and day.

    This function takes a DataFrame containing wind measurements and computes the average wind speed for each
    hour of each calendar date. The user can specify a particular year, month, and day to filter the data first.
    Results are always grouped by ('year', 'month', 'day', 'hour'), so when no filter is given the output contains
    one row per hour for every date present in the DataFrame (hours are not pooled across dates).

    The 'hour' column is the timestamp's hour plus one, i.e. it runs from 1 to 24 rather than 0 to 23.

    The input DataFrame is not modified; the helper columns are added to a copy.

    Parameters:
    - dataframe (pd.DataFrame): A DataFrame containing at least the following columns: 'date_time' (datetime), 'wind_speed' (numeric),
      'gust_speed' and 'wind_direction'. Only 'date_time' and 'wind_speed' are used in the calculation, but all four
      columns must be present.
    - year (int, optional): The year to filter the DataFrame on. If not provided, data for all years is used.
    - month (int, optional): The month to filter the DataFrame on. If provided, 'year' must also be provided.
    - day (int, optional): The day to filter the DataFrame on. If provided, both 'year' and 'month' must also be provided.

    Returns:
    - pd.DataFrame: A DataFrame with the columns 'year', 'month', 'day', 'hour', and 'avg_wind_speed', representing the average
      wind speed for each hour of each date in the specified time period. It is empty when no rows match the filters.

    Raises:
    - ValueError: If 'dataframe' is None, if required columns are missing from the DataFrame, or if 'year', 'month', and 'day'
      are not provided in a logically consistent way.

    Example usage:
    ```
    df = pd.DataFrame({
        'date_time': pd.date_range(start='2018-01-01', periods=48, freq=pd.Timedelta(hours=1)),
        'wind_speed': [float(i % 10) for i in range(48)],
        'gust_speed': [float(i % 15) for i in range(48)],
        'wind_direction': ['N', 'S', 'E', 'W'] * 12
    })

    # Get average wind speed for each hour on January 1, 2018
    result_df = diurnal_variation(dataframe=df, year=2018, month=1, day=1)
    ```
    """
    if dataframe is None:
        raise ValueError("No dataframe given")

    required_columns = ["date_time", "wind_speed", "gust_speed", "wind_direction"]

    if not all(col in dataframe.columns for col in required_columns):
        raise ValueError("Invalid columns")

    if month is not None and year is None:
        raise ValueError("Year must not be none if month is not none")

    if day is not None and (year is None or month is None):
        raise ValueError("Year and month must not be none if day is not none")

    timestamps = dataframe["date_time"]
    # assign() returns a new DataFrame, so the caller's frame is left untouched
    # (the previous in-place column writes leaked helper columns to the caller).
    enriched = dataframe.assign(
        year=timestamps.dt.year,
        month=timestamps.dt.month,
        day=timestamps.dt.day,
        hour=timestamps.dt.hour + 1,  # reported as 1-24, not 0-23
    )

    # Apply only the filters the caller actually supplied.
    for column, wanted in (("year", year), ("month", month), ("day", day)):
        if wanted is not None:
            enriched = enriched[enriched[column] == wanted]

    result = (
        enriched.groupby(["year", "month", "day", "hour"])
        .agg(avg_wind_speed=("wind_speed", "mean"))
        .reset_index()
    )

    return result