In [1]:
# Load excel/csv to a dataframe and clean it. 
import pandas as pd
import os 

def pandas_pure_etl(input_path: str) -> pd.DataFrame:
    """
    Extract wind measurements from an Excel or CSV file, clean them, and return them as a DataFrame.

    The file must hold exactly four columns in this order: date-time, wind speed,
    gust speed, and wind direction. For both formats the first row is treated as a
    header row and its labels are replaced by the canonical column names below.

    Transform steps: rows containing any null value are dropped, the index is
    rebuilt from 0, and 'date_time' is converted with ``pd.to_datetime``.

    Parameters:
        input_path (str): The file system path to the data file. The file must be either an Excel (.xlsx) or CSV (.csv) format.

    Raises:
        FileNotFoundError: If the file specified by 'input_path' does not exist.
        ValueError: If the file is neither an Excel file nor a CSV file, or if it cannot be read.
            The underlying reader exception is attached as ``__cause__``.

    Returns:
        pd.DataFrame: A DataFrame containing cleaned wind data with the columns: 'date_time', 'wind_speed', 'gust_speed', and 'wind_direction'.
    """
    column_names = ["date_time", "wind_speed", "gust_speed", "wind_direction"]

    if not os.path.isfile(input_path):
        raise FileNotFoundError(f"The file does not exist: {input_path}")

    # Compare the extension case-insensitively so ".XLSX" / ".CSV" are accepted.
    file_extension = os.path.splitext(input_path)[1].lower()
    if file_extension not in (".xlsx", ".csv"):
        raise ValueError("File is not an Excel or CSV file.")

    try:
        if file_extension == ".xlsx":
            df = pd.read_excel(
                io=input_path,
                engine="openpyxl",
                names=column_names,
            )
        else:
            # read_csv takes the path as `filepath_or_buffer` (it has no `io`
            # keyword). header=0 discards the file's own header row so the CSV
            # path behaves like read_excel, whose header defaults to row 0.
            df = pd.read_csv(
                input_path,
                header=0,
                names=column_names,
            )
    except Exception as e:
        # Keep the documented ValueError contract but preserve the root cause.
        raise ValueError(f"Failed to read file: {e}") from e

    df = df.dropna().reset_index(drop=True)
    df["date_time"] = pd.to_datetime(df["date_time"])

    return df

In [2]:
# Run the ETL on the wind-energy Excel workbook and keep the cleaned frame.
# NOTE(review): this is an absolute, machine-specific path; on any other machine
# pandas_pure_etl raises FileNotFoundError here. Consider a configurable data dir.
hotdog = pandas_pure_etl("/Users/gioabeleda/Desktop/wind-energy-dashboard-streamlit/data/wind_energy.xlsx")

In [3]:
# Sanity-check the cleaned frame: row count, column dtypes, and non-null counts.
hotdog.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84241 entries, 0 to 84240
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date_time       84241 non-null  datetime64[ns]
 1   wind_speed      84241 non-null  float64       
 2   gust_speed      84241 non-null  float64       
 3   wind_direction  84241 non-null  float64       
dtypes: datetime64[ns](1), float64(3)
memory usage: 2.6 MB


In [6]:
import pandas as pd

def diurnal_variation_pandas(dataframe : pd.DataFrame, year : int = None, month : int = None, day : int = None) -> pd.DataFrame:
    """
    Compute the mean wind speed for every hour of every date in the selected period.

    The input rows are optionally filtered by year, month, and day, and then grouped by
    (year, month, day, hour). The result therefore holds one row per calendar date and
    hour present in the selection; with no filter it covers every date in the DataFrame
    (the averages are NOT pooled across dates).

    Hours are numbered 1-24: readings stamped 00:00-00:59 are reported as hour 1 and
    readings stamped 23:00-23:59 as hour 24.

    The input DataFrame is not modified.

    Parameters:
    - dataframe (pd.DataFrame): A DataFrame containing at least the following columns: 'date_time' (datetime),
      'wind_speed' (float), 'gust_speed' (float), and 'wind_direction' (float).
    - year (int, optional): The year to filter the DataFrame on. If not provided, data for all years is used.
    - month (int, optional): The month to filter the DataFrame on. If provided, 'year' must also be provided.
    - day (int, optional): The day to filter the DataFrame on. If provided, both 'year' and 'month' must also be provided.

    'year', 'month', and 'day' may be Python ints or NumPy integers (e.g. values taken
    from a pandas column); booleans are rejected.

    Returns:
    - pd.DataFrame: A DataFrame with the columns 'year', 'month', 'day', 'hour', and 'avg_wind_speed', representing the average
      wind speed for each hour of each date in the specified time period.

    Raises:
    - ValueError: If 'dataframe' is None, if 'year'/'month'/'day' is not an integer, if required columns are
      missing, if the 'date_time' column is not datetime type, if other columns are not of float type,
      if 'month' is given without 'year', or if 'day' is given without both 'year' and 'month'.

    Example usage:
    ```
    df = pd.DataFrame({
        'date_time': pd.date_range(start='1/1/2018', periods=48, freq='60min'),
        'wind_speed': np.random.rand(48) * 10,
        'gust_speed': np.random.rand(48) * 15,
        'wind_direction': np.random.rand(48) * 360,
    })

    # Get average wind speed for each hour on January 2, 2018
    result_df = diurnal_variation_pandas(dataframe=df, year=2018, month=1, day=2)
    ```
    """

    if dataframe is None:
        raise ValueError("Input dataframe cannot be None")

    for label, value in {"year" : year, "month" : month, "day" : day}.items():
        # pd.api.types.is_integer accepts Python and NumPy integers but not bool,
        # so np.int64(2018) works while year=True no longer silently means year 1.
        if value is not None and not pd.api.types.is_integer(value):
            raise ValueError(f"{label} must be an integer.")

    required_columns = ["date_time","wind_speed","gust_speed","wind_direction"]

    if not all(col in dataframe.columns for col in required_columns):
        raise ValueError("Dataframe does not have required columns")

    for col in dataframe.columns:
        if col == "date_time" and not pd.api.types.is_datetime64_any_dtype(dataframe[col]):
            raise ValueError(f"{col} column must be datetime type")
        elif col in ["wind_speed","gust_speed","wind_direction"] and not pd.api.types.is_float_dtype(dataframe[col]):
            raise ValueError(f"{col} column must be float type")

    if month is not None and year is None:
        raise ValueError("Year must be provided if month is specified")

    if day is not None and (year is None or month is None):
        raise ValueError("Year and month must be provided if day is specified")

    # Only wind_speed is aggregated, so copy just that column instead of the whole
    # frame; the caller's DataFrame is left untouched.
    timestamps = dataframe["date_time"]
    df = dataframe[["wind_speed"]].copy()

    df["year"] = timestamps.dt.year
    df["month"] = timestamps.dt.month
    df["day"] = timestamps.dt.day
    # Shift 0-23 to 1-24 so each reading is labelled with its ordinal hour of the day.
    df["hour"] = timestamps.dt.hour + 1

    if year is not None:
        df = df[df["year"] == year]
    if month is not None:
        df = df[df["month"] == month]
    if day is not None:
        df = df[df["day"] == day]

    result = df.groupby(["year", "month", "day", "hour"]).agg(avg_wind_speed=("wind_speed","mean")).reset_index()

    return result

In [7]:
# Hourly mean wind speed with no year/month/day filter applied.
diur = diurnal_variation_pandas(hotdog)

# Inspect the aggregated result: row count, column dtypes, and non-null counts.
diur.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7022 entries, 0 to 7021
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   year            7022 non-null   int32  
 1   month           7022 non-null   int32  
 2   day             7022 non-null   int32  
 3   hour            7022 non-null   int32  
 4   avg_wind_speed  7022 non-null   float64
dtypes: float64(1), int32(4)
memory usage: 164.7 KB


In [None]:
def frequency_distribution_pandas(dataframe: pd.DataFrame, year : int = None, month : int = None) -> pd.DataFrame:
    """
    Validate a wind DataFrame and derive 'year' and 'month' columns on a copy of it.

    NOTE(review): the body visible here stops after adding the 'year' and 'month'
    columns. No filtering, binning, or return statement is shown, so as written the
    function returns None despite the pd.DataFrame annotation. Presumably the
    frequency-distribution computation is still to be written - confirm.

    Parameters:
    - dataframe (pd.DataFrame): Must contain the columns 'date_time' (datetime),
      'wind_speed' (float), 'gust_speed' (float), and 'wind_direction' (float).
    - year (int, optional): Validated as an integer; not yet used for filtering in the visible code.
    - month (int, optional): Validated as an integer; if provided, 'year' must also be provided.

    Raises:
    - ValueError: If 'dataframe' is None, if 'year' or 'month' is not an int, if required
      columns are missing, if 'date_time' is not datetime type, if the other required
      columns are not float type, or if 'month' is given without 'year'.
    """
    
    if dataframe is None:
        raise ValueError("Input dataframe cannot be None")
    
    # Reject non-integer filter values up front.
    for label, value in {"year" : year, "month" : month}.items():
        if value is not None and not isinstance(value,int):
            raise ValueError(f"{label} must be an integer.")
        
    required_columns = ["date_time","wind_speed","gust_speed","wind_direction"]
    
    if not all(col in dataframe.columns for col in required_columns):
        raise ValueError("Dataframe does not have required columns")
    
    # Check dtypes of the required columns; any other columns are ignored.
    for col in dataframe.columns:
        if col == "date_time" and not pd.api.types.is_datetime64_any_dtype(dataframe[col]):
            raise ValueError(f"{col} column must be datetime type")
        elif col in ["wind_speed","gust_speed","wind_direction"] and not pd.api.types.is_float_dtype(dataframe[col]):
            raise ValueError(f"{col} column must be float type")
    
    if month is not None and year is None:
        raise ValueError("Year must be provided if month is specified")
    
    # Work on a copy so the caller's DataFrame is not modified.
    df = dataframe.copy()
    
    df["year"] = df["date_time"].dt.year
    df["month"] = df["date_time"].dt.month
    
    
    
    
    
    
    