In [None]:
"""
__author__ = "Nate Cutler"
__maintainer__ = "Nate Cutler"
__email__ = "ncutler211@gmail.com"
"""

In [66]:
import pandas as pd
import os
import numpy as np
import plotly.express as px
import dask.dataframe as dd

In [105]:
def germ_read_data(directory):
    """
    Reads data from .dat files in a specified directory into a Pandas DataFrame and combines them.

    Parameters:
        directory (str): The path to the directory containing the .dat files.

    Returns:
        pd.DataFrame: A combined DataFrame with data from all .dat files in the directory.

    Example:
        directory = "F:/UofA/ISTA_498_Capstone/Data ETL/Germany/Extract/data/Uncompressed Original"
        combined_data = read_data_into_dataframe(directory)
    """
    datafiles = []

    # Iterate over each .dat file in the specified directory
    for filename in os.listdir(directory):
        if filename.endswith(".dat"):
            file_path = os.path.join(directory, filename)

            # Read the .dat file into a DataFrame, specifying delimiter and encoding
            df = pd.read_csv(file_path, delimiter='|', encoding='latin1')

            # Rename the DataFrame columns to standard names
            new_column_names = ["Postal code", "city", "start_date", "reading"]
            df.rename(columns=dict(zip(df.columns, new_column_names)), inplace=True)

            # Append the DataFrame to the list of datafiles
            datafiles.append(df)

    # Combine all DataFrames into a single DataFrame
    combined_data = pd.concat(datafiles, ignore_index=True)

    return combined_data

def germ_transformations(combined_data):
    """
    Perform data transformations on Germany data.

    This function applies several transformations to the input DataFrame to prepare it for further analysis or storage.

    Parameters:
        combined_data (pd.DataFrame): The input DataFrame to be transformed.

    Returns:
        pd.DataFrame: The transformed DataFrame.
    """
    combined_data = combined_data.copy()
    combined_data["unit"] = 'μSv/h'
    combined_data = combined_data.drop("Postal code", axis=1)
    combined_data['start_date'] = pd.to_datetime(combined_data['start_date'])
    combined_data['end_date'] = combined_data['start_date']
    combined_data['cid'] = '04'
    combined_data['state'] = ""
    combined_data['lat'] = np.nan
    combined_data['long'] = np.nan
    combined_data['comment'] = ""

    # Replace zero entries with the average of the entry before and after
    for i in range(1, len(combined_data) - 1):
        if combined_data['reading'].iloc[i] == 0.0:
            # Replace 0.0 with the average of the entry before and after
            combined_data.at[i, 'reading'] = (combined_data['reading'].iloc[i - 1] + combined_data['reading'].iloc[i + 1]) / 2.0

    # Handle the first and last entries if they are 0.0
    if combined_data['reading'].iloc[0] == 0.0:
        combined_data.at[0, 'reading'] = combined_data['reading'].iloc[1]

    if combined_data['reading'].iloc[-1] == 0.0:
        combined_data.at[len(combined_data) - 1, 'reading'] = combined_data['reading'].iloc[-2]

    # Reorganize columns to a specific order
    combined_data = combined_data[["start_date", "end_date", "reading", "unit", "city", "state", "cid", "lat", "long", "comment"]]
    return combined_data


def germ_plot_transform(df):
    # Convert Pandas DataFrame to Dask DataFrame
    ddf = dd.from_pandas(df, npartitions=1)

    # Group by 'city' and resample 'end_date' to monthly frequency, calculate mean
    ddf = ddf.groupby('city').apply(lambda x: x.set_index('end_date').resample('M').mean()).reset_index()

    # Group by 'end_date' and calculate the overall mean
    ddf = ddf.groupby('end_date')['reading'].mean().reset_index()

    # Compute the result as a Pandas DataFrame
    df_result = ddf.compute()

    return df_result

def germ_weekly(df):
    df = df.groupby('city').resample('W-Mon', on='end_date')['reading'].mean().reset_index()
    return df

def germ_monthly(df):
    df = df.groupby('city').resample('M', on='end_date', closed='right')['reading'].mean().reset_index()
    return df

In [172]:
def japan_transformations(dir):
    """
    Perform data transformations specific to Japan data.

    Parameters:
     combined_data (DataFrame): Input DataFrame containing the data.

    Returns:
     DataFrame: Transformed DataFrame
    """
    combined_data = pd.read_csv(dir)
    #Convert to datetime
    combined_data['START_DATE'] = pd.to_datetime(combined_data['START_DATE'], format='%Y-%m-%d')
    combined_data['END_DATE'] = pd.to_datetime(combined_data['END_DATE'], format='%Y-%m-%d')
    #Lowercase all column headers
    combined_data.columns = combined_data.columns.str.lower()
    #Fill in CID
    combined_data['cid'] = '03'
    #Drop country
    combined_data = combined_data.drop("country", axis=1)
    #Change prefecture to state
    combined_data.rename(columns={'prefecture': 'state'}, inplace=True)

    return combined_data

def japan_to_country(df):
    # Convert Pandas DataFrame to Dask DataFrame
    ddf = dd.from_pandas(df, npartitions=1)

    # Group by 'city' and resample 'end_date' to monthly frequency, calculate mean
    ddf = ddf.groupby('city').apply(lambda x: x.set_index('end_date').resample('M').mean()).reset_index()

    # Group by 'end_date' and calculate the overall mean
    ddf = ddf.groupby('end_date')['reading'].mean().reset_index()

    # Compute the result as a Pandas DataFrame
    df_result = ddf.compute()

    df_result = df_result.sort_values(by='end_date')

    return df_result


In [106]:
#read germany data in
dir = r"F:\UofA\ISTA_498_Capstone\Data ETL\Germany\Extract\data\Uncompressed Original"
germ_df = germ_read_data(dir)
#transform germany data
germ_comb = germ_transformations(germ_df)
germ_country_mean = germ_plot_transform(germ_comb)

In [173]:
#read japan data in
dir = r"F:\UofA\ISTA_498_Capstone\Data ETL\Japan\Combined_Japan_Data.csv"
japan_df = japan_transformations(dir)
japan_df = japan_to_country(japan_df)


`meta` is not specified, inferred from partial data. Please provide `meta` if the result is unexpected.
  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result



In [176]:
# Visualization for Average monthly reading in μSv/h for Japan
japan_df = japan_df.sort_values(by='end_date')
fig = px.line(
    japan_df,
    x='end_date',
    y='reading',
    markers=True,
    title='Average Monthly MicroSieverts Per Hour for Japan',
)
fig.update_layout(
    xaxis_title='Year',
    yaxis_title='Reading in μSv/h',
    title_font=dict(family='Arial', size=20),
    title_x=0.5,
    xaxis=dict(dtick='M12'), 
    plot_bgcolor='#9ecae1'
)
fig.show()

In [153]:
# Visualization for Average monthly reading in μSv/h for Germany
fig = px.line(
    germ_country_mean,
    x='end_date',
    y='reading',
    markers=True,
    title='Average Monthly MicroSieverts Per Hour for Germany',
)
fig.update_layout(
    xaxis_title='Year',
    yaxis_title='Reading in μSv/h',
    title_font=dict(family='Arial', size=20),
    title_x=0.5,
    xaxis=dict(dtick='M12'), 
    plot_bgcolor='#9ecae1'
)
fig.show()