# Prepare Weather Data
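The ODE model reads its weather input from a CSV file of daily averages with the columns (date, temperature, humidity). Note that `combine_variables` below averages bi-weekly by default; pass a different `average` value (e.g. `average='daily'`) to produce daily averages instead.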

## Combine data

In [1]:
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 26 11:35:18 2024

@author: Tanaka Akiyama
"""

import netCDF4 as nc
import numpy as np
from datetime import datetime

# Function to calculate daily averages
def calculate_daily_average(data):
    """Average a series of samples over consecutive 24-sample days.

    The input is flattened and split into rows of 24 samples (one sample per
    hour is assumed), and each row is averaged while ignoring NaN entries.

    Trailing samples that do not fill a complete day are dropped, mirroring
    how ``calculate_biweekly_averages`` discards an incomplete final window.
    Previously such input raised ``ValueError`` from ``reshape``.

    Parameters
    ----------
    data : numpy.ndarray
        Sample values; any shape is accepted and is flattened first.

    Returns
    -------
    numpy.ndarray
        One NaN-ignoring mean per complete day. Empty when fewer than 24
        samples are supplied. A day consisting only of NaN yields NaN (NumPy
        emits a RuntimeWarning in that case).
    """
    # Number of hours in a day
    hours_per_day = 24

    # Flatten first so the number of complete days can be counted reliably.
    flat = data.reshape(-1)
    num_days = flat.shape[0] // hours_per_day

    # Keep only whole days; an explicit shape also makes empty input valid.
    daily_rows = flat[:num_days * hours_per_day].reshape(num_days, hours_per_day)

    # NaN-aware mean so individual missing hours do not void the whole day.
    return np.nanmean(daily_rows, axis=1)

# Function to calculate bi-weekly averages
def calculate_biweekly_averages(data):
    """Average a series of samples over consecutive two-week windows.

    A window is ``24 * 14`` samples (one sample per hour is assumed).
    Trailing samples that do not fill a complete window are discarded.

    NaN entries are ignored when averaging, consistent with
    ``calculate_daily_average``; previously a single NaN made the whole
    two-week average NaN.

    Parameters
    ----------
    data : numpy.ndarray
        One-dimensional array of sample values.

    Returns
    -------
    numpy.ndarray
        One NaN-ignoring mean per complete two-week window. A window made up
        entirely of NaN yields NaN (NumPy emits a RuntimeWarning then).
    """
    # Number of hours in two weeks
    hours_per_biweek = 24 * 14

    # Determine the number of complete bi-weekly segments
    num_biweeks = len(data) // hours_per_biweek

    # Drop the incomplete tail, then lay the data out one window per row.
    windows = data[:num_biweeks * hours_per_biweek].reshape(-1, hours_per_biweek)

    # NaN-aware mean so missing hours do not void the whole window.
    return np.nanmean(windows, axis=1)

# Load NetCDF file
def load_netcdf(file_path):
    """Open the NetCDF file at ``file_path`` and return the dataset object.

    The dataset is returned open; closing it is left to the caller.
    """
    return nc.Dataset(file_path)

    
def combine_variables(file_paths, average='biweekly',
                      time_units="hours since 1900-01-01 00:00:00.0",
                      calendar="gregorian"):
    """Combine averaged temperature and humidity from several NetCDF files.

    Each file is opened, its ``'t'``, ``'r'`` and ``'time'`` variables are
    read, the sentinel value ``-32767`` is replaced by NaN, and the series
    are averaged per file before being concatenated in the order given.

    Parameters
    ----------
    file_paths : iterable of str
        Paths of the NetCDF files to read, in the desired output order.
    average : str, optional
        ``'biweekly'`` (default) averages over two-week windows; any other
        value selects daily averaging.
    time_units : str, optional
        Units string used to decode the numeric time values. Defaults to the
        value that was previously hard-coded.
    calendar : str, optional
        Calendar used to decode the numeric time values.

    Returns
    -------
    tuple
        ``(all_temperature, all_humidity, all_dates)``: two NumPy arrays of
        averaged values and a list of ``datetime`` objects truncated to the
        date (midnight), one entry per averaging window.

    Raises
    ------
    ValueError
        If ``file_paths`` is empty (``np.concatenate`` needs at least one
        array).

    Notes
    -----
    Temperature is returned exactly as read from the file; the conversion to
    Celsius is disabled. NOTE(review): the values are presumably in Kelvin -
    confirm against the data files.
    """
    # Sentinel that marks missing samples in the raw data.
    missing_value = -32767

    # Pick the averaging routine once instead of branching for every file.
    if average == 'biweekly':
        averager = calculate_biweekly_averages
    else:
        averager = calculate_daily_average

    temps = []
    humidities = []
    times = []

    for file_path in file_paths:
        dataset = load_netcdf(file_path)
        try:
            # Assuming only one latitude and longitude point per file.
            temperature = dataset.variables['t'][:, 0, 0]
            humidity = dataset.variables['r'][:, 0, 0]
            time_values = dataset.variables['time'][:]

            # No manual scale_factor/add_offset handling here: the values
            # appear to arrive already scaled (netCDF4 auto-scales on read
            # by default).

            # Set missing values to NaN
            temperature[temperature == missing_value] = np.nan
            humidity[humidity == missing_value] = np.nan

            temps.append(averager(temperature))
            humidities.append(averager(humidity))
            # Averaging the timestamps yields the midpoint of each window.
            times.append(averager(time_values))
        finally:
            # Release the file handle even if reading or averaging failed.
            dataset.close()

    all_temperature = np.concatenate(temps)
    all_humidity = np.concatenate(humidities)
    all_dates = np.concatenate(times)

    # Convert numeric time values to datetime objects
    all_dates = nc.num2date(all_dates, units=time_units, calendar=calendar)

    # Extract only the date portion from datetime objects
    all_dates = [datetime(d.year, d.month, d.day) for d in all_dates]

    return all_temperature, all_humidity, all_dates
    

# Raw NetCDF inputs, listed in chronological order (files named for 2013-2022).
file_paths = [
    '../../data/raw/weather/temp_relhum_2013.nc',
    '../../data/raw/weather/temp_relhum_2014.nc',
    '../../data/raw/weather/temp_relhum_2015.nc',
    '../../data/raw/weather/temp_relhum_2016.nc',
    '../../data/raw/weather/temp_relhum_2017.nc',
    '../../data/raw/weather/2018.nc',
    '../../data/raw/weather/temp_relhum_2019.nc',
    '../../data/raw/weather/2020.nc',
    '../../data/raw/weather/2021.nc',
    '../../data/raw/weather/2022.nc',
]

# Uses the default averaging mode ('biweekly') and unpacks the three series.
temps, humidities, dates = combine_variables(file_paths)

## Save data to csv files

In [2]:
import pandas as pd
import os

# Main function
def save_to_csv(temps, humidities, dates, train_percent=0.93):
    """Write the combined weather series to ``train_weather_data.csv``.

    The three sequences become the ``temperature``, ``humidity`` and ``date``
    columns of a DataFrame that is saved (without the index) under
    ``../../data/processed``; the folder is created if it does not exist.

    Parameters
    ----------
    temps : sequence
        Temperature values, one per row.
    humidities : sequence
        Humidity values, one per row.
    dates : sequence
        Date values, one per row.
    train_percent : float, optional
        Kept for backward compatibility. The train/test split is currently
        disabled, so this value has no effect and the complete series is
        written to the "train" file.

    Raises
    ------
    ValueError
        If the three sequences differ in length (raised by pandas).
    """
    # The NetCDF files are no longer reopened here: the handles were never
    # used or closed, and they tied this function to the global `file_paths`.

    # Create DataFrame with date, temperature and humidity columns
    df = pd.DataFrame({
        'date': dates,
        'temperature': temps,
        'humidity': humidities
    })

    # Save the full DataFrame; no separate test file is produced while the
    # train/test split is disabled.
    output_folder = '../../data/processed'
    os.makedirs(output_folder, exist_ok=True)
    train_output_file = os.path.join(output_folder, 'train_weather_data.csv')
    df.to_csv(train_output_file, index=False)
    print("Train data saved to:", train_output_file)

# Write the combined series from the previous cell to CSV (default train_percent).
save_to_csv(temps, humidities, dates)

Train data saved to: ../../data/processed\train_weather_data.csv
