# In [None]:
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 26 11:35:18 2024

@author: Tanaka Akiyama
"""

import os
import numpy as np
import netCDF4 as nc
import pandas as pd
from datetime import datetime

# Function to calculate daily averages
def calculate_daily_averages(data):
    """Average an hourly series into one value per complete day.

    Parameters:
    data - 1-D NumPy array of hourly samples in time order

    Returns:
    1-D array with the mean of each consecutive 24-sample window. Trailing
    samples that do not fill a whole day are dropped, the same way
    calculate_biweekly_averages drops an incomplete bi-weekly segment.
    """
    # Number of hours in one day
    hours_per_day = 24

    # Determine the number of complete days
    num_days = len(data) // hours_per_day

    # Drop a trailing partial day so the reshape below cannot fail
    data = data[:num_days * hours_per_day]

    # Reshape data so each row holds the 24 hourly samples of one day
    data = data.reshape(num_days, hours_per_day)

    # Calculate daily averages
    day_averages = np.mean(data, axis=1)

    return day_averages

# Function to calculate bi-weekly averages
def calculate_biweekly_averages(data):
    """Collapse an hourly series into means over consecutive 14-day windows.

    Parameters:
    data - 1-D NumPy array of hourly samples in time order

    Returns:
    1-D array with one mean per complete 336-hour window; samples left over
    after the last complete window are ignored.
    """
    window_hours = 14 * 24

    # Only whole windows contribute, so cut the series at the last full one
    complete_windows = len(data) // window_hours
    trimmed = data[:complete_windows * window_hours]

    # One row per window, then average across each row
    return np.mean(trimmed.reshape(-1, window_hours), axis=1)

# Load NetCDF file
def load_netcdf(file_path):
    """Open the NetCDF file at file_path and return the dataset handle.

    The handle is returned open; the caller is responsible for closing it.
    """
    return nc.Dataset(file_path)

# Main function
def process_netcdf(file_paths, train_percent=0.93, output_folder='../../data/processed'):
    """Build daily train/test CSV files from hourly NetCDF weather files.

    For every file the 't' and 'r' series at the central grid point are
    read, the series of all files are concatenated in the order given,
    averaged per day, and split chronologically into a train and a test
    set which are written as CSV files.

    Parameters:
    file_paths - sequence of paths to NetCDF files, in chronological order;
                 each must provide 'latitude', 'longitude', 'time', 't'
                 and 'r' variables
    train_percent - fraction of the daily rows (taken from the start) that
                    goes into the train set; the remainder is the test set
    output_folder - directory the two CSV files are written to (created if
                    it does not exist)

    Raises:
    ValueError - if file_paths is empty
    """
    file_paths = list(file_paths)
    if not file_paths:
        raise ValueError("file_paths must contain at least one NetCDF file")

    # Initialize empty lists to store temperature, humidity and time data
    temperatures = []
    humidities = []
    dates = []

    # Process one file at a time; the context manager closes the dataset
    # even if reading one of its variables fails
    for file_path in file_paths:
        with load_netcdf(file_path) as dataset:
            latitude = dataset.variables['latitude'][:]
            longitude = dataset.variables['longitude'][:]

            # Use the grid point in the middle of the latitude/longitude axes
            middle_lat_index = len(latitude) // 2
            middle_lon_index = len(longitude) // 2

            # netCDF4 applies scale_factor/add_offset automatically on read
            # (its default auto mask-and-scale mode), so these values are
            # already unpacked. Applying the attributes again by hand would
            # scale the data twice, and would fail on files that do not
            # carry those attributes at all.
            temperatures.append(
                dataset.variables['t'][:, middle_lat_index, middle_lon_index])
            humidities.append(
                dataset.variables['r'][:, middle_lat_index, middle_lon_index])
            dates.append(dataset.variables['time'][:])

    # Concatenate the data for all files into single arrays
    all_temperature = np.concatenate(temperatures)
    all_humidity = np.concatenate(humidities)
    all_dates = np.concatenate(dates)

    # Calculate daily averages for temperature and humidity. Averaging the
    # hourly time stamps of a day yields a time inside that same day, which
    # is enough to recover the calendar date below.
    temperature_daily = calculate_daily_averages(all_temperature)
    humidity_daily = calculate_daily_averages(all_humidity)
    dates_daily = calculate_daily_averages(all_dates)

    # Convert time from hours since 1900-01-01 00:00:00.0 to date
    dates_daily = nc.num2date(dates_daily, "hours since 1900-01-01 00:00:00.0", "gregorian")

    # Extract only the date portion from datetime objects
    dates_daily = [datetime(d.year, d.month, d.day) for d in dates_daily]

    # Create DataFrame with date, temperature, humidity, and index columns
    df = pd.DataFrame({
        'date': dates_daily,
        'temperature': temperature_daily,
        'humidity': humidity_daily,
        'index': range(1, len(temperature_daily) + 1)
    })

    # Split the data chronologically into train and test sets
    train_size = int(len(df) * train_percent)
    train_df = df.iloc[:train_size]
    test_df = df.iloc[train_size:]

    # Save train and test DataFrames to CSV files
    os.makedirs(output_folder, exist_ok=True)
    train_output_file = os.path.join(output_folder, 'train_weather_data_daily.csv')
    test_output_file = os.path.join(output_folder, 'test_weather_data_daily.csv')
    train_df.to_csv(train_output_file, index=False)
    test_df.to_csv(test_output_file, index=False)
    print("Train data saved to:", train_output_file)
    print("Test data saved to:", test_output_file)
    

def print_netcdf_metadata(file_path):
    """
    Prints netcdf file metadata.

    For every variable the name, dimensions, shape, units and all attributes
    are printed, followed by the file's global attributes. Any error while
    opening or reading the file is reported on stdout instead of raised.

    Parameters:
    file_path - string path to netcdf file
    """
    try:
        # The context manager closes the file even if printing fails midway
        with nc.Dataset(file_path) as dataset:
            print("NetCDF file metadata:")
            print("Variables:")
            for var_name in dataset.variables:
                var = dataset.variables[var_name]
                print("\tVariable name:", var_name)
                print("\tDimensions:", var.dimensions)
                print("\tShape:", var.shape)
                # Not every variable carries a units attribute; a missing one
                # must not abort the listing of the remaining variables
                print("\tUnits:", getattr(var, "units", "N/A"))
                print("\tAttributes:")
                for attr_name in var.ncattrs():
                    print("\t\t", attr_name, ":", getattr(var, attr_name))
                print("\n")
            print("Global attributes:")
            for attr_name in dataset.ncattrs():
                print("\t", attr_name, ":", getattr(dataset, attr_name))
    except Exception as e:
        # Best-effort diagnostic helper: report the problem and carry on
        print("Error:", e)
        


# Raw yearly weather files, listed in chronological order
netcdf_file_paths = [
    '../../data/raw/weather/temp_relhum_2013.nc',
    '../../data/raw/weather/temp_relhum_2014.nc',
    '../../data/raw/weather/temp_relhum_2015.nc',
]

# Example usage: build the daily train/test CSV files
#process_netcdf(netcdf_file_paths)

# Dump the metadata of every raw file
for file_path in netcdf_file_paths:
    print_netcdf_metadata(file_path)

