In [79]:
"""

Goal: Take a netCDF file and transform it into a time series .csv file.

Input: A netCDF file containing daily average temperature and humidity variables
for 2015 at multiple locations.

Result: A time series .csv file saved into a specific directory. It contains the
daily average temperature for 2015 for the grid cell covering
Brookhaven National Laboratory (BNL).

"""

from pathlib import Path

import netCDF4 as nc #Used to read in the .nc files
import numpy as np
import pandas as pd
import xarray as xr #Used for analyzing .nc files
from netCDF4 import Dataset

# Report the installed library versions so a reader can compare environments.
# The recorded run of this notebook used xarray 2024.6.0 and netCDF4 1.7.1.post1.
for label, module in (("Xarray:", xr), ("netCDF4:", nc)):
    print(label, module.__version__)


Xarray: 2024.6.0
netCDF4: 1.7.1.post1


In [73]:
# Reading in the netCDF file
data = Dataset('/Users/gabbyvaillant/EDA-MRI-ESM/source_gcm_data/temp_humi_day_MRI-ESM2-0_ssp585_r1i1p1f1_gn_20150101-20151231.nc', 'r')

# Coordinate axes and the temperature field, read fully into memory.
# `tas` is indexed as [time, lat, lon] further down in this notebook.
lat = data.variables['lat'][:] #degrees north
lon = data.variables['lon'][:] #degrees east
tas = data.variables['tas'][:]

# Latitude / longitude of my office building (Building 490 at BNL),
# as given by Google Maps (longitude is signed: negative = west).
lat_address = 40.8673561087823
lon_address = -72.88334488861287

# The file stores longitude in degrees east, so the signed map longitude has
# to be wrapped onto that convention. The modulo handles both hemispheres:
# -72.88 -> 287.12, while an eastern (positive) longitude is left unchanged.
# NOTE(review): assumes the file's lon axis runs 0..360 -- confirm with lon.min()/lon.max().
new_lon_address = lon_address % 360
#print(new_lon_address)

# Squared distance from every grid coordinate to the target coordinate
sq_diff_lat = (lat - lat_address)**2
sq_diff_lon = (lon - new_lon_address)**2

# Index of the closest latitude / longitude on the grid.
# These are the indexes used to access the weather data for the grid cell
# covering my address.
min_index_lat = sq_diff_lat.argmin()
min_index_lon = sq_diff_lon.argmin()

# Looking at an example
# The daily average temperature at BNL on 01/01/2015.
# `tas` is a plain array here, so the units must be read from the variable itself:
#print(tas[0, min_index_lat, min_index_lon], data.variables['tas'].units)
#ANSWER (recorded for indices [0, 116, 255]): 271.17395 K (about 28.44 Fahrenheit)

In [87]:
# Taking the netCDF file and transforming it to a time series dataset

# Decode the real time stamps stored in the file instead of slicing the
# `units` string: the date inside "days since YYYY-MM-DD" is only the
# reference epoch and is not guaranteed to be the first day of the record.
# Python datetimes are requested so pandas can build a DatetimeIndex; this
# raises ValueError for non-standard calendars (e.g. noleap / 360_day).
time_var = data.variables['time']
timestamps = nc.num2date(
    time_var[:],
    units=time_var.units,
    calendar=getattr(time_var, 'calendar', 'standard'),
    only_use_cftime_datetimes=False,
    only_use_python_datetimes=True,
)

# One entry per time step; normalize() drops any time-of-day component so the
# index holds calendar dates only.
date_range = pd.DatetimeIndex(np.asarray(timestamps), name='Date').normalize()

# First and last date covered by this file, as 'YYYY-MM-DD' strings
starting_date = date_range[0].strftime('%Y-%m-%d')
ending_date = date_range[-1].strftime('%Y-%m-%d')

# Pull the whole series for the selected grid cell in one slice (no per-row
# loop). Masked (missing) values, if any, become NaN in the float column.
temperature = np.ma.filled(
    tas[:, min_index_lat, min_index_lon].astype(float), np.nan
)

# pandas raises ValueError here if the number of values and dates differ
df = pd.DataFrame({'Temperature': temperature}, index=date_range)

# Specifying the directory where the CSV file should be saved
output_directory = '/Users/gabbyvaillant/Downloads/BNL'
output_file = 'temperature_BNL_tas_2015.csv'
output_path = f"{output_directory}/{output_file}"

# Make sure the target directory exists so the write below cannot fail on it
Path(output_directory).mkdir(parents=True, exist_ok=True)

# Saving the time series into a csv
df.to_csv(output_path)
print("Saved to:", output_path)

<class 'pandas.core.indexes.datetimes.DatetimeIndex'>
Saved to: /Users/gabbyvaillant/Downloads/BNL/temperature_BNL_tas_2015.csv


In [80]:
"""

Next steps: 
1. This is only for one year, at one specific location, figure out the
best way to convert the netCDF files to timeseries for every location
during the year, for multiple years
2. Try to input this new timeseries .csv file into the TimeGAN and see how it works


"""


'\n\nNext steps: \n1. This is only for one year, at one specific location, figure out the\nbest way to convert the netCDF files to timeseries for every location\nduring the year, for multiple years\n2. Try to input this new timeseries .csv file into the TimeGAN and see how it works\n\n\n'