In [2]:
import pandas as pd
import os
import glob as gl
import io


In [3]:
# Folder holding the hourly station CSV exports. Named `hourly_dir` rather
# than `dir` so the builtin dir() is not shadowed for the rest of the session.
hourly_dir = (r'C:\Users\daryl\OneDrive\Documents\GDAA3000\ProjectDischarge'
              r'\RdrsSample\LstmDatasets\NsWeatherData\01ED007\Hourly')

# Columns of the hourly exports that are discarded after loading.
# DataFrame.drop raises KeyError if any of them is absent from the combined frame.
unused_columns = [
    'Longitude (x)', 'Latitude (y)', 'Station Name',
    'Year', 'Month', 'Day', 'Temp Flag', 'Dew Point Temp Flag',
    'Rel Hum Flag', 'Precip. Amount (mm)', 'Precip. Amount Flag',
    'Wind Dir Flag', 'Visibility (km)', 'Visibility Flag',
    'Stn Press Flag', 'Hmdx', 'Hmdx Flag', 'Wind Chill',
    'Wind Chill Flag', 'Weather', 'Wind Dir (10s deg)',
    'Wind Spd (km/h)', 'Wind Spd Flag', 'Stn Press (kPa)',
    'Time (LST)',
]

# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# the order in which the files are concatenated reproducible.
csv_files = sorted(gl.glob(hourly_dir + '/*8202000*.csv'))
if not csv_files:
    # Fail here with a message naming the folder instead of letting
    # pd.concat([]) raise "No objects to concatenate" further down.
    raise FileNotFoundError(
        f"No CSV files matching '*8202000*.csv' found in {hourly_dir}")

dfs = []
for csv_path in csv_files:
    # Read the raw bytes once, then decode: UTF-8 first, ISO-8859-1 as the
    # fallback when the file is not valid UTF-8.
    with open(csv_path, 'rb') as fh:
        raw_bytes = fh.read()
    try:
        content = raw_bytes.decode('utf-8')
    except UnicodeDecodeError:
        content = raw_bytes.decode('ISO-8859-1')
    dfs.append(pd.read_csv(io.StringIO(content)))

df_all = pd.concat(dfs, ignore_index=True).drop(columns=unused_columns)

# Disabled: localise the timestamps and convert them to UTC. While this stays
# commented out, 'date' holds the naive values from 'Date/Time (LST)'.
# # Make Date/Time (LST) column the index
# df_all['Date/Time (LST)'] = pd.to_datetime(df_all['Date/Time (LST)'])
# # Convert Date/Time (LST) to datetime UTC
# df_all['Date/Time (UTC)'] = df_all['Date/Time (LST)'].dt.tz_localize(
#     'America/Toronto', nonexistent='shift_forward', ambiguous='NaT').dt.tz_convert('UTC')
# df_all = df_all.drop(columns=['Date/Time (LST)'])

# One rename covers both timestamp variants: rename() ignores keys that are
# not present, so whichever of the LST / UTC columns exists becomes 'date'.
df_all = df_all.rename(columns={'Date/Time (LST)': 'date', 'Temp (°C)': 'temp_C',
                                'Dew Point Temp (°C)': 'dew_point_C',
                                'Rel Hum (%)': 'rel_hum_percent', 'Date/Time (UTC)': 'date'})

# Parse the timestamps, then put the rows in chronological order so the result
# does not depend on how the source files happen to be named or listed.
df_all['date'] = pd.to_datetime(df_all['date'])
df_all = df_all.sort_values('date', kind='stable').reset_index(drop=True)

print(df_all.columns)
print(df_all.head())


Index(['Climate ID', 'date', 'temp_C', 'dew_point_C', 'rel_hum_percent'], dtype='object')
   Climate ID                date  temp_C  dew_point_C  rel_hum_percent
0     8202000 2011-01-01 00:00:00     3.0          0.9             86.0
1     8202000 2011-01-01 01:00:00     3.8          1.5             85.0
2     8202000 2011-01-01 02:00:00     4.0          2.2             88.0
3     8202000 2011-01-01 03:00:00     3.8          2.0             88.0
4     8202000 2011-01-01 04:00:00     3.6          2.0             89.0


Convert hourly to daily

In [4]:
# Index the hourly observations by timestamp under a new name. set_index is
# used without inplace=True so df_all keeps its 'date' column and this cell
# does not consume its own input.
df_hourly = df_all.set_index('date')

# The concatenated hourly index carries no declared frequency.
print(df_hourly.index.freq)

# Daily means of dew point and relative humidity. resample('D') aggregates
# every observation that falls on a calendar day. asfreq('D') is deliberately
# NOT used on the hourly frame: it only reindexes, which would keep the 00:00
# row of each day and silently discard all other hours.
df_daily = df_hourly[['dew_point_C', 'rel_hum_percent']].resample('D').mean()

# The resampled index is regularly spaced at daily frequency.
print(df_daily.index.freq)
# Print the first and last dates of the daily index
print(df_daily.index[0], df_daily.index[-1])

print(df_daily.head())

# Save the daily means derived from the hourly data to CSV.
# `dst` is also read by the later cell that writes the joined file.
dst = (r'C:\Users\daryl\OneDrive\Documents\GDAA3000\ProjectDischarge\RdrsSample'
       r'\LstmDatasets\NsWeatherData\01ED007')
f = '01ED007_weather_from_hourly_2011-2021.csv'
df_daily.to_csv(os.path.join(dst, f))

None
<Day>
2011-01-01 00:00:00 2021-12-31 00:00:00
            dew_point_C  rel_hum_percent
date                                    
2011-01-01     2.333333        87.875000
2011-01-02     0.050000        96.166667
2011-01-03    -2.150000        88.208333
2011-01-04    -8.120833        70.708333
2011-01-05    -9.475000        63.000000


In [6]:
# Folder and file name of the existing daily weather table.
src = (r'C:\Users\daryl\OneDrive\Documents\GDAA3000\ProjectDischarge\RdrsSample'
       r'\LstmDatasets\NsWeatherData\01ED007')
f = '01ED007_weather_2011-2021.csv'

# Load the table, parse its 'date' column and promote it to the index in a
# single chain (no in-place mutation).
df_1 = (
    pd.read_csv(os.path.join(src, f))
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
)

# Frequency of the index as loaded, before any reindexing.
print(df_1.index.freq)

# Reindex onto a regular daily grid; dates absent from the file become NaN rows.
df_1 = df_1.asfreq('D')

# Frequency after reindexing, followed by the first and last index dates.
print(df_1.index.freq)
print(*df_1.index[[0, -1]])

# Place the daily table and the hourly-derived daily means side by side,
# aligned on the date index.
df_all = pd.concat([df_1, df_daily], axis='columns')

# Write the joined table to the folder held in `dst` (set in an earlier cell).
save_path = os.path.join(dst, '01ED007_weather_hourly-daily_joined_2011-2021_daily.csv')
df_all.to_csv(save_path)
print(df_all.columns)
print(df_all.head())


None
<Day>
2011-01-01 00:00:00 2021-12-31 00:00:00
Index(['Climate ID', 'max_tem_deg_c', 'min_temp_deg_c', 'mean_temp_deg_c',
       'heat_deg_days_deg_c', 'cool_deg_days_deg_c', 'rain_mm', 'snow_cm',
       'total_prcp_mm', 'snow_on_grnd_cm', 'dir_max_gust_10s_deg',
       'spd_max_gust_km_h', 'dew_point_C', 'rel_hum_percent'],
      dtype='object')
            Climate ID  max_tem_deg_c  min_temp_deg_c  mean_temp_deg_c  \
date                                                                     
2011-01-01     8202000            6.3             0.1              3.2   
2011-01-02     8202000            1.9            -1.2              0.4   
2011-01-03     8202000            1.6            -4.2             -1.3   
2011-01-04     8202000           -1.4            -6.0             -3.7   
2011-01-05     8202000           -1.1            -6.0             -3.6   

            heat_deg_days_deg_c  cool_deg_days_deg_c  rain_mm  snow_cm  \
date                                                  

In [None]:

# Disabled diagnostic: lists dates present in one of df_1 / df_daily but not
# in the other.
# NOTE(review): as written it reads df_1['date'] as a column, but the cell
# that loads df_1 moves 'date' into the index via set_index. The df_1['date']
# lookups would need to become df_1.index before this is re-enabled.
# # Compare the 'date' columns of df_1 and df_daily to find non-matching dates
# # Sort df_1 by 'date'
# df_1 = df_1.sort_values('date')
# # Find the dates in df_1 that are not in df_daily
# missing_dates = df_1[~df_1['date'].isin(df_daily.index)]
# # Count the number of missing dates and print the result
# print(len(missing_dates))
# print(missing_dates)

# # Find the dates in df_daily that are not in df_1
# missing_dates_in_daily = df_daily[~df_daily.index.isin(df_1['date'])]
# # Count the number of missing dates and print the result
# print(len(missing_dates_in_daily))
# print(missing_dates_in_daily)