In [3]:
import pandas as pd
import os

In [5]:
# Environment Variables
RAW_WEATHER_DATA_DIR = "Database/csv/Raw Datasets/Weather Data"
DERIVED_WEATHER_DATA_DIR = "Database/csv/Derived Datasets/Weather Data"
SAMPLE_DATA_DIR = "Database/csv/Raw Datasets/Weather Data/Arizona_Weather_Data.csv"

In [17]:
sample_df = pd.read_csv(SAMPLE_DATA_DIR)
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 816168 entries, 0 to 816167
Data columns (total 10 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   location_id               816168 non-null  int64  
 1   latitude                  816168 non-null  float64
 2   longitude                 816168 non-null  float64
 3   elevation                 816168 non-null  float64
 4   timezone_abbreviation     816168 non-null  object 
 5   time                      816168 non-null  object 
 6   temperature_2m (°C)       816168 non-null  float64
 7   relative_humidity_2m (%)  816168 non-null  int64  
 8   dew_point_2m (°C)         816168 non-null  float64
 9   precipitation (mm)        816168 non-null  float64
dtypes: float64(6), int64(2), object(2)
memory usage: 62.3+ MB


In [18]:
sample_df['time'] = pd.to_datetime(sample_df['time'], format='%Y-%m-%dT%H:%M:%S')

sample_df['hour'] = sample_df['time'].dt.hour

sample_df['hour']

0          0
1          1
2          2
3          3
4          4
          ..
816163    19
816164    20
816165    21
816166    22
816167    23
Name: hour, Length: 816168, dtype: int64

In [23]:
# Parse the raw weather data
all_dfs = []

for file in os.listdir(RAW_WEATHER_DATA_DIR):
    if file.endswith(".csv"):
        # Read the raw weather data
        df = pd.read_csv(f"{RAW_WEATHER_DATA_DIR}/{file}")

        # Add the state column
        df['state'] = f'{file.split("_")[0]}' 

        # Extract the relevant columns
        df = df[['location_id', 'time', 'latitude', 'longitude', 'temperature_2m (°C)', 'relative_humidity_2m (%)', 'state']]

        # Convert the time column to datetime
        df['date'] = pd.to_datetime(df['time'], format='%Y-%m-%dT%H:%M:%S')

        # Drop the time column
        df = df.drop(columns=['time'])

        df['year'] = df['date'].dt.year

        df['month'] = df['date'].dt.month

        df['day'] = df['date'].dt.day

        df['time'] = df['date'].dt.time

        all_dfs.append(df)

all_dfs[0]

Unnamed: 0,location_id,latitude,longitude,temperature_2m (°C),relative_humidity_2m (%),state,date,year,month,day,time
0,0,30.333918,-97.80774,25.7,85,Texas,2021-06-10 00:00:00,2021,6,10,00:00:00
1,0,30.333918,-97.80774,25.0,89,Texas,2021-06-10 01:00:00,2021,6,10,01:00:00
2,0,30.333918,-97.80774,24.6,91,Texas,2021-06-10 02:00:00,2021,6,10,02:00:00
3,0,30.333918,-97.80774,24.0,94,Texas,2021-06-10 03:00:00,2021,6,10,03:00:00
4,0,30.333918,-97.80774,23.6,96,Texas,2021-06-10 04:00:00,2021,6,10,04:00:00
...,...,...,...,...,...,...,...,...,...,...,...
1105771,41,32.724075,-102.63736,23.0,67,Texas,2024-06-10 19:00:00,2024,6,10,19:00:00
1105772,41,32.724075,-102.63736,27.9,33,Texas,2024-06-10 20:00:00,2024,6,10,20:00:00
1105773,41,32.724075,-102.63736,25.1,46,Texas,2024-06-10 21:00:00,2024,6,10,21:00:00
1105774,41,32.724075,-102.63736,23.7,58,Texas,2024-06-10 22:00:00,2024,6,10,22:00:00


## Hourly Mean temperature for each unique combination of location

1. **Initialize an Empty List**: `mean_temperature_by_location_id_df = []` initializes an empty list. This list will eventually hold the resulting DataFrames, each containing mean temperature data grouped by specific criteria.

2. **Loop Through DataFrames**: The `for df in all_dfs:` loop iterates over each DataFrame in the `all_dfs` collection. For each DataFrame (`df`), the following steps are performed:

    - **Group by Multiple Columns**: The DataFrame is grouped by `location_id`, `latitude`, `longitude`, `state`, `year`, `month`, `day`, and `time` using the `groupby` method. The mean temperature (`temperature_2m (°C)`) for each group is calculated. This results in a Series with a MultiIndex composed of the grouping columns.

    - **Convert to DataFrame and Rename Column**: The resulting Series is converted back into a DataFrame. The column containing the mean temperatures, originally named `temperature_2m (°C)`, is renamed to `mean_temperature_2m (°C)` for clarity.

    - **Reset the Index**: The `reset_index` method is called to convert the MultiIndex into regular columns, making the DataFrame easier to work with and understand.

    - **Append to List**: The processed DataFrame is appended to the `mean_temperature_by_location_id_df` list. This list accumulates the grouped and processed DataFrames from each iteration of the loop.

3. **Access the First Result**: After the loop completes, `mean_temperature_by_location_id_df[0]` accesses the first DataFrame in the list. This line is likely used for inspection or demonstration purposes to show the structure or contents of the first processed DataFrame.

Overall, this code is a systematic approach to aggregating and summarizing temperature data across multiple dimensions, making it a valuable tool for climate data analysis or any scenario where understanding temperature trends over time and space is essential.

In [35]:
hourly_mean_temperature_by_location_id_df = []
for df in all_dfs:
    # Group by state, year, month, day, and time
    grouped_df = pd.DataFrame(df.groupby(['location_id', 'latitude', 'longitude','state', 'month', 'day', 'time'])['temperature_2m (°C)'].mean())
    
    # Rename the column
    grouped_df = grouped_df.rename(columns={'temperature_2m (°C)': 'hourly_mean_temperature_2m (°C)'})
    
    # Reset the index
    grouped_df = grouped_df.reset_index()
    
    # Append the grouped dataframe to the list
    hourly_mean_temperature_by_location_id_df.append(grouped_df)

hourly_mean_temperature_by_location_id_df[0]

Unnamed: 0,location_id,latitude,longitude,state,month,day,time,hourly_mean_temperature_2m (°C)
0,0,30.333918,-97.80774,Texas,1,1,00:00:00,16.766667
1,0,30.333918,-97.80774,Texas,1,1,01:00:00,16.833333
2,0,30.333918,-97.80774,Texas,1,1,02:00:00,16.400000
3,0,30.333918,-97.80774,Texas,1,1,03:00:00,16.033333
4,0,30.333918,-97.80774,Texas,1,1,04:00:00,15.533333
...,...,...,...,...,...,...,...,...
368923,41,32.724075,-102.63736,Texas,12,31,19:00:00,12.333333
368924,41,32.724075,-102.63736,Texas,12,31,20:00:00,11.666667
368925,41,32.724075,-102.63736,Texas,12,31,21:00:00,10.600000
368926,41,32.724075,-102.63736,Texas,12,31,22:00:00,9.533333


# Daily Mean Temperature

In [38]:
daily_mean_temperature_by_location_id_df = []
for df in all_dfs:
    # Group by state, year, month, day, and time
    grouped_df = pd.DataFrame(df.groupby(['location_id', 'latitude', 'longitude','state', 'month', 'day'])['temperature_2m (°C)'].mean())
    
    # Rename the column
    grouped_df = grouped_df.rename(columns={'temperature_2m (°C)': 'daily_mean_temperature_2m (°C)'})
    
    # Reset the index
    grouped_df = grouped_df.reset_index()
    
    # Sort the dataframe by date
    grouped_df = grouped_df.sort_values(by=['location_id','month', 'day'])

    # Append the grouped dataframe to the list
    daily_mean_temperature_by_location_id_df.append(grouped_df)

daily_mean_temperature_by_location_id_df[0]

Unnamed: 0,location_id,latitude,longitude,state,month,day,daily_mean_temperature_2m (°C)
0,0,30.333918,-97.80774,Texas,1,1,16.059722
1,0,30.333918,-97.80774,Texas,1,2,9.720833
2,0,30.333918,-97.80774,Texas,1,3,9.780556
3,0,30.333918,-97.80774,Texas,1,4,10.750000
4,0,30.333918,-97.80774,Texas,1,5,11.730556
...,...,...,...,...,...,...,...
15367,41,32.724075,-102.63736,Texas,12,27,9.375000
15368,41,32.724075,-102.63736,Texas,12,28,11.105556
15369,41,32.724075,-102.63736,Texas,12,29,9.215278
15370,41,32.724075,-102.63736,Texas,12,30,8.569444
