# Arizona Weather Analysis

#### Importing libraries and merging the individual station's datasets into a master dataset

In [193]:
import pandas as pd
import os

In [194]:
# Environment Variables
ARIZONA_DB_CSV = "Database/csv/Arizona/Arizona.csv"
ARIZONA_STATION_METADATA_CSV = "Database/csv/Arizona/station_metadata.csv"
ARIZONA_GROUPED_DATA_CSV = 'Database/csv/Arizona/Arizona_Yearly_Mean_Temp.csv'

In [None]:
# Read the CSV files
# Empty list that will be populated with the datasets
all_dfs = []

# Loops through the files in the Datasets directory
for file in os.listdir("Database/csv/Arizona/datasets"):

    # Executes only the .csv files
    if(file.endswith('.csv')):

        # Creates a new DataFrame with STATION, DATE, Hourly Dry Bulb Temperature, and Hourly Wet Bulb Temperature 
        new_df = pd.read_csv(f'Database/csv/Arizona/datasets/{file}', usecols=['STATION', 'DATE', 'HourlyWetBulbTemperature'], dtype={'STATION': 'string', 'DATE': 'string'})

        # Appends the new DataFrame in the list
        all_dfs.append(new_df)

        # Deletes the new DataFrame from the memory
        del new_df

# Created a DataFrame that contains all the DataFrames 
master_df = pd.concat(all_dfs)

In [196]:
master_df

Unnamed: 0,STATION,DATE,HourlyWetBulbTemperature
0,72281353146,2021-06-12T00:56:00,47.0
1,72281353146,2021-06-12T01:56:00,46.0
2,72281353146,2021-06-12T02:56:00,45.0
3,72281353146,2021-06-12T03:56:00,46.0
4,72281353146,2021-06-12T04:56:00,44.0
...,...,...,...
90596,72378303195,2024-06-10T20:54:00,42.0
90597,72378303195,2024-06-10T21:54:00,40.0
90598,72378303195,2024-06-10T22:54:00,38.0
90599,72378303195,2024-06-10T23:54:00,37.0


## Data Cleaning
### Cleaning the Hourly Wet Bulb Temperature Column

In [197]:
master_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2204117 entries, 0 to 90600
Data columns (total 3 columns):
 #   Column                    Dtype 
---  ------                    ----- 
 0   STATION                   string
 1   DATE                      string
 2   HourlyWetBulbTemperature  object
dtypes: object(1), string(2)
memory usage: 67.3+ MB


In [198]:
# Deletes the rows with no hourly wet bulb temperature data
master_df.dropna(subset=['HourlyWetBulbTemperature'], inplace=True)

# Convert HourlyWetBulbTemperature to float
master_df['HourlyWetBulbTemperature'] = master_df['HourlyWetBulbTemperature'].apply(pd.to_numeric, errors='coerce')

master_df

master_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 902213 entries, 0 to 90599
Data columns (total 3 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   STATION                   902213 non-null  string 
 1   DATE                      902213 non-null  string 
 2   HourlyWetBulbTemperature  902188 non-null  float64
dtypes: float64(1), string(2)
memory usage: 27.5 MB


### Cleaning the Date Column

In [199]:
# Converting the DATE column to datetime type
master_df['DATE'] = pd.to_datetime(master_df['DATE'], format='%Y-%m-%dT%H:%M:%S')

# Extracting Year from the DATE column
master_df['Year'] = master_df['DATE'].dt.year

# Extracting Month from the DATE column
master_df['Month'] = master_df['DATE'].dt.month

# Extracting Day from the DATE column
master_df['Day'] = master_df['DATE'].dt.day

# Extracting Time from the DATE column
master_df['Time'] = master_df['DATE'].dt.time

# Extracting the Hour from the Time column
master_df['Hour'] = master_df['Time'].apply(lambda x: x.hour)

master_df[['DATE', 'Year', 'Time', 'Month', 'Day', 'Hour']]

Unnamed: 0,DATE,Year,Time,Month,Day,Hour
0,2021-06-12 00:56:00,2021,00:56:00,6,12,0
1,2021-06-12 01:56:00,2021,01:56:00,6,12,1
2,2021-06-12 02:56:00,2021,02:56:00,6,12,2
3,2021-06-12 03:56:00,2021,03:56:00,6,12,3
4,2021-06-12 04:56:00,2021,04:56:00,6,12,4
...,...,...,...,...,...,...
90595,2024-06-10 19:54:00,2024,19:54:00,6,10,19
90596,2024-06-10 20:54:00,2024,20:54:00,6,10,20
90597,2024-06-10 21:54:00,2024,21:54:00,6,10,21
90598,2024-06-10 22:54:00,2024,22:54:00,6,10,22


### Cleaning Station Column

In [200]:
master_df.columns

Index(['STATION', 'DATE', 'HourlyWetBulbTemperature', 'Year', 'Month', 'Day',
       'Time', 'Hour'],
      dtype='object')

In [201]:
master_df['STATION'].info()

<class 'pandas.core.series.Series'>
Int64Index: 902213 entries, 0 to 90599
Series name: STATION
Non-Null Count   Dtype 
--------------   ----- 
902213 non-null  string
dtypes: string(1)
memory usage: 13.8 MB


## Data Processing

#### Spliting the first 6 characters and renaming it to WBAN ID

In [202]:
master_df[['STATION']]

Unnamed: 0,STATION
0,72281353146
1,72281353146
2,72281353146
3,72281353146
4,72281353146
...,...
90595,72378303195
90596,72378303195
90597,72378303195
90598,72378303195


In [203]:
# Spliting the first 6 characters and renaming it to WBAN ID
master_df['STATION'] = master_df['STATION'].str[6:]
master_df[['STATION']]

Unnamed: 0,STATION
0,53146
1,53146
2,53146
3,53146
4,53146
...,...
90595,03195
90596,03195
90597,03195
90598,03195


### Merging the station metadata into master Dataframe
#### Converting the WBAN ID to string and filling it with zeros

In [204]:
# Reading the station metadata
station_metadata_df = pd.read_csv(ARIZONA_STATION_METADATA_CSV)

station_metadata_df['WBAN ID'].info() 

<class 'pandas.core.series.Series'>
RangeIndex: 37 entries, 0 to 36
Series name: WBAN ID
Non-Null Count  Dtype
--------------  -----
37 non-null     int64
dtypes: int64(1)
memory usage: 428.0 bytes


In [205]:
# Converting the WBAN ID to string and filling it with zeros
station_metadata_df['WBAN ID'] = station_metadata_df['WBAN ID'].astype('string').str.zfill(5)

station_metadata_df[['WBAN ID']]

Unnamed: 0,WBAN ID
0,226
1,3914
2,53128
3,53129
4,23109
5,93026
6,53132
7,3103
8,53146
9,3124


#### Merging station metadata with master dataframe

In [206]:
# Merge station metadata with master dataframe
master_df = master_df.merge(station_metadata_df[['WBAN ID', 'Station Name', 'Latitude', 'Longitude']], how='left', left_on='STATION', right_on='WBAN ID')

# Drop the redundant WBAN ID column
master_df.drop(columns='WBAN ID', inplace=True)

master_df

Unnamed: 0,STATION,DATE,HourlyWetBulbTemperature,Year,Month,Day,Time,Hour,Station Name,Latitude,Longitude
0,53146,2021-06-12 00:56:00,47.0,2021,6,12,00:56:00,0,"FORT HUACHUCA PIONEER AIRFIELD, AZ US",31.60563°,-110.42809°
1,53146,2021-06-12 01:56:00,46.0,2021,6,12,01:56:00,1,"FORT HUACHUCA PIONEER AIRFIELD, AZ US",31.60563°,-110.42809°
2,53146,2021-06-12 02:56:00,45.0,2021,6,12,02:56:00,2,"FORT HUACHUCA PIONEER AIRFIELD, AZ US",31.60563°,-110.42809°
3,53146,2021-06-12 03:56:00,46.0,2021,6,12,03:56:00,3,"FORT HUACHUCA PIONEER AIRFIELD, AZ US",31.60563°,-110.42809°
4,53146,2021-06-12 04:56:00,44.0,2021,6,12,04:56:00,4,"FORT HUACHUCA PIONEER AIRFIELD, AZ US",31.60563°,-110.42809°
...,...,...,...,...,...,...,...,...,...,...,...
902208,03195,2024-06-10 19:54:00,47.0,2024,6,10,19:54:00,19,"GRAND CANYON NATIONAL PARK AIRPORT, AZ US",35.94581°,-112.15536°
902209,03195,2024-06-10 20:54:00,42.0,2024,6,10,20:54:00,20,"GRAND CANYON NATIONAL PARK AIRPORT, AZ US",35.94581°,-112.15536°
902210,03195,2024-06-10 21:54:00,40.0,2024,6,10,21:54:00,21,"GRAND CANYON NATIONAL PARK AIRPORT, AZ US",35.94581°,-112.15536°
902211,03195,2024-06-10 22:54:00,38.0,2024,6,10,22:54:00,22,"GRAND CANYON NATIONAL PARK AIRPORT, AZ US",35.94581°,-112.15536°


#### Checking for missing values in the master dataframe

In [207]:
# Checking for missing values in the master dataframe
missing_values = master_df.isnull().sum()
print(missing_values)

STATION                      0
DATE                         0
HourlyWetBulbTemperature    25
Year                         0
Month                        0
Day                          0
Time                         0
Hour                         0
Station Name                 0
Latitude                     0
Longitude                    0
dtype: int64


####  Rearranging the columns


In [208]:
# Rearranging the columns
master_df = master_df[['DATE', 'Year', 'Month', 'Day', 'Time', 'Hour', 'STATION', 'Station Name', 'Latitude', 'Longitude', 'HourlyWetBulbTemperature']]
master_df

Unnamed: 0,DATE,Year,Month,Day,Time,Hour,STATION,Station Name,Latitude,Longitude,HourlyWetBulbTemperature
0,2021-06-12 00:56:00,2021,6,12,00:56:00,0,53146,"FORT HUACHUCA PIONEER AIRFIELD, AZ US",31.60563°,-110.42809°,47.0
1,2021-06-12 01:56:00,2021,6,12,01:56:00,1,53146,"FORT HUACHUCA PIONEER AIRFIELD, AZ US",31.60563°,-110.42809°,46.0
2,2021-06-12 02:56:00,2021,6,12,02:56:00,2,53146,"FORT HUACHUCA PIONEER AIRFIELD, AZ US",31.60563°,-110.42809°,45.0
3,2021-06-12 03:56:00,2021,6,12,03:56:00,3,53146,"FORT HUACHUCA PIONEER AIRFIELD, AZ US",31.60563°,-110.42809°,46.0
4,2021-06-12 04:56:00,2021,6,12,04:56:00,4,53146,"FORT HUACHUCA PIONEER AIRFIELD, AZ US",31.60563°,-110.42809°,44.0
...,...,...,...,...,...,...,...,...,...,...,...
902208,2024-06-10 19:54:00,2024,6,10,19:54:00,19,03195,"GRAND CANYON NATIONAL PARK AIRPORT, AZ US",35.94581°,-112.15536°,47.0
902209,2024-06-10 20:54:00,2024,6,10,20:54:00,20,03195,"GRAND CANYON NATIONAL PARK AIRPORT, AZ US",35.94581°,-112.15536°,42.0
902210,2024-06-10 21:54:00,2024,6,10,21:54:00,21,03195,"GRAND CANYON NATIONAL PARK AIRPORT, AZ US",35.94581°,-112.15536°,40.0
902211,2024-06-10 22:54:00,2024,6,10,22:54:00,22,03195,"GRAND CANYON NATIONAL PARK AIRPORT, AZ US",35.94581°,-112.15536°,38.0


## Data Aggregation
#### Calculating the minimum hourly wet bulb temperature for each month, day, and hour

In [209]:
# Calculating the minimum hourly wet bulb temperature for each month, day, and hour
master_df.groupby(['STATION','Month', 'Day', 'Hour'])['HourlyWetBulbTemperature'].mean()

STATION  Month  Day  Hour
00121    1      1    0       48.666667
                     1       47.000000
                     2       49.333333
                     3       49.666667
                     4       50.333333
                               ...    
93084    12     31   19      40.666667
                     20      41.333333
                     21      44.000000
                     22      40.333333
                     23      40.000000
Name: HourlyWetBulbTemperature, Length: 226164, dtype: float64

#### Create a dataframe from the grouped data

In [210]:
# Create a dataframe from the grouped data
df_grouped = pd.DataFrame(master_df.groupby(['STATION','Month', 'Day', 'Hour'])['HourlyWetBulbTemperature'].mean())

df_grouped.rename(columns={'HourlyWetBulbTemperature': 'Mean Hourly Wet Bulb Temperature'}, inplace=True)

df_grouped.reset_index(inplace=True)

df_grouped

Unnamed: 0,STATION,Month,Day,Hour,Mean Hourly Wet Bulb Temperature
0,00121,1,1,0,48.666667
1,00121,1,1,1,47.000000
2,00121,1,1,2,49.333333
3,00121,1,1,3,49.666667
4,00121,1,1,4,50.333333
...,...,...,...,...,...
226159,93084,12,31,19,40.666667
226160,93084,12,31,20,41.333333
226161,93084,12,31,21,44.000000
226162,93084,12,31,22,40.333333


#### Merge the grouped dataframe with the station metadata

In [211]:
# Merge the grouped dataframe with the station metadata
df_grouped = df_grouped.merge(station_metadata_df[['WBAN ID', 'Station Name', 'Latitude', 'Longitude']], how='left', left_on='STATION', right_on='WBAN ID')

# Drop the redundant columns
df_grouped.drop(columns='WBAN ID', inplace=True)


df_grouped

Unnamed: 0,STATION,Month,Day,Hour,Mean Hourly Wet Bulb Temperature,Station Name,Latitude,Longitude
0,00121,1,1,0,48.666667,"RYAN FIELD AIRPORT, AZ US",32.15°,-111.167°
1,00121,1,1,1,47.000000,"RYAN FIELD AIRPORT, AZ US",32.15°,-111.167°
2,00121,1,1,2,49.333333,"RYAN FIELD AIRPORT, AZ US",32.15°,-111.167°
3,00121,1,1,3,49.666667,"RYAN FIELD AIRPORT, AZ US",32.15°,-111.167°
4,00121,1,1,4,50.333333,"RYAN FIELD AIRPORT, AZ US",32.15°,-111.167°
...,...,...,...,...,...,...,...,...
226159,93084,12,31,19,40.666667,"SAFFORD MUNICIPAL AIRPORT, AZ US",32.85654°,-109.63603°
226160,93084,12,31,20,41.333333,"SAFFORD MUNICIPAL AIRPORT, AZ US",32.85654°,-109.63603°
226161,93084,12,31,21,44.000000,"SAFFORD MUNICIPAL AIRPORT, AZ US",32.85654°,-109.63603°
226162,93084,12,31,22,40.333333,"SAFFORD MUNICIPAL AIRPORT, AZ US",32.85654°,-109.63603°


#### Rearranging Columns

In [212]:
# Rearranging the columns
df_grouped = df_grouped[['STATION', 'Station Name', 'Latitude', 'Longitude', 'Month', 'Day', 'Hour', 'Mean Hourly Wet Bulb Temperature']]

df_grouped

Unnamed: 0,STATION,Station Name,Latitude,Longitude,Month,Day,Hour,Mean Hourly Wet Bulb Temperature
0,00121,"RYAN FIELD AIRPORT, AZ US",32.15°,-111.167°,1,1,0,48.666667
1,00121,"RYAN FIELD AIRPORT, AZ US",32.15°,-111.167°,1,1,1,47.000000
2,00121,"RYAN FIELD AIRPORT, AZ US",32.15°,-111.167°,1,1,2,49.333333
3,00121,"RYAN FIELD AIRPORT, AZ US",32.15°,-111.167°,1,1,3,49.666667
4,00121,"RYAN FIELD AIRPORT, AZ US",32.15°,-111.167°,1,1,4,50.333333
...,...,...,...,...,...,...,...,...
226159,93084,"SAFFORD MUNICIPAL AIRPORT, AZ US",32.85654°,-109.63603°,12,31,19,40.666667
226160,93084,"SAFFORD MUNICIPAL AIRPORT, AZ US",32.85654°,-109.63603°,12,31,20,41.333333
226161,93084,"SAFFORD MUNICIPAL AIRPORT, AZ US",32.85654°,-109.63603°,12,31,21,44.000000
226162,93084,"SAFFORD MUNICIPAL AIRPORT, AZ US",32.85654°,-109.63603°,12,31,22,40.333333


In [213]:
# Save the grouped data to a CSV file
df_grouped.to_csv(ARIZONA_GROUPED_DATA_CSV, index=False)