### Weather Cleaning and Organization Notebook
This notebook will focus on converting the raw METAR and SPECI weather data into one usable data frame and exporting it to a CSV file that will be used to perform predictive analysis for on time departures out of San Diego International Airport.

In [135]:
# Import libraries
import pandas as pd
import re

from datetime import datetime
from metar import Metar

In [136]:
# Import the data files
df1 = pd.read_csv('Raw_data/raw_weather/KSAN_2023_wx.csv')
df2 = pd.read_csv('Raw_data/raw_weather/KSAN_2024_wx.csv')
df3 = pd.read_csv('Raw_data/raw_weather/KSAN_2025_wx.csv')

# create list of dfs
dfs = [df1, df2, df3]

  df1 = pd.read_csv('Raw_data/raw_weather/KSAN_2023_wx.csv')
  df2 = pd.read_csv('Raw_data/raw_weather/KSAN_2024_wx.csv')


In [137]:
# Removing unnecessary rows besides REM which houses the raw METAR data and concatenating the dataframes
metar_dfs = []

for i, df in enumerate(dfs, start=1):
    if 'REM' in df.columns:
        metar_dfs.append(df[['REM']])
    else:
        print('REM column not found in df{i}')

# concatenating the dfs
comb_metar_df = pd.concat(metar_dfs, ignore_index=True)

In [138]:
comb_metar_df.head()

Unnamed: 0,REM
0,SYN08672290 12366 8//06 10161 20128 30148 4016...
1,MET10512/31/22 16:10:03 SPECI KSAN 010010Z 180...
2,MET12712/31/22 16:51:03 METAR KSAN 010051Z 160...
3,MET12612/31/22 17:51:03 METAR KSAN 010151Z 150...
4,MET10512/31/22 18:16:03 SPECI KSAN 010216Z 140...


In [139]:
comb_metar_df.tail()

Unnamed: 0,REM
30326,MET14103/06/25 21:01:01 SPECI KSAN 070501Z 330...
30327,MET13803/06/25 21:13:01 SPECI KSAN 070513Z 350...
30328,MET16903/06/25 21:51:01 METAR KSAN 070551Z 350...
30329,MET11903/06/25 22:51:01 METAR KSAN 070651Z 330...
30330,SOD77324 HR PRECIPITATION (IN): 0.59 WEATHER(C...


In [140]:
# Filter so that the data only contains METAR data and not extra synthetic information that does not conform to the standard METAR format we will be extracting from
comb_metar_df = comb_metar_df[comb_metar_df['REM'].str.startswith('MET', na=False)]

In [141]:
# reset the index of the dataframe
comb_metar_df.reset_index(drop=True, inplace=True)

In [142]:
print(comb_metar_df.head())
print(comb_metar_df.tail())

                                                 REM
0  MET10512/31/22 16:10:03 SPECI KSAN 010010Z 180...
1  MET12712/31/22 16:51:03 METAR KSAN 010051Z 160...
2  MET12612/31/22 17:51:03 METAR KSAN 010151Z 150...
3  MET10512/31/22 18:16:03 SPECI KSAN 010216Z 140...
4  MET10812/31/22 18:38:03 SPECI KSAN 010238Z 140...
                                                     REM
24583  MET19203/06/25 20:51:01 METAR KSAN 070451Z 350...
24584  MET14103/06/25 21:01:01 SPECI KSAN 070501Z 330...
24585  MET13803/06/25 21:13:01 SPECI KSAN 070513Z 350...
24586  MET16903/06/25 21:51:01 METAR KSAN 070551Z 350...
24587  MET11903/06/25 22:51:01 METAR KSAN 070651Z 330...


In [143]:
# establish a prefix pattern to extract the information before the actual beginning of the METAR data

prefix_pattern = re.compile(
    # the MET pattern and 3 digit code that we do not need
    r'^MET\d{3}'
    # local month
    r'(?P<month>\d{2})/'
    # local day
    r'(?P<day>\d{2})/'
    #local year
    r'(?P<year>\d{2})\s+'
    # local hour
    r'(?P<hour>\d{2}):'
    # local minute
    r'(?P<minute>\d{2}):'
    # local second
    r'(?P<second>\d{2})\s+'
    # rest of the actual metar
    r'(?P<metar>.*)$'
)

In [144]:
# create function that will parse the prefix data, dropping the MET and the three digit code but making a date and time column 
def parse_prefix(line):
    match = prefix_pattern.match(line)
    if not match:
        return None
    
    local_date = f"{match.group('month')}/{match.group('day')}/{match.group('year')}"
    local_time = f"{match.group('hour')}:{match.group('minute')}:{match.group('second')}"
    metar = match.group('metar')

    return {
        'local_date': local_date,
        'local_time': local_time,
        'metar': metar
    }

In [145]:
# create function that will parse the raw METAR data.
def parse_metar(metar):
    try:
        parsed = Metar.Metar(metar)

        return {
            'station_id': parsed.station_id,
            'wind_dir_degrees': parsed.wind_dir.value() if parsed.wind_dir else None,
            'wind_speed_kt': parsed.wind_speed.value() if parsed.wind_speed else None,
            'wind_gust_kt': parsed.wind_gust.value() if parsed.wind_gust else None,
            'visibility_statute_mi': parsed.vis.value() if parsed.vis else None,
            'temperature_c': parsed.temp.value() if parsed.temp else None,
            'dewpoint_c': parsed.dewpt.value() if parsed.dewpt else None,
            'altimeter_hpa': parsed.press.value() if parsed.press else None,
        }
    except Exception as e:
        return None

In [146]:
# tie all the functions together in order to break up each field of every METAR observation into their own columns

parsed_metar_data = []

# process each line of the comb_metar_df
for line in comb_metar_df['REM']:
    prefix_data = parse_prefix(line)
    if prefix_data is None:
        continue
    
    metar_data = parse_metar(prefix_data['metar'])
    if metar_data is None:
        continue

    combined_record = {**prefix_data, **metar_data}
    parsed_metar_data.append(combined_record)

# create the new df from parsed records
organized_metar_df = pd.DataFrame(parsed_metar_data)
organized_metar_df.reset_index(drop=True, inplace=True)

In [147]:
print(organized_metar_df.head())

  local_date local_time                                              metar  \
0   12/31/22   16:10:03  SPECI KSAN 010010Z 18004KT 10SM SCT009 BKN026 ...   
1   12/31/22   16:51:03  METAR KSAN 010051Z 16006KT 8SM -RA SCT008 BKN0...   
2   12/31/22   17:51:03  METAR KSAN 010151Z 15006KT 10SM FEW008 BKN021 ...   
3   12/31/22   18:16:03  SPECI KSAN 010216Z 14010KT 10SM FEW009 BKN031 ...   
4   12/31/22   18:38:03  SPECI KSAN 010238Z 14010G21KT 10SM FEW009 BKN0...   

  station_id  wind_dir_degrees  wind_speed_kt  wind_gust_kt  \
0       KSAN             180.0            4.0           NaN   
1       KSAN             160.0            6.0           NaN   
2       KSAN             150.0            6.0           NaN   
3       KSAN             140.0           10.0           NaN   
4       KSAN             140.0           10.0          21.0   

   visibility_statute_mi  temperature_c  dewpoint_c  altimeter_hpa  
0                   10.0           15.6        12.8          30.01  
1             

In [148]:
# drop the metar column
organized_metar_df.drop(columns=['metar'], inplace=True)
print(organized_metar_df.head())

  local_date local_time station_id  wind_dir_degrees  wind_speed_kt  \
0   12/31/22   16:10:03       KSAN             180.0            4.0   
1   12/31/22   16:51:03       KSAN             160.0            6.0   
2   12/31/22   17:51:03       KSAN             150.0            6.0   
3   12/31/22   18:16:03       KSAN             140.0           10.0   
4   12/31/22   18:38:03       KSAN             140.0           10.0   

   wind_gust_kt  visibility_statute_mi  temperature_c  dewpoint_c  \
0           NaN                   10.0           15.6        12.8   
1           NaN                    8.0           15.6        13.3   
2           NaN                   10.0           15.6        12.8   
3           NaN                   10.0           15.6        12.8   
4          21.0                   10.0           15.6        12.8   

   altimeter_hpa  
0          30.01  
1          30.00  
2          29.98  
3          29.97  
4          29.96  


In [149]:
# replace all NaN values in the wind_gust_kt column with 0
organized_metar_df.fillna({'wind_gust_kt':0}, inplace=True)
print(organized_metar_df.head())

  local_date local_time station_id  wind_dir_degrees  wind_speed_kt  \
0   12/31/22   16:10:03       KSAN             180.0            4.0   
1   12/31/22   16:51:03       KSAN             160.0            6.0   
2   12/31/22   17:51:03       KSAN             150.0            6.0   
3   12/31/22   18:16:03       KSAN             140.0           10.0   
4   12/31/22   18:38:03       KSAN             140.0           10.0   

   wind_gust_kt  visibility_statute_mi  temperature_c  dewpoint_c  \
0           0.0                   10.0           15.6        12.8   
1           0.0                    8.0           15.6        13.3   
2           0.0                   10.0           15.6        12.8   
3           0.0                   10.0           15.6        12.8   
4          21.0                   10.0           15.6        12.8   

   altimeter_hpa  
0          30.01  
1          30.00  
2          29.98  
3          29.97  
4          29.96  


In [150]:
# output the data frame to a csv
organized_metar_df.to_csv('clean_data/organized_metar_data.csv', index=False)