# Build Metadata.csv

## Columns:

* StationID: str
* lat: float
* lon: float
* availability: list of years: String or Int
* ....

# Task 1: get a list of available files:

In [26]:
import requests
from bs4 import BeautifulSoup
import re

In [32]:
# Directory listing of the NDBC historical standard-meteorological archive.
url = "https://www.ndbc.noaa.gov/data/historical/stdmet/"

# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)

# Fail loudly instead of only printing: the following cells read `body`, and
# continuing without it would either raise a confusing NameError or silently
# reuse a stale `body` left over from an earlier run of this cell.
if response.status_code != 200:
    raise RuntimeError(
        f'Error fetching website: HTTP {response.status_code} for {url}'
    )

# Parse the listing page; `body` is the <body> element whose text is scanned
# for file names afterwards.
soup = BeautifulSoup(response.text, 'html.parser')
body = soup.body

In [40]:
# In the page text, each file name is preceded by a non-breaking space and a
# newline, e.g. "\xa0\n0y2w3h2012.txt.gz"; the capture group keeps only the
# file name itself.
pattern = r'\xa0\n(.*?\.txt\.gz)'
listing_text = body.text
file_names = re.compile(pattern).findall(listing_text)

In [41]:
file_names

['0y2w3h2012.txt.gz',
 '0y2w3h2013.txt.gz',
 '0y2w3h2014.txt.gz',
 '0y2w3h2015.txt.gz',
 '4cONFh2020.txt.gz',
 '4h361h2017.txt.gz',
 '4h361h2020.txt.gz',
 '4h361h2021.txt.gz',
 '4h363h2020.txt.gz',
 '4h363h2021.txt.gz',
 '4h364h2017.txt.gz',
 '4h365h2020.txt.gz',
 '4h365h2021.txt.gz',
 '4h390h2017.txt.gz',
 '4h390h2020.txt.gz',
 '4h390h2021.txt.gz',
 '4h394h2017.txt.gz',
 '4h394h2020.txt.gz',
 '4h394h2021.txt.gz',
 '18ci3h2014.txt.gz',
 '18ci3h2015.txt.gz',
 '20cm4h2014.txt.gz',
 '20cm4h2015.txt.gz',
 '32st1h2009.txt.gz',
 '32st1h2010.txt.gz',
 '32st1h2022.txt.gz',
 '32st2h2007.txt.gz',
 '32st2h2008.txt.gz',
 '32st2h2009.txt.gz',
 '32st2h2010.txt.gz',
 '32st2h2011.txt.gz',
 '32st2h2012.txt.gz',
 '32st2h2013.txt.gz',
 '32st2h2014.txt.gz',
 '32st2h2015.txt.gz',
 '32st2h2016.txt.gz',
 '32st2h2017.txt.gz',
 '32st2h2018.txt.gz',
 '32st2h2019.txt.gz',
 '32st2h2020.txt.gz',
 '32st2h2021.txt.gz',
 '41nt1h2007.txt.gz',
 '41nt1h2008.txt.gz',
 '41nt1h2009.txt.gz',
 '41nt1h2010.txt.gz',
 '41nt1h20

In [42]:
len(file_names)

14941

In [49]:
# Map each station ID to the set of years for which an archive file exists.
existing_files = {}

# File names have the form "<station>h<year>.txt.gz". The pattern does not
# change between iterations, so it is defined once, outside the loop.
pattern = r'(.+?)h(\d{4})\.txt\.gz'

for filename in file_names:

    match = re.match(pattern, filename)
    if match is None:
        # A name without the "h<4-digit year>" suffix cannot be attributed to
        # a station/year pair; skip it rather than crash on `match.group`.
        continue

    station = match.group(1)
    year = int(match.group(2))

    # setdefault creates the station's year set on first sight, then adds to it.
    existing_files.setdefault(station, set()).add(year)

In [50]:
existing_files

{'0y2w3': {2012, 2013, 2014, 2015},
 '4cONF': {2020},
 '4h361': {2017, 2020, 2021},
 '4h363': {2020, 2021},
 '4h364': {2017},
 '4h365': {2020, 2021},
 '4h390': {2017, 2020, 2021},
 '4h394': {2017, 2020, 2021},
 '18ci3': {2014, 2015},
 '20cm4': {2014, 2015},
 '32st1': {2009, 2010, 2022},
 '32st2': {2007,
  2008,
  2009,
  2010,
  2011,
  2012,
  2013,
  2014,
  2015,
  2016,
  2017,
  2018,
  2019,
  2020,
  2021},
 '41nt1': {2007,
  2008,
  2009,
  2010,
  2012,
  2013,
  2014,
  2015,
  2016,
  2017,
  2018,
  2019,
  2020,
  2021,
  2022},
 '41nt2': {2008, 2010},
 '42a02': {2000},
 '42a03': {2000},
 '42otp': {2000},
 '43WSL': {2019, 2020},
 '45T01': {2018},
 '46a35': {2001},
 '46a54': {1996},
 '51wh1': {2008,
  2009,
  2010,
  2011,
  2012,
  2013,
  2014,
  2015,
  2016,
  2017,
  2018,
  2019,
  2020,
  2021,
  2022},
 '51wh2': {2007, 2008, 2009, 2010, 2011, 2012},
 '53ANF': {2020},
 '53MKF': {2020},
 '23020': {2008, 2009, 2010},
 '31201': {2005},
 '32012': {2007,
  2008,
  2009,
 

`existing_files` is a dictionary keyed by station ID. The value for each station is the set of years for which a file exists.