# NOAA-NCEI Data Scraping
---
Menne, Matthew J., Imke Durre, Bryant Korzeniewski, Shelley McNeill, Kristy Thomas, Xungang Yin, Steven Anthony, Ron Ray, Russell S. Vose, Byron E. Gleason, and Tamara G. Houston (2012): Global Historical Climatology Network - Daily (GHCN-Daily), Version 3. FIPS:06 PRCP. NOAA National Climatic Data Center. doi:10.7289/V5D21VHZ.

 Matthew J. Menne, Imke Durre, Russell S. Vose, Byron E. Gleason, and Tamara G. Houston, 2012: An Overview of the Global Historical Climatology Network-Daily Database. J. Atmos. Oceanic Technol., 29, 897-910. doi:10.1175/JTECH-D-11-00103.1.

https://www.ncdc.noaa.gov/cdo-web/webservices/v2#data
https://www.ncei.noaa.gov/pub/data/cdo/documentation/GHCND_documentation.pdf
https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt

In [5]:
# Dependencies
import pandas as pd
import requests
import time
from api_key import noaa_key

In [6]:
# Choose the calendar year and build one timestamp per day of that year
# for the NOAA NCEI daily-summary requests
year = 2012
dts = pd.date_range(start=f"{year}-01-01", end=f"{year}-12-31", freq="D")
dts_chk = list()  # collects the dates whose retrieval fails

In [None]:
# Setup parameters for queries
url = "https://www.ncei.noaa.gov/cdo-web/api/v2/data?"
ds_id='GHCND'
dt_id='PRCP'
loc_id='FIPS:06' # California
limit=1000
# NOTE(review): only a single page of up to `limit` rows is requested per date;
# confirm that no date returns more rows than this, otherwise stations are lost.

# Loop through every date of the year, request that day's records and reduce
# them to a two-column frame (station id, value named after the date).
for dt in dts:
    d = str(dt.date())
    q_url = f"{url}datasetid={ds_id}&datatypeid={dt_id}&locationid={loc_id}&startdate={d}&enddate={d}&units=metric&limit={limit}"
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        reply = requests.get(q_url, headers={'token': noaa_key}, timeout=60)
        # Surface HTTP errors as such instead of a later KeyError on 'results'.
        reply.raise_for_status()
        response = reply.json()
        df = pd.DataFrame(response['results'])
        # Keep characters 6..16 of the station string (drops the "GHCND:" prefix).
        df['station'] = df['station'].str.slice(6,17)
        df = df[['station','value']].rename(columns={'value':d})
        # NOTE(review): the export below is disabled, so each day's frame is
        # discarded; confirm the output directory before re-enabling it.
        # df.to_csv(f'../outputs/ncei-noaa/{year}/csv/{d}.csv',index=False)
    except Exception as err:
        # `Exception` (not a bare except) still records every request/parsing
        # failure for the retry pass, but lets Ctrl-C interrupt the loop.
        if d not in dts_chk:  # avoid duplicates when this cell is re-run
            dts_chk.append(d)
        print(f'{d}: {err!r}')
    time.sleep(1)  # pause between requests

In [None]:
# Display the list of dates collected in dts_chk, i.e. the dates for which the
# request or the parsing of its response raised an exception
dts_chk

In [None]:
# Retry every previously failed date once and write each successful result to
# csv by date.
# The loop runs over a snapshot of dts_chk and gathers failures in a separate
# list: appending to dts_chk while iterating over it would make a date that
# keeps failing be revisited forever.
# NOTE(review): this cell writes under 'outputs/prcp/' while the final export
# cell of this notebook writes under '../outputs/prcp/' - confirm which is intended.
still_failed = []
for d in list(dts_chk):
    print(d)
    q_url = f"{url}datasetid={ds_id}&datatypeid={dt_id}&locationid={loc_id}&startdate={d}&enddate={d}&units=metric&limit={limit}"
    try:
        # A timeout keeps one stalled connection from hanging the retry pass.
        reply = requests.get(q_url, headers={'token': noaa_key}, timeout=60)
        # Surface HTTP errors as such instead of a later KeyError on 'results'.
        reply.raise_for_status()
        response = reply.json()
        df = pd.DataFrame(response['results'])
        # Keep characters 6..16 of the station string (drops the "GHCND:" prefix).
        df['station'] = df['station'].str.slice(6,17)
        df = df[['station','value']].rename(columns={'value':d})
        df.to_csv(f'outputs/prcp/{year}/csv/{d}.csv',index=False)
    except Exception as err:
        # `Exception` (not a bare except) keeps the best-effort behaviour but
        # lets Ctrl-C interrupt the loop; the reason is printed for diagnosis.
        still_failed.append(d)
        print(f'{d} failed: {err!r}')
    time.sleep(1)  # pause between requests
# Only the dates that failed again remain outstanding.
dts_chk = still_failed

In [3]:
# Request one date on its own, outside any try/except, so that an error raised
# for a date that failed the retry pass is shown directly
d = '2022-01-01'
q_url = (
    f"{url}datasetid={ds_id}&datatypeid={dt_id}&locationid={loc_id}"
    f"&startdate={d}&enddate={d}&units=metric&limit={limit}"
)
response = requests.get(
    q_url,
    headers={'token': noaa_key},
).json()
df = pd.DataFrame(response['results'])
df

Unnamed: 0,date,datatype,station,attributes,value
0,2022-01-01T00:00:00,PRCP,GHCND:US1CAAL0001,",,N,0700",0.0
1,2022-01-01T00:00:00,PRCP,GHCND:US1CAAL0003,",,N,0700",0.0
2,2022-01-01T00:00:00,PRCP,GHCND:US1CAAL0004,",,N,0700",0.0
3,2022-01-01T00:00:00,PRCP,GHCND:US1CAAL0012,",,N,0700",0.0
4,2022-01-01T00:00:00,PRCP,GHCND:US1CAAL0030,",,N,0700",0.0
...,...,...,...,...,...
817,2022-01-01T00:00:00,PRCP,GHCND:USW00093230,",,W,2400",0.0
818,2022-01-01T00:00:00,PRCP,GHCND:USW00093241,",,W,2400",0.0
819,2022-01-01T00:00:00,PRCP,GHCND:USW00093242,",,W,",0.0
820,2022-01-01T00:00:00,PRCP,GHCND:USW00093245,",,R,",0.0


In [4]:
# Reduce the station column to the bare station id (characters 6..16, i.e.
# without the "GHCND:" prefix) and keep only station and value, with the value
# column named after the date
df['station'] = df['station'].str[6:17]
df = df.loc[:, ['station', 'value']].rename(columns={'value': d})
df

Unnamed: 0,station,2022-01-01
0,US1CAAL0001,0.0
1,US1CAAL0003,0.0
2,US1CAAL0004,0.0
3,US1CAAL0012,0.0
4,US1CAAL0030,0.0
...,...,...
817,USW00093230,0.0
818,USW00093241,0.0
819,USW00093242,0.0
820,USW00093245,0.0


In [6]:
# export data to csv
# The year folder is taken from the date string itself (its first four
# characters) rather than from `year`: `d` is set by hand in the single-date
# cell and may belong to a different year than the one configured for the
# bulk download, which would file the csv under the wrong year directory.
df.to_csv(f'../outputs/prcp/{d[:4]}/csv/{d}.csv',index=False)