# NOAA-NCEI Data Scrubbing
---


## 0 Initialize

In [1]:
# Dependencies
import pandas as pd

In [11]:
# Set up the target year, the daily dates covering that year, and an empty
# dataframe with a single 'station' column
year = 2012
first_day = str(year) + "-01-01"
last_day = str(year) + "-12-31"
dts = pd.date_range(start=first_day, end=last_day, freq="D")
df = pd.DataFrame(columns=['station'])

In [12]:
# Retrieve the daily precipitation csv for every date in dts and merge them
# into one dataframe keyed on 'station' (outer merge, so a station that is
# absent from a given day's file is kept and gets NaN for that day).
# The merge starts from a fresh, empty station frame built in this cell so the
# cell is idempotent: re-running it does not merge the same date columns a
# second time (which would produce duplicated *_x / *_y columns).
merged = pd.DataFrame(columns=['station'])
for dt in dts:
    d = str(dt.date())  # file names use the YYYY-MM-DD form of the date
    dt_df = pd.read_csv(f'../outputs/noaa-ncei/{year}/csv/{d}.csv')
    merged = merged.merge(dt_df, on='station', how='outer')
df = merged
df.tail(1)

Unnamed: 0,station,2012-01-01,2012-01-02,2012-01-03,2012-01-04,2012-01-05,2012-01-06,2012-01-07,2012-01-08,2012-01-09,...,2012-12-22,2012-12-23,2012-12-24,2012-12-25,2012-12-26,2012-12-27,2012-12-28,2012-12-29,2012-12-30,2012-12-31
1011,USW00094299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.6,8.4,0.0,0.0,2.3,0.3,0.5,0.0,0.3,0.0


In [5]:
# Compare stations from the precipitation data against ghcnd-stations.csv and
# count the stations that have no matching entry (0 means none are missing)
ghcnd_df = pd.read_csv('../outputs/noaa-ncei/ghcnd-stations.csv')
# Build the lookup once: a set gives constant-time membership tests instead of
# rebuilding and scanning a full list for every station
known_stations = set(ghcnd_df['station'])
r = [x for x in df['station'] if x not in known_stations]
len(r)

0

In [6]:
# Keep only the ghcnd_df rows whose station appears in the precipitation
# data, renumbering the index from zero, then preview the last row
in_prcp = ghcnd_df['station'].isin(df['station'].to_list())
stn_df = ghcnd_df.loc[in_prcp].reset_index(drop=True)
stn_df.tail(1)

Unnamed: 0,station,lat,lon,elev,name
1011,USW00094299,41.4836,-120.5614,1335.9,CA ALTURAS MUNI AP


In [13]:
# Add the station information columns from stn_df to the precipitation
# dataframe, matching rows on 'station'.
# Any station-info columns left over from a previous run of this cell are
# dropped first, so re-running it does not produce duplicated *_x / *_y
# columns; on a first run nothing is dropped.
info_cols = stn_df.columns.difference(['station'])
df = df.drop(columns=info_cols, errors='ignore').merge(stn_df, on='station')

In [14]:
# Fill missing daily precipitation values with zeros and preview.
# Only the date-named (YYYY-MM-DD) precipitation columns are filled: a blanket
# fillna(0.0) would also overwrite missing station metadata (lat, lon, elev,
# name) with 0.0.
# NOTE(review): filling with 0.0 treats "no observation" the same as
# "no precipitation" - confirm this is intended.
prcp_cols = df.columns.intersection(dts.strftime('%Y-%m-%d'))
df = df.fillna({col: 0.0 for col in prcp_cols})
df.tail(1)

Unnamed: 0,station,2012-01-01,2012-01-02,2012-01-03,2012-01-04,2012-01-05,2012-01-06,2012-01-07,2012-01-08,2012-01-09,...,2012-12-26,2012-12-27,2012-12-28,2012-12-29,2012-12-30,2012-12-31,lat,lon,elev,name
1011,USW00094299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.3,0.3,0.5,0.0,0.3,0.0,41.4836,-120.5614,1335.9,CA ALTURAS MUNI AP


In [95]:
# Write the station info dataframe (stn_df) for the selected year to csv,
# leaving out the dataframe index
coords_csv = f'../outputs/noaa-ncei/CA_{year}_coords.csv'
stn_df.to_csv(coords_csv, index=False)

In [98]:
# Write the combined dataframe (df) for the selected year to csv, leaving out
# the dataframe index
prcp_csv = f'../outputs/noaa-ncei/CA_{year}_prcp_data.csv'
df.to_csv(prcp_csv, index=False)