# Asset Discovery

1. Exploring `narr_catalog.csv` and uncovering the schema behind its naming conventions.
2. Writing logic to transform the CSV into metadata about the data flow.


In [2]:
import pandas as pd

In [3]:
# Load the scraped NARR catalog listing into a DataFrame
catalog_path = '../scraped/narr_catalog.csv'
df = pd.read_csv(catalog_path)

In [4]:
# Preview the first five rows of the catalog
df.head(n=5)

Unnamed: 0,name,url,p1,p2,p3,p4
0,acpcp.1979.nc,https://psl.noaa.gov/thredds/catalog/Datasets/...,monolevel,Dailies,NARR,catalog.html?dataset=Datasets
1,acpcp.1980.nc,https://psl.noaa.gov/thredds/catalog/Datasets/...,monolevel,Dailies,NARR,catalog.html?dataset=Datasets
2,acpcp.1981.nc,https://psl.noaa.gov/thredds/catalog/Datasets/...,monolevel,Dailies,NARR,catalog.html?dataset=Datasets
3,acpcp.1982.nc,https://psl.noaa.gov/thredds/catalog/Datasets/...,monolevel,Dailies,NARR,catalog.html?dataset=Datasets
4,acpcp.1983.nc,https://psl.noaa.gov/thredds/catalog/Datasets/...,monolevel,Dailies,NARR,catalog.html?dataset=Datasets


## Grouping


In [8]:
# Confirm that every entry in "name" ends with the ".nc" extension

has_nc_suffix = df['name'].str.endswith('.nc')
has_nc_suffix.all()

np.True_

In [9]:
# Strip the trailing ".nc" extension from every "name".
#
# The pattern is an escaped, end-anchored regex and `regex=True` is passed
# explicitly, so only a literal ".nc" at the end of the string is removed.
# A bare '.nc' pattern is ambiguous: depending on the pandas version it is
# either a regex (where "." matches any character) or a literal that removes
# every occurrence, not just the suffix.

df['name'] = df['name'].str.replace(r'\.nc$', '', regex=True)
df.head()

Unnamed: 0,name,url,p1,p2,p3
0,acpcp.1979,https://psl.noaa.gov/thredds/catalog/Datasets/...,monolevel,Dailies,NARR
1,acpcp.1980,https://psl.noaa.gov/thredds/catalog/Datasets/...,monolevel,Dailies,NARR
2,acpcp.1981,https://psl.noaa.gov/thredds/catalog/Datasets/...,monolevel,Dailies,NARR
3,acpcp.1982,https://psl.noaa.gov/thredds/catalog/Datasets/...,monolevel,Dailies,NARR
4,acpcp.1983,https://psl.noaa.gov/thredds/catalog/Datasets/...,monolevel,Dailies,NARR


In [12]:
# Inspect the last five rows of the catalog
df.tail(n=5)

Unnamed: 0,name,url,p1,p2,p3
18387,tsoil.202406,https://psl.noaa.gov/thredds/catalog/Datasets/...,subsurface,NARR,catalog.html?dataset=Datasets
18388,tsoil.202407,https://psl.noaa.gov/thredds/catalog/Datasets/...,subsurface,NARR,catalog.html?dataset=Datasets
18389,tsoil.202408,https://psl.noaa.gov/thredds/catalog/Datasets/...,subsurface,NARR,catalog.html?dataset=Datasets
18390,hgt.sfc,https://psl.noaa.gov/thredds/catalog/Datasets/...,time_invariant,NARR,catalog.html?dataset=Datasets
18391,land,https://psl.noaa.gov/thredds/catalog/Datasets/...,time_invariant,NARR,catalog.html?dataset=Datasets


In [14]:
# Cut "name" at its first "." into the leading token (kept in "name") and the
# remainder (stored in "plus"); rows without a "." end up with NaN in "plus"

name_parts = df['name'].str.split('.', n=1, expand=True)
df['name'] = name_parts[0]
df['plus'] = name_parts[1]

In [15]:
# First five rows after splitting "name" into "name" and "plus"
df.head(n=5)

Unnamed: 0,name,url,p1,p2,p3,plus
0,acpcp,https://psl.noaa.gov/thredds/catalog/Datasets/...,monolevel,Dailies,NARR,1979
1,acpcp,https://psl.noaa.gov/thredds/catalog/Datasets/...,monolevel,Dailies,NARR,1980
2,acpcp,https://psl.noaa.gov/thredds/catalog/Datasets/...,monolevel,Dailies,NARR,1981
3,acpcp,https://psl.noaa.gov/thredds/catalog/Datasets/...,monolevel,Dailies,NARR,1982
4,acpcp,https://psl.noaa.gov/thredds/catalog/Datasets/...,monolevel,Dailies,NARR,1983


In [16]:
# Last five rows after splitting "name" into "name" and "plus"
df.tail(n=5)

Unnamed: 0,name,url,p1,p2,p3,plus
18387,tsoil,https://psl.noaa.gov/thredds/catalog/Datasets/...,subsurface,NARR,catalog.html?dataset=Datasets,202406
18388,tsoil,https://psl.noaa.gov/thredds/catalog/Datasets/...,subsurface,NARR,catalog.html?dataset=Datasets,202407
18389,tsoil,https://psl.noaa.gov/thredds/catalog/Datasets/...,subsurface,NARR,catalog.html?dataset=Datasets,202408
18390,hgt,https://psl.noaa.gov/thredds/catalog/Datasets/...,time_invariant,NARR,catalog.html?dataset=Datasets,sfc
18391,land,https://psl.noaa.gov/thredds/catalog/Datasets/...,time_invariant,NARR,catalog.html?dataset=Datasets,


In [24]:
# Distinct string lengths that occur in "plus" (a set, not per-length counts).
# Rows with no "plus" value are NaN and are dropped before converting to int.

lengths = {int(n) for n in df['plus'].str.len().dropna()}
# discard() rather than remove(): an empty-string "plus" would yield length 0,
# but the cell must not raise KeyError when no such row exists.
lengths.discard(0)
lengths

{3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14}

In [26]:
# Print up to five example "plus" values for each distinct length.

# The per-row length does not change between iterations, so compute it once.
plus_len = df['plus'].str.len()

# Iterate in ascending length order so the printed groups have a stable order.
for i in sorted(lengths):
    subset = df.loc[plus_len == i, 'plus']
    if len(subset) >= 5:
        # A fixed seed keeps the displayed sample identical across re-runs.
        print(subset.sample(5, random_state=0))
    else:
        # Fewer than five rows: show all of them.
        print(subset)

18390    sfc
Name: plus, dtype: object
11070    1979
3359     1980
10347    1992
1537     1998
1955     2002
Name: plus, dtype: object
7068     199811
12966    198309
5805     198412
13900    201511
16889    199012
Name: plus, dtype: object
9233    mon.ltm
9421    2m.1986
9425    2m.1990
845     2m.1996
9195    mon.ltm
Name: plus, dtype: object
11936    hl1.2017
11006    sfc.2007
10766    hl1.1997
9500     sfc.2019
1572     hl1.1987
Name: plus, dtype: object
12098    ntat.1995
11979    ntat.2014
12102    ntat.1999
2714     ntat.1979
11947    ntat.1982
Name: plus, dtype: object
12487    tropo.2016
12490    tropo.2019
10453    tropo.2006
11178    tropo.1995
3126     tropo.2023
Name: plus, dtype: object
9260    10m.mon.ltm
9297    2m.mon.mean
9247    10m.mon.ltm
9253    10m.mon.ltm
9280    2m.mon.mean
Name: plus, dtype: object
9314    sfc.mon.mean
9345    10m.mon.mean
9313    hl1.mon.mean
9339    sfc.mon.mean
9352    10m.mon.mean
Name: plus, dtype: object
9335    ntat.mon.mean
9254    tro

## META information: A tree


In [5]:
# Distinct values found in the "p4" column
df['p4'].unique()

array(['catalog.html?dataset=Datasets', nan], dtype=object)

In [6]:
# Drop "p4": its only values are a constant catalog suffix or NaN.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run; an in-place drop raises KeyError the second time
# because the column is already gone.
df = df.drop(columns=['p4'], errors='ignore')

In [None]:
# A tree of the data, p3 being root

## Understanding `name`
