# Scraping Data Dictionary for Water and Temperature Data
---
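This notebook scrapes the metadata published with the US Geological Survey water-usage data and with the weather and temperature data to build our data dictionary. Because the source metadata is extensive, building the data dictionary was automated. This notebook can be run before or after the other programs in the Data Cleaning folder, provided that `clean-data/combined.csv` and `clean-data/annual_temperature_2000-2020_FIPS.csv` already exist, because it reads the column names of both files.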

# Contents
- [Water Usage Data from the USGS](#Water-Usage-Data-from-the-USGS)
- [Weather and Temperature Data from Keith Spangler via Figshare](#Weather-and-Temperature-Data-from-Keith-Spangler-via-Figshare)
---

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Water Usage Data from the USGS

In [2]:
# ScienceBase-hosted page that the following cells parse for attribute
# labels and definitions.
url = 'https://www.sciencebase.gov/catalog/file/get/5af3311be4b0da30c1b245d8?f=__disk__7c%2Fd0%2Fda%2F7cd0da1f354d12c5870f0dee7446cb41ad5a013f&transform=1&allowOpen=true'
# Without a timeout, requests waits indefinitely if the server stops
# responding, which would hang the kernel on a full re-run.
res = requests.get(url, timeout=30)
# Shown as the cell output; 200 means the page was downloaded successfully.
res.status_code

200

In [3]:
# Name the parser explicitly (the same one the temperature section uses) so the
# parse tree does not depend on which parsers happen to be installed and
# BeautifulSoup does not warn about guessing one.
soup = BeautifulSoup(res.content, 'lxml')

In [4]:
# Length of the label text including its three trailing spaces; the searches
# in the following cells match on this exact string.
len('Attribute Label:   ')

19

In [5]:
# Preview every bold "Attribute Label:" marker on the page. `string=` is the
# current name of the filter that older Beautiful Soup releases called `text=`.
soup.body.find_all('b', string = 'Attribute Label:   ')

[<b><i><font color="#488AC7">Attribute Label:   </font></i></b>,
 <b><i><font color="#488AC7">Attribute Label:   </font></i></b>,
 <b><i><font color="#488AC7">Attribute Label:   </font></i></b>,
 <b><i><font color="#488AC7">Attribute Label:   </font></i></b>,
 <b><i><font color="#488AC7">Attribute Label:   </font></i></b>,
 <b><i><font color="#488AC7">Attribute Label:   </font></i></b>,
 <b><i><font color="#488AC7">Attribute Label:   </font></i></b>,
 <b><i><font color="#488AC7">Attribute Label:   </font></i></b>,
 <b><i><font color="#488AC7">Attribute Label:   </font></i></b>,
 <b><i><font color="#488AC7">Attribute Label:   </font></i></b>,
 <b><i><font color="#488AC7">Attribute Label:   </font></i></b>,
 <b><i><font color="#488AC7">Attribute Label:   </font></i></b>,
 <b><i><font color="#488AC7">Attribute Label:   </font></i></b>,
 <b><i><font color="#488AC7">Attribute Label:   </font></i></b>,
 <b><i><font color="#488AC7">Attribute Label:   </font></i></b>,
 <b><i><font color="#488A

In [6]:
# The attribute name is the node that directly follows the bold label tag.
# `string=` replaces the deprecated `text=` keyword.
soup.body.find('b', string = 'Attribute Label:   ').next_sibling #cite: chatgpt on finding the next value after a specified text value.

'STATE'

In [7]:
# Walk every <dt> element of the page and collect attribute names and their
# definitions into two parallel lists.
attribute_labels = []
attribute_definitions = []
data_dict = {}  # not referenced by any other visible cell; kept so the notebook namespace is unchanged
for dt in soup.body.find_all('dt'):
    # `string=` is the current name of the filter formerly called `text=`.
    # The label text carries three trailing spaces and must match exactly.
    label = dt.find('b', string = 'Attribute Label:   ')
    definition = dt.find('b', string = 'Attribute Definition:')
    if label is not None:
        # The attribute name is the node right after the bold label tag.
        attribute_labels.append(label.next_sibling)
    if definition is not None:
        # Reuse the tag found above instead of searching the <dt> a second
        # time; the definition text lives in the next <dd> element.
        attribute_definitions.append(definition.find_next('dd').text.strip())

# Both lists become columns of one DataFrame in the next cell, so they must
# have the same length; fail here with a clear message if the page layout
# produced a label without a definition or vice versa.
assert len(attribute_labels) == len(attribute_definitions), (
    f'found {len(attribute_labels)} attribute labels but '
    f'{len(attribute_definitions)} attribute definitions'
)

In [8]:
# Assemble the scraped labels and definitions into the water-use data dictionary.
water_use_dict_df = pd.DataFrame()
water_use_dict_df['attribute'] = attribute_labels
water_use_dict_df['definition'] = attribute_definitions

# Definitions are split on ', in ': the first piece is kept as the
# description and the second piece (missing when there is no ', in ') as the units.
definition_parts = water_use_dict_df['definition'].str.split(', in ')
water_use_dict_df['description'] = definition_parts.str[0]
water_use_dict_df['units'] = definition_parts.str[1]

water_use_dict_df['source'] = 'USGS Water Usage'

In [9]:
# Lower-case the attribute names and turn hyphens into underscores; the next
# cell compares them with the column names of clean-data/combined.csv.
water_use_dict_df['attribute'] = (
    water_use_dict_df['attribute']
    .str.lower()
    .str.replace('-', '_')
)

In [10]:
# Only the header of combined.csv is needed, so read zero data rows instead
# of loading the whole file.
combined_df_columns = list(pd.read_csv('clean-data/combined.csv', nrows=0).columns)
# Keep only the dictionary entries whose attribute is a column of combined.csv.
water_use_dict_df = water_use_dict_df.loc[water_use_dict_df['attribute'].isin(combined_df_columns)]
water_use_dict_df

Unnamed: 0,attribute,definition,description,units,source
0,state,United States postal abbreviation for one of t...,United States postal abbreviation for one of t...,,USGS Water Usage
4,fips,Concatenation of STATEFIPS and COUNTYFIPS attr...,Concatenation of STATEFIPS and COUNTYFIPS attr...,,USGS Water Usage
6,tp_totpop,"Total population of area, in thousands",Total population of area,thousands,USGS Water Usage
7,ps_gwpop,"Public Supply, population served by groundwate...","Public Supply, population served by groundwater",thousands,USGS Water Usage
8,ps_swpop,"Public Supply, population served by surface wa...","Public Supply, population served by surface water",thousands,USGS Water Usage
...,...,...,...,...,...
128,pc_power,"Thermoelectric recirculating, power generated,...","Thermoelectric recirculating, power generated",gigawatt-hours,USGS Water Usage
131,to_wgwto,"Total groundwater withdrawals, total (fresh+sa...","Total groundwater withdrawals, total (fresh+sa...",Mgal/d,USGS Water Usage
134,to_wswto,"Total surface-water withdrawals, total (fresh+...","Total surface-water withdrawals, total (fresh+...",Mgal/d,USGS Water Usage
137,to_wtotl,"Total withdrawals, total (fresh+saline), in Mg...","Total withdrawals, total (fresh+saline)",Mgal/d,USGS Water Usage


In [11]:
# Put the columns in the order used for the final data dictionary.
water_use_dict_df = water_use_dict_df.loc[
    :, ['attribute', 'description', 'definition', 'units', 'source']
]

In [12]:
# water_use_dict = dict(zip(water_use_dict_df['attribute'], water_use_dict_df['description']))

### Weather and Temperature Data from Keith Spangler via Figshare

In [13]:
# Page holding the variable table that the following cells parse.
url2 = 'https://www.nature.com/articles/s41597-022-01405-3/tables/4'
# Without a timeout, requests waits indefinitely if the server stops
# responding, which would hang the kernel on a full re-run.
res2 = requests.get(url2, timeout=30)
# Shown as the cell output; 200 means the page was downloaded successfully.
res2.status_code

200

In [14]:
# res2.text

In [15]:
# Parse the downloaded table page with the lxml parser.
# NOTE: this rebinds `soup`, which until this cell held the parsed USGS page,
# so the USGS cells above cannot be re-run afterwards without re-parsing `res`.
soup = BeautifulSoup(res2.content,'lxml')

In [16]:
#cite: 5.02 Web Scraping Lesson
# Find the table by the exact value of its class attribute. The earlier form
# passed a set literal as `attrs`, which only worked because Beautiful Soup
# treats a non-dict `attrs` as a class search; `class_=` states that intent.
table = soup.find('table', class_ = 'data last-table')
if table is None:
    # Fail with a clear message rather than an AttributeError on None below.
    raise ValueError("no <table class='data last-table'> found in the downloaded page")

## Headers
# Each header cell's text sits in a <p> element inside <thead>.
head = table.find('thead')
columns = [col.text for col in head.find_all('p')]

## Row Values
# One inner list of cell texts per <tr> of the table body.
body = table.find('tbody')
table_vals = [
    [td.text for td in tr.find_all('td')]
    for tr in body.find_all('tr')
]

temp_vals_df = pd.DataFrame(table_vals, columns = columns)

In [17]:
# Show the column names of the cleaned temperature file; reading zero data
# rows returns the same header without loading the whole CSV.
pd.read_csv('clean-data/annual_temperature_2000-2020_FIPS.csv', nrows=0).columns

Index(['Year', 'StCoFIPS', 'Tmean_C', 'TDmean_C', 'NETmean_C', 'HImean_C',
       'HXmean_C', 'WBGTmean_C', 'Flag_T', 'Flag_TD', 'Flag_NET', 'Flag_HI',
       'Flag_HX', 'Flag_WBGT'],
      dtype='object')

In [18]:
# Only the header of the temperature file is needed, so read zero data rows.
temperature_columns = pd.read_csv('clean-data/annual_temperature_2000-2020_FIPS.csv', nrows=0).columns
# Keep the table rows whose short variable name is a column of that file.
# `.copy()` makes the result an independent frame, so the later cells that
# rename and add columns do not raise SettingWithCopyWarning.
temp_vals_df = temp_vals_df.loc[temp_vals_df['Variable Name (Short)'].isin(temperature_columns)].copy()

In [19]:
# Align the scraped table's column names with the water-use dictionary and tag
# the source. A non-inplace chain returns a fresh frame, which avoids mutating
# the filtered slice from the previous cell (that pattern can raise
# SettingWithCopyWarning) and keeps the cell safe to re-run.
temp_vals_df = (
    temp_vals_df
    .rename(columns = {
        'Variable Name (Short)' : 'attribute',
        'Variable Name (Long)' : 'description',
        'Description / Format' : 'definition',
        'Units' : 'units'
    })
    .assign(source = 'Spangler, Liang, Wellenius 2022')
)

In [20]:
# Lower-case the temperature attribute names, as was done for the water-use attributes.
temp_vals_df['attribute'] = temp_vals_df['attribute'].str.lower()

In [21]:
# Stack the water-use and temperature dictionaries row-wise into one frame.
combined_dict_df = pd.concat(
    [water_use_dict_df, temp_vals_df],
)

In [22]:
# Preview the first rows of the combined data dictionary.
combined_dict_df.head()

Unnamed: 0,attribute,description,definition,units,source
0,state,United States postal abbreviation for one of t...,United States postal abbreviation for one of t...,,USGS Water Usage
4,fips,Concatenation of STATEFIPS and COUNTYFIPS attr...,Concatenation of STATEFIPS and COUNTYFIPS attr...,,USGS Water Usage
6,tp_totpop,Total population of area,"Total population of area, in thousands",thousands,USGS Water Usage
7,ps_gwpop,"Public Supply, population served by groundwater","Public Supply, population served by groundwate...",thousands,USGS Water Usage
8,ps_swpop,"Public Supply, population served by surface water","Public Supply, population served by surface wa...",thousands,USGS Water Usage


In [288]:
# Write the combined data dictionary to disk, leaving out the DataFrame index.
combined_dict_df.to_csv('clean-data/data_dict.csv', index = False)