In [1]:
from bs4 import BeautifulSoup
import requests
import json
import os
import time

from csv import writer
from tqdm import tqdm, trange
from string import digits
from os.path import exists

## National Park

Source: https://www.nps.gov/index.htm

ATTENTION: DO NOT RUN IF YOU WANT TO USE THIS DATASET!

In [23]:
class NationalSite:
    def __init__(self, name, state, title, description):
        # Init instance of national site
        self.name = name
        self.state = state
        self.title = title
        self.description = description
    
    def info(self):
        # Return a string representation of itself
        rep = '"' + '","'.join([self.name, self.state, self.title, self.description]) + '"\n'    
        return rep
    
    def write_csv(self):
        try:
            with open("documents.csv", "x") as file:
                file.write("Name,State,Title,Description\n")
        except:
            pass
        with open("documents.csv", "a", encoding='utf-8') as file:
            file.write(self.info())

In [69]:
def build_state_url_dict():
    response = requests.get("https://www.nps.gov").text
    soup = BeautifulSoup(response, 'html.parser')

    selector = 'ul[role="menu"] > li > a'
    state_urls = soup.select(selector)
    state_dict = {
        state_url.string.lower(): 'https://www.nps.gov' + \
            state_url["href"] for state_url in state_urls
    }
    return state_dict

In [70]:
def get_sites_urls_for_state(state_url):
    response = requests.get(state_url).text
    soup = BeautifulSoup(response, 'html.parser')

    selector = "h3 > a"
    site_hrefs = soup.select(selector)
    site_urls = ["https://www.nps.gov" + site_href["href"] for site_href in site_hrefs]

    return site_urls

In [71]:
def get_site_instance(state, site_url):
    response = requests.get(site_url).text
    soup = BeautifulSoup(response, 'html.parser')

    name = soup.find_all("a", {"class": "Hero-title"})[0].get_text()
    title = soup.find_all("div", {"class": "Component text-content-size text-content-style"})[0].find_all("h1")[0].get_text()
    description = soup.find_all("div", {"class": "Component text-content-size text-content-style"})[0].find_all("p")[0].get_text()
    
    nationalsite = NationalSite(name.replace('"', ""), 
                                state.replace('"', ""), 
                                title.replace('"', ""), 
                                description.replace('"', ""))
    
    return nationalsite

In [72]:
# save to csv
state_dict = build_state_url_dict()
for state, state_url in tqdm(state_dict.items()):
    site_urls = get_sites_urls_for_state(state_url)
    for site_url in site_urls:
        site = get_site_instance(state, site_url)
        site.write_csv()

100%|██████████████████████████████████████████████████████████████████████████████████| 56/56 [01:18<00:00,  1.39s/it]


## Tourist attractions in US cities

Source: https://www.city-data.com/articles/


In [2]:
class TouristSite:
    def __init__(self, name, state, title, description):
        # Init instance of national site
        self.name = name
        self.state = state
        self.title = title
        self.description = description
    
    def info(self):
        # Return a string representation of itself
        rep = '"' + '","'.join([self.name, self.state, self.title, self.description]) + '"\n'    
        return rep
            
    def write_single_csv(self):
        filename = './data/' + self.state + '_documents.csv'
        if not exists(filename):
            with open(filename, "x") as file:
                file.write("Name,State,Title,Description\n")
        with open(filename, "a", encoding='utf-8') as file:
            file.write(self.info())
            
    def write_whole_csv(self):
        filename = 'documents.csv'
        if not exists(filename):
            with open(filename, "x") as file:
                file.write("Name,State,Title,Description\n")
        with open(filename, "a", encoding='utf-8') as file:
            file.write(self.info())

In [3]:
def get_state_url_dict(source = "https://www.city-data.com/articles/"):
    # get a dict of state page name and its urls to state page
    response = requests.get(source).text
    soup = BeautifulSoup(response, 'html.parser')
    
    selector = 'div[id="tabs_by_category"] > ul[class="tab-list tab-list-short"] > li > a'
    state_urls = soup.select(selector)
    state_dict = {
        state_url['href'].split('/')[-1].split('.')[0]: \
            'https:' + state_url['href'] \
            for state_url in state_urls
    }
    return state_dict # key: state_page_name, value: url

In [4]:
def cache_sites_for_state_page(state_page_name, state_url):
    # cache the sites in a state page and store it into json file, like Florida6.json
    response = requests.get(state_url).text
    soup = BeautifulSoup(response, 'html.parser')
    
    # read the site pages
    selector = 'div[id="listing"] > div[class="col-md-4"] > a'
    site_urls = soup.select(selector)
    print(f'-> Reading site pages from state page <{state_page_name}>...')
    site_dict = {}
    for site_url in tqdm(site_urls):
        key = 'https://www.city-data.com' + site_url['href']
        value = requests.get('https://www.city-data.com' + site_url['href']).text
        site_dict[key] = value
        time.sleep(1) # add a gap between two request to prevent failure

    # write to cache file
    print(f'-> Caching state page <{state_page_name}>...')
    filename = './cache/' + state_page_name + '.json'
    with open(filename, 'w') as json_file:
        json.dump(site_dict, json_file, indent=4)
    print('Done.')
    
    return site_urls # a list of site urls in state page
    
def get_cache_sites_for_state_page(state_page_name):
    # get a list of site urls given the state page
    filename = './cache/' + state_page_name + '.json'
    try:
        with open(filename) as json_file:
            site_dict = json.load(json_file)
    except:
        return None
    
    site_list = list(site_dict.keys())
    return site_list

In [5]:
def get_cache_site_page(state_page_name, site_url):
    # helper function: get response of a site page, return None if not exist
    filename = './cache/' + state_page_name + '.json'
    try:
        with open(filename) as json_file:
            site_dict = json.load(json_file)
            response = site_dict[site_url]
    except:
        return None
    
    return response

def get_site_instance(state_page_name, site_url):
    # get the site instance for a site page
    response = get_cache_site_page(state_page_name, site_url)
    soup = BeautifulSoup(response, 'html.parser')

    state = state_page_name.translate(str.maketrans('', '', digits))
    title = soup.select('h1[class="city"] > span')[0].contents[0]
    paragraphs = [p.contents[0] for p in soup.select('div[class="well"] > p')]
    description = ' '.join(paragraphs)
    if title.count(' - ') == 2:
        name = title.split(' - ')[0]
    elif title.count(', ') == 2:
        name = title.split(', ')[0]
    else:
        name = title
    
    touristsite = TouristSite(name.replace('"', ""),
                              state.replace('"', ""), 
                              title.replace('"', ""), 
                              description.replace('"', ""))
    
    return touristsite

In [6]:
# --- Main --- #
state_dict = get_state_url_dict()

# cache all sites
# RUN THIS ONCE ONLY!!!
print("--- START CACHING FROM SOURCE WEBSITE ---")
for state_page_name, state_url in tqdm(state_dict.items()):
    filename = './cache/' + state_page_name + '.json'
    if not state_page_name[-1].isdigit() and not exists(filename):
        cache_sites_for_state_page(state_page_name, state_url)
print("---> CACHING DONE!")

  0%|                                                                                           | 0/77 [00:00<?, ?it/s]

--- START CACHING FROM SOURCE WEBSITE ---



  0%|                                                                                          | 0/300 [00:00<?, ?it/s][A

-> Reading site pages from state page <California3>...



  0%|▎                                                                                 | 1/300 [00:01<06:15,  1.25s/it][A
  1%|▌                                                                                 | 2/300 [00:02<06:12,  1.25s/it][A
  1%|▊                                                                                 | 3/300 [00:03<06:11,  1.25s/it][A
  1%|█                                                                                 | 4/300 [00:04<06:09,  1.25s/it][A
  2%|█▎                                                                                | 5/300 [00:06<06:09,  1.25s/it][A
  2%|█▋                                                                                | 6/300 [00:07<06:07,  1.25s/it][A
  2%|█▉                                                                                | 7/300 [00:08<06:05,  1.25s/it][A
  3%|██▏                                                                               | 8/300 [00:10<06:05,  1.25s/it][A
  3%|██▍       

 22%|██████████████████                                                               | 67/300 [01:24<04:53,  1.26s/it][A
 23%|██████████████████▎                                                              | 68/300 [01:25<04:50,  1.25s/it][A
 23%|██████████████████▋                                                              | 69/300 [01:26<04:49,  1.25s/it][A
 23%|██████████████████▉                                                              | 70/300 [01:27<04:48,  1.26s/it][A
 24%|███████████████████▏                                                             | 71/300 [01:28<04:46,  1.25s/it][A
 24%|███████████████████▍                                                             | 72/300 [01:30<04:45,  1.25s/it][A
 24%|███████████████████▋                                                             | 73/300 [01:31<04:44,  1.26s/it][A
 25%|███████████████████▉                                                             | 74/300 [01:32<04:43,  1.25s/it][A
 25%|███████████

 44%|███████████████████████████████████▍                                            | 133/300 [02:46<03:28,  1.25s/it][A
 45%|███████████████████████████████████▋                                            | 134/300 [02:47<03:28,  1.25s/it][A
 45%|████████████████████████████████████                                            | 135/300 [02:48<03:26,  1.25s/it][A
 45%|████████████████████████████████████▎                                           | 136/300 [02:50<03:26,  1.26s/it][A
 46%|████████████████████████████████████▌                                           | 137/300 [02:51<03:24,  1.26s/it][A
 46%|████████████████████████████████████▊                                           | 138/300 [02:52<03:22,  1.25s/it][A
 46%|█████████████████████████████████████                                           | 139/300 [02:53<03:21,  1.25s/it][A
 47%|█████████████████████████████████████▎                                          | 140/300 [02:55<03:20,  1.25s/it][A
 47%|███████████

 66%|█████████████████████████████████████████████████████                           | 199/300 [04:08<02:05,  1.24s/it][A
 67%|█████████████████████████████████████████████████████▎                          | 200/300 [04:09<02:04,  1.24s/it][A
 67%|█████████████████████████████████████████████████████▌                          | 201/300 [04:11<02:03,  1.25s/it][A
 67%|█████████████████████████████████████████████████████▊                          | 202/300 [04:12<02:02,  1.25s/it][A
 68%|██████████████████████████████████████████████████████▏                         | 203/300 [04:13<02:01,  1.25s/it][A
 68%|██████████████████████████████████████████████████████▍                         | 204/300 [04:14<01:59,  1.25s/it][A
 68%|██████████████████████████████████████████████████████▋                         | 205/300 [04:16<01:59,  1.26s/it][A
 69%|██████████████████████████████████████████████████████▉                         | 206/300 [04:17<01:58,  1.27s/it][A
 69%|███████████

 88%|██████████████████████████████████████████████████████████████████████▋         | 265/300 [05:31<00:43,  1.25s/it][A
 89%|██████████████████████████████████████████████████████████████████████▉         | 266/300 [05:32<00:42,  1.25s/it][A
 89%|███████████████████████████████████████████████████████████████████████▏        | 267/300 [05:33<00:40,  1.24s/it][A
 89%|███████████████████████████████████████████████████████████████████████▍        | 268/300 [05:35<00:39,  1.25s/it][A
 90%|███████████████████████████████████████████████████████████████████████▋        | 269/300 [05:36<00:38,  1.25s/it][A
 90%|████████████████████████████████████████████████████████████████████████        | 270/300 [05:37<00:37,  1.25s/it][A
 90%|████████████████████████████████████████████████████████████████████████▎       | 271/300 [05:38<00:36,  1.25s/it][A
 91%|████████████████████████████████████████████████████████████████████████▌       | 272/300 [05:40<00:34,  1.25s/it][A
 91%|███████████

-> Caching state page <California3>...


 12%|█████████▋                                                                         | 9/77 [06:15<47:19, 41.76s/it]

Done.



  0%|                                                                                          | 0/300 [00:00<?, ?it/s][A

-> Reading site pages from state page <California4>...



  0%|▎                                                                                 | 1/300 [00:01<06:13,  1.25s/it][A
  1%|▌                                                                                 | 2/300 [00:02<06:12,  1.25s/it][A
  1%|▊                                                                                 | 3/300 [00:03<06:11,  1.25s/it][A
  1%|█                                                                                 | 4/300 [00:04<06:09,  1.25s/it][A
  2%|█▎                                                                                | 5/300 [00:06<06:07,  1.24s/it][A
  2%|█▋                                                                                | 6/300 [00:07<06:04,  1.24s/it][A
  2%|█▉                                                                                | 7/300 [00:08<06:04,  1.24s/it][A
  3%|██▏                                                                               | 8/300 [00:09<06:02,  1.24s/it][A
  3%|██▍       

 22%|██████████████████                                                               | 67/300 [01:23<04:52,  1.26s/it][A
 23%|██████████████████▎                                                              | 68/300 [01:25<04:50,  1.25s/it][A
 23%|██████████████████▋                                                              | 69/300 [01:26<04:49,  1.25s/it][A
 23%|██████████████████▉                                                              | 70/300 [01:27<04:46,  1.25s/it][A
 24%|███████████████████▏                                                             | 71/300 [01:28<04:44,  1.24s/it][A
 24%|███████████████████▍                                                             | 72/300 [01:29<04:43,  1.24s/it][A
 24%|███████████████████▋                                                             | 73/300 [01:31<04:41,  1.24s/it][A
 25%|███████████████████▉                                                             | 74/300 [01:32<04:41,  1.25s/it][A
 25%|███████████

 44%|███████████████████████████████████▍                                            | 133/300 [02:46<03:28,  1.25s/it][A
 45%|███████████████████████████████████▋                                            | 134/300 [02:47<03:27,  1.25s/it][A
 45%|████████████████████████████████████                                            | 135/300 [02:48<03:26,  1.25s/it][A
 45%|████████████████████████████████████▎                                           | 136/300 [02:49<03:25,  1.25s/it][A
 46%|████████████████████████████████████▌                                           | 137/300 [02:51<03:25,  1.26s/it][A
 46%|████████████████████████████████████▊                                           | 138/300 [02:52<03:24,  1.26s/it][A
 46%|█████████████████████████████████████                                           | 139/300 [02:53<03:22,  1.26s/it][A
 47%|█████████████████████████████████████▎                                          | 140/300 [02:55<03:20,  1.25s/it][A
 47%|███████████

ConnectionError: HTTPSConnectionPool(host='www.city-data.com', port=443): Max retries exceeded with url: /articles/Hollywood-Hills.html (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000226A345F820>: Failed to establish a new connection: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应，连接尝试失败。'))

In [7]:
# write to the csv file
print("--- START WRITING FILES TO DOC DATA ---")
for state_page_name, state_url in tqdm(state_dict.items()):
    try:
        site_list = get_cache_sites_for_state_page(state_page_name)
        print(f"Writing sites from <{state_page_name}> to csv file...")
        filename = './data/' + state_page_name + '_documents.csv'
        if not exists(filename):
            for site_url in tqdm(site_list):
                site = get_site_instance(state_page_name, site_url)
                site.write_single_csv()
                site.write_whole_csv()
    except:
        pass

  0%|                                                                                           | 0/77 [00:00<?, ?it/s]

--- START WRITING FILES TO DOC DATA ---


  1%|█                                                                                  | 1/77 [00:00<00:33,  2.24it/s]

Writing sites from <Alaska> to csv file...


  3%|██▏                                                                                | 2/77 [00:00<00:29,  2.55it/s]

Writing sites from <Alabama> to csv file...


  4%|███▏                                                                               | 3/77 [00:01<00:28,  2.60it/s]

Writing sites from <Arkansas> to csv file...


  5%|████▎                                                                              | 4/77 [00:01<00:32,  2.23it/s]

Writing sites from <Arizona> to csv file...



  0%|                                                                                          | 0/300 [00:00<?, ?it/s][A

Writing sites from <Arizona2> to csv file...



  0%|▎                                                                                 | 1/300 [00:00<03:12,  1.56it/s][A
  1%|▌                                                                                 | 2/300 [00:01<03:08,  1.58it/s][A
  1%|▊                                                                                 | 3/300 [00:01<03:04,  1.61it/s][A
  1%|█                                                                                 | 4/300 [00:02<03:04,  1.60it/s][A
  2%|█▎                                                                                | 5/300 [00:03<03:03,  1.61it/s][A
  2%|█▋                                                                                | 6/300 [00:03<03:02,  1.61it/s][A
  2%|█▉                                                                                | 7/300 [00:04<03:02,  1.60it/s][A
  3%|██▏                                                                               | 8/300 [00:04<02:59,  1.63it/s][A
  3%|██▍       

 22%|██████████████████                                                               | 67/300 [00:40<02:20,  1.66it/s][A
 23%|██████████████████▎                                                              | 68/300 [00:41<02:19,  1.66it/s][A
 23%|██████████████████▋                                                              | 69/300 [00:42<02:19,  1.66it/s][A
 23%|██████████████████▉                                                              | 70/300 [00:42<02:17,  1.67it/s][A
 24%|███████████████████▏                                                             | 71/300 [00:43<02:17,  1.67it/s][A
 24%|███████████████████▍                                                             | 72/300 [00:43<02:17,  1.66it/s][A
 24%|███████████████████▋                                                             | 73/300 [00:44<02:16,  1.66it/s][A
 25%|███████████████████▉                                                             | 74/300 [00:45<02:15,  1.67it/s][A
 25%|███████████

 44%|███████████████████████████████████▍                                            | 133/300 [01:21<01:41,  1.65it/s][A
 45%|███████████████████████████████████▋                                            | 134/300 [01:21<01:40,  1.65it/s][A
 45%|████████████████████████████████████                                            | 135/300 [01:22<01:39,  1.66it/s][A
 45%|████████████████████████████████████▎                                           | 136/300 [01:22<01:38,  1.67it/s][A
 46%|████████████████████████████████████▌                                           | 137/300 [01:23<01:36,  1.68it/s][A
 46%|████████████████████████████████████▊                                           | 138/300 [01:24<01:36,  1.68it/s][A
 46%|█████████████████████████████████████                                           | 139/300 [01:24<01:36,  1.67it/s][A
 47%|█████████████████████████████████████▎                                          | 140/300 [01:25<01:37,  1.64it/s][A
 47%|███████████

 66%|█████████████████████████████████████████████████████                           | 199/300 [02:01<01:01,  1.63it/s][A
 67%|█████████████████████████████████████████████████████▎                          | 200/300 [02:01<01:00,  1.65it/s][A
 67%|█████████████████████████████████████████████████████▌                          | 201/300 [02:02<00:59,  1.66it/s][A
 67%|█████████████████████████████████████████████████████▊                          | 202/300 [02:02<00:58,  1.68it/s][A
 68%|██████████████████████████████████████████████████████▏                         | 203/300 [02:03<00:58,  1.66it/s][A
 68%|██████████████████████████████████████████████████████▍                         | 204/300 [02:03<00:57,  1.67it/s][A
 68%|██████████████████████████████████████████████████████▋                         | 205/300 [02:04<00:56,  1.68it/s][A
 69%|██████████████████████████████████████████████████████▉                         | 206/300 [02:05<00:55,  1.69it/s][A
 69%|███████████

 88%|██████████████████████████████████████████████████████████████████████▋         | 265/300 [02:40<00:20,  1.68it/s][A
 89%|██████████████████████████████████████████████████████████████████████▉         | 266/300 [02:41<00:20,  1.66it/s][A
 89%|███████████████████████████████████████████████████████████████████████▏        | 267/300 [02:41<00:19,  1.67it/s][A
 89%|███████████████████████████████████████████████████████████████████████▍        | 268/300 [02:42<00:19,  1.66it/s][A
 90%|███████████████████████████████████████████████████████████████████████▋        | 269/300 [02:43<00:18,  1.67it/s][A
 90%|████████████████████████████████████████████████████████████████████████        | 270/300 [02:43<00:17,  1.67it/s][A
 90%|████████████████████████████████████████████████████████████████████████▎       | 271/300 [02:44<00:17,  1.69it/s][A
 91%|████████████████████████████████████████████████████████████████████████▌       | 272/300 [02:44<00:17,  1.64it/s][A
 91%|███████████

Writing sites from <Arizona3> to csv file...



  1%|▌                                                                                 | 1/156 [00:00<00:53,  2.90it/s][A
  1%|█                                                                                 | 2/156 [00:00<00:52,  2.94it/s][A
  2%|█▌                                                                                | 3/156 [00:01<00:51,  2.94it/s][A
  3%|██                                                                                | 4/156 [00:01<00:50,  2.99it/s][A
  3%|██▋                                                                               | 5/156 [00:01<00:50,  3.02it/s][A
  4%|███▏                                                                              | 6/156 [00:01<00:49,  3.06it/s][A
  4%|███▋                                                                              | 7/156 [00:02<00:49,  3.03it/s][A
  5%|████▏                                                                             | 8/156 [00:02<00:49,  3.00it/s][A
  6%|████▋     

 43%|██████████████████████████████████▊                                              | 67/156 [00:24<00:31,  2.82it/s][A
 44%|███████████████████████████████████▎                                             | 68/156 [00:24<00:30,  2.86it/s][A
 44%|███████████████████████████████████▊                                             | 69/156 [00:24<00:30,  2.86it/s][A
 45%|████████████████████████████████████▎                                            | 70/156 [00:25<00:30,  2.85it/s][A
 46%|████████████████████████████████████▊                                            | 71/156 [00:25<00:30,  2.80it/s][A
 46%|█████████████████████████████████████▍                                           | 72/156 [00:25<00:29,  2.85it/s][A
 47%|█████████████████████████████████████▉                                           | 73/156 [00:26<00:28,  2.88it/s][A
 47%|██████████████████████████████████████▍                                          | 74/156 [00:26<00:28,  2.92it/s][A
 48%|███████████

Writing sites from <California> to csv file...



  0%|▎                                                                                 | 1/300 [00:00<03:08,  1.58it/s][A
  1%|▌                                                                                 | 2/300 [00:01<03:04,  1.61it/s][A
  1%|▊                                                                                 | 3/300 [00:01<03:02,  1.63it/s][A
  1%|█                                                                                 | 4/300 [00:02<03:00,  1.64it/s][A
  2%|█▎                                                                                | 5/300 [00:03<03:00,  1.63it/s][A
  2%|█▋                                                                                | 6/300 [00:03<02:59,  1.64it/s][A
  2%|█▉                                                                                | 7/300 [00:04<02:57,  1.65it/s][A
  3%|██▏                                                                               | 8/300 [00:04<02:59,  1.62it/s][A
  3%|██▍       

 22%|██████████████████                                                               | 67/300 [00:40<02:16,  1.70it/s][A
 23%|██████████████████▎                                                              | 68/300 [00:40<02:15,  1.71it/s][A
 23%|██████████████████▋                                                              | 69/300 [00:41<02:15,  1.71it/s][A
 23%|██████████████████▉                                                              | 70/300 [00:42<02:15,  1.70it/s][A
 24%|███████████████████▏                                                             | 71/300 [00:42<02:15,  1.69it/s][A
 24%|███████████████████▍                                                             | 72/300 [00:43<02:14,  1.70it/s][A
 24%|███████████████████▋                                                             | 73/300 [00:43<02:16,  1.66it/s][A
 25%|███████████████████▉                                                             | 74/300 [00:44<02:14,  1.68it/s][A
 25%|███████████

 44%|███████████████████████████████████▍                                            | 133/300 [01:19<01:41,  1.65it/s][A
 45%|███████████████████████████████████▋                                            | 134/300 [01:19<01:39,  1.66it/s][A
 45%|████████████████████████████████████                                            | 135/300 [01:20<01:38,  1.68it/s][A
 45%|████████████████████████████████████▎                                           | 136/300 [01:21<01:36,  1.69it/s][A
 46%|████████████████████████████████████▌                                           | 137/300 [01:21<01:36,  1.70it/s][A
 46%|████████████████████████████████████▊                                           | 138/300 [01:22<01:35,  1.69it/s][A
 46%|█████████████████████████████████████                                           | 139/300 [01:22<01:35,  1.69it/s][A
 47%|█████████████████████████████████████▎                                          | 140/300 [01:23<01:34,  1.69it/s][A
 47%|███████████

 66%|█████████████████████████████████████████████████████                           | 199/300 [01:58<00:59,  1.70it/s][A
 67%|█████████████████████████████████████████████████████▎                          | 200/300 [01:58<00:58,  1.70it/s][A
 67%|█████████████████████████████████████████████████████▌                          | 201/300 [01:59<00:59,  1.67it/s][A
 67%|█████████████████████████████████████████████████████▊                          | 202/300 [02:00<00:58,  1.68it/s][A
 68%|██████████████████████████████████████████████████████▏                         | 203/300 [02:00<00:57,  1.68it/s][A
 68%|██████████████████████████████████████████████████████▍                         | 204/300 [02:01<00:57,  1.68it/s][A
 68%|██████████████████████████████████████████████████████▋                         | 205/300 [02:01<00:56,  1.69it/s][A
 69%|██████████████████████████████████████████████████████▉                         | 206/300 [02:02<00:55,  1.69it/s][A
 69%|███████████

 88%|██████████████████████████████████████████████████████████████████████▋         | 265/300 [02:37<00:20,  1.72it/s][A
 89%|██████████████████████████████████████████████████████████████████████▉         | 266/300 [02:37<00:19,  1.71it/s][A
 89%|███████████████████████████████████████████████████████████████████████▏        | 267/300 [02:38<00:19,  1.71it/s][A
 89%|███████████████████████████████████████████████████████████████████████▍        | 268/300 [02:39<00:19,  1.67it/s][A
 90%|███████████████████████████████████████████████████████████████████████▋        | 269/300 [02:39<00:18,  1.68it/s][A
 90%|████████████████████████████████████████████████████████████████████████        | 270/300 [02:40<00:17,  1.68it/s][A
 90%|████████████████████████████████████████████████████████████████████████▎       | 271/300 [02:40<00:17,  1.68it/s][A
 91%|████████████████████████████████████████████████████████████████████████▌       | 272/300 [02:41<00:16,  1.69it/s][A
 91%|███████████

Writing sites from <California2> to csv file...



  0%|▎                                                                                 | 1/300 [00:00<02:55,  1.70it/s][A
  1%|▌                                                                                 | 2/300 [00:01<02:53,  1.72it/s][A
  1%|▊                                                                                 | 3/300 [00:01<02:53,  1.71it/s][A
  1%|█                                                                                 | 4/300 [00:02<02:53,  1.71it/s][A
  2%|█▎                                                                                | 5/300 [00:02<02:52,  1.71it/s][A
  2%|█▋                                                                                | 6/300 [00:03<02:56,  1.66it/s][A
  2%|█▉                                                                                | 7/300 [00:04<02:54,  1.68it/s][A
  3%|██▏                                                                               | 8/300 [00:04<02:53,  1.68it/s][A
  3%|██▍       

 22%|██████████████████                                                               | 67/300 [00:39<02:18,  1.68it/s][A
 23%|██████████████████▎                                                              | 68/300 [00:40<02:17,  1.69it/s][A
 23%|██████████████████▋                                                              | 69/300 [00:40<02:16,  1.69it/s][A
 23%|██████████████████▉                                                              | 70/300 [00:41<02:14,  1.70it/s][A
 24%|███████████████████▏                                                             | 71/300 [00:41<02:13,  1.71it/s][A
 24%|███████████████████▍                                                             | 72/300 [00:42<02:13,  1.70it/s][A
 24%|███████████████████▋                                                             | 73/300 [00:42<02:13,  1.71it/s][A
 25%|███████████████████▉                                                             | 74/300 [00:43<02:13,  1.69it/s][A
 25%|███████████

 44%|███████████████████████████████████▍                                            | 133/300 [01:18<01:38,  1.70it/s][A
 45%|███████████████████████████████████▋                                            | 134/300 [01:18<01:37,  1.70it/s][A
 45%|████████████████████████████████████                                            | 135/300 [01:19<01:36,  1.71it/s][A
 45%|████████████████████████████████████▎                                           | 136/300 [01:19<01:35,  1.72it/s][A
 46%|████████████████████████████████████▌                                           | 137/300 [01:20<01:35,  1.71it/s][A
 46%|████████████████████████████████████▊                                           | 138/300 [01:21<01:34,  1.72it/s][A
 46%|█████████████████████████████████████                                           | 139/300 [01:21<01:33,  1.71it/s][A
 47%|█████████████████████████████████████▎                                          | 140/300 [01:22<01:33,  1.71it/s][A
 47%|███████████

 66%|█████████████████████████████████████████████████████                           | 199/300 [01:56<01:00,  1.68it/s][A
 67%|█████████████████████████████████████████████████████▎                          | 200/300 [01:57<00:59,  1.68it/s][A
 67%|█████████████████████████████████████████████████████▌                          | 201/300 [01:58<00:58,  1.68it/s][A
 67%|█████████████████████████████████████████████████████▊                          | 202/300 [01:58<00:57,  1.69it/s][A
 68%|██████████████████████████████████████████████████████▏                         | 203/300 [01:59<00:56,  1.70it/s][A
 68%|██████████████████████████████████████████████████████▍                         | 204/300 [01:59<00:56,  1.70it/s][A
 68%|██████████████████████████████████████████████████████▋                         | 205/300 [02:00<00:55,  1.70it/s][A
 69%|██████████████████████████████████████████████████████▉                         | 206/300 [02:01<00:54,  1.71it/s][A
 69%|███████████

 88%|██████████████████████████████████████████████████████████████████████▋         | 265/300 [02:35<00:20,  1.72it/s][A
 89%|██████████████████████████████████████████████████████████████████████▉         | 266/300 [02:36<00:19,  1.72it/s][A
 89%|███████████████████████████████████████████████████████████████████████▏        | 267/300 [02:36<00:19,  1.72it/s][A
 89%|███████████████████████████████████████████████████████████████████████▍        | 268/300 [02:37<00:18,  1.72it/s][A
 90%|███████████████████████████████████████████████████████████████████████▋        | 269/300 [02:38<00:18,  1.72it/s][A
 90%|████████████████████████████████████████████████████████████████████████        | 270/300 [02:38<00:17,  1.72it/s][A
 90%|████████████████████████████████████████████████████████████████████████▎       | 271/300 [02:39<00:16,  1.73it/s][A
 91%|████████████████████████████████████████████████████████████████████████▌       | 272/300 [02:39<00:16,  1.72it/s][A
 91%|███████████

Writing sites from <California3> to csv file...



  0%|▎                                                                                 | 1/300 [00:00<02:56,  1.70it/s][A
  1%|▌                                                                                 | 2/300 [00:01<02:55,  1.70it/s][A
  1%|▊                                                                                 | 3/300 [00:01<02:53,  1.71it/s][A
  1%|█                                                                                 | 4/300 [00:02<02:52,  1.71it/s][A
  2%|█▎                                                                                | 5/300 [00:02<02:50,  1.73it/s][A
  2%|█▋                                                                                | 6/300 [00:03<02:50,  1.72it/s][A
  2%|█▉                                                                                | 7/300 [00:04<03:13,  1.51it/s][A
 12%|█████████▍                                                                       | 9/77 [09:53<1:35:19, 84.10s/it]
0it [00:00, ?it/s]

Writing sites from <California4> to csv file...
Writing sites from <California5> to csv file...
Writing sites from <California6> to csv file...
Writing sites from <California7> to csv file...
Writing sites from <California8> to csv file...
Writing sites from <California9> to csv file...
Writing sites from <California10> to csv file...
Writing sites from <California11> to csv file...
Writing sites from <Colorado> to csv file...
Writing sites from <Colorado2> to csv file...
Writing sites from <Connecticut> to csv file...
Writing sites from <District-of-Columbia> to csv file...
Writing sites from <Delaware> to csv file...
Writing sites from <Florida> to csv file...
Writing sites from <Florida2> to csv file...
Writing sites from <Florida3> to csv file...
Writing sites from <Florida4> to csv file...
Writing sites from <Florida5> to csv file...
Writing sites from <Florida6> to csv file...
Writing sites from <Georgia> to csv file...
Writing sites from <Georgia2> to csv file...
Writing sites f


0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A
 97%|███████████████████████████████████████████████████████████████████████████████▊  | 75/77 [09:53<01:22, 41.21s/it]
0it [00:00, ?it/s][A


Writing sites from <Minnesota> to csv file...
Writing sites from <Missouri> to csv file...
Writing sites from <Mississippi> to csv file...
Writing sites from <Montana> to csv file...
Writing sites from <North-Carolina> to csv file...
Writing sites from <North-Dakota> to csv file...
Writing sites from <Nebraska> to csv file...
Writing sites from <New-Hampshire> to csv file...
Writing sites from <New-Jersey> to csv file...
Writing sites from <New-Mexico> to csv file...
Writing sites from <Nevada> to csv file...
Writing sites from <Nevada2> to csv file...
Writing sites from <New-York> to csv file...
Writing sites from <New-York2> to csv file...
Writing sites from <New-York3> to csv file...
Writing sites from <Ohio> to csv file...
Writing sites from <Oklahoma> to csv file...
Writing sites from <Oregon> to csv file...
Writing sites from <Pennsylvania> to csv file...
Writing sites from <Pennsylvania2> to csv file...
Writing sites from <Rhode-Island> to csv file...
Writing sites from <South-C

100%|██████████████████████████████████████████████████████████████████████████████████| 77/77 [09:53<00:00,  7.71s/it]

Writing sites from <Wyoming> to csv file...





In [12]:
state_dict

{'Alaska': 'https://www.city-data.com/articles/Alaska.html',
 'Alabama': 'https://www.city-data.com/articles/Alabama.html',
 'Arkansas': 'https://www.city-data.com/articles/Arkansas.html',
 'Arizona': 'https://www.city-data.com/articles/Arizona.html',
 'Arizona2': 'https://www.city-data.com/articles/Arizona2.html',
 'Arizona3': 'https://www.city-data.com/articles/Arizona3.html',
 'California': 'https://www.city-data.com/articles/California.html',
 'California2': 'https://www.city-data.com/articles/California2.html',
 'California3': 'https://www.city-data.com/articles/California3.html',
 'California4': 'https://www.city-data.com/articles/California4.html',
 'California5': 'https://www.city-data.com/articles/California5.html',
 'California6': 'https://www.city-data.com/articles/California6.html',
 'California7': 'https://www.city-data.com/articles/California7.html',
 'California8': 'https://www.city-data.com/articles/California8.html',
 'California9': 'https://www.city-data.com/articles/