# CalTrans District Web Scrape

### OBJECTIVE

- This notebook scrapes the Caltrans districts, the counties each district covers, and each district's headquarters from the Wikipedia URL listed below.

In [1]:
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import requests
from bs4 import BeautifulSoup

import selenium

In [2]:
# Wikipedia article whose HTML tables are scraped by the cells below.
url = 'https://en.wikipedia.org/wiki/California_Department_of_Transportation'

In [3]:
# Download the article. A timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops the notebook on an HTTP
# error instead of silently parsing an error page further down.
res = requests.get(url, timeout=30)
res.raise_for_status()

In [4]:
# Parse the downloaded HTML. The parser is named explicitly so the result does
# not depend on which optional parsers happen to be installed (and so bs4 does
# not warn that no parser was specified).
soup = BeautifulSoup(res.content, 'html.parser')

In [5]:
# Collect every <table> element on the page, in document order.
tables = soup.find_all('table')

In [6]:
# Build the column names from the <th> cells of the second table on the page:
# lower-cased, stripped of surrounding whitespace, spaces turned into underscores.
main_header = []
for header_cell in tables[1].find_all('th'):
    column_name = header_cell.text.lower().strip().replace(' ', '_')
    main_header.append(column_name)

# Extra column recording which table each scraped row came from.
main_header.append('table')
main_header

['district[12]', 'area_(counties)', 'headquarters', 'table']

In [7]:
# Map each normalised header name of the second table to the position of its
# column. This depends only on tables[1], so it is built once rather than once
# per table inside the loop.
headers = {th.text.lower().strip().replace(' ', '_'): index
           for index, th in enumerate(tables[1].find_all('th'))}

# One dict per table row, across every table on the page. Rows are tagged with
# the index of the table they came from so they can be filtered later.
district_fires = []
for i, table in enumerate(tables):
    # Skip the first <tr> of each table (treated as the header row).
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        district_f = {}

        for feature in main_header:
            try:
                district_f[feature] = cells[headers[feature]].text.strip()
            except (KeyError, IndexError):
                # KeyError: the feature has no header column (e.g. 'table').
                # IndexError: this row has fewer <td> cells than the header.
                district_f[feature] = np.nan

        district_f['table'] = i
        district_fires.append(district_f)

pd.DataFrame(district_fires).to_csv('./Data/districts.csv', index=False)

In [8]:
# Reload the rows that the scraping cell just wrote to disk.
df = pd.read_csv('./Data/districts.csv')

In [9]:
# Preview the first rows of the reloaded scrape.
df.head()

Unnamed: 0,district[12],area_(counties),headquarters,table
0,,,,0
1,1972; 47 years ago (1972),,,0
2,California Bureau of HighwaysCalifornia Depart...,,,0
3,California State Government,,,0
4,"1120 N Street, Sacramento, California38°34′28″...",,,0


In [10]:
# (rows, columns) of the reloaded scrape.
df.shape

(27, 4)

In [11]:
# Keep only the rows scraped from the table with index 1 (the district listing).
# .copy() makes `districts` an independent frame, so assigning to its columns in
# later cells does not trigger pandas' SettingWithCopyWarning.
districts = df.loc[df['table'] == 1].copy()

In [12]:
# Display up to 50 rows of the filtered districts frame.
districts.head(50)

Unnamed: 0,district[12],area_(counties),headquarters,table
13,1,"Del Norte, Humboldt, Lake, Mendocino",Eureka,1
14,2,"Lassen, Modoc, Plumas, Shasta, Siskiyou, Teham...",Redding,1
15,3,"Butte, Colusa, El Dorado, Glenn, Nevada, Place...",Marysville,1
16,4,"Alameda, Contra Costa, Marin, Napa, San Franci...",Oakland,1
17,5,"Monterey, San Benito, San Luis Obispo, Santa B...",San Luis Obispo,1
18,6,"Madera, Fresno, Tulare, Kings, Kern",Fresno,1
19,7,"Los Angeles, Ventura",Los Angeles,1
20,8,"Riverside, San Bernardino",San Bernardino,1
21,9,"Inyo, Mono",Bishop,1
22,10,"Alpine, Amador, Calaveras, Mariposa, Merced, S...",Stockton,1


In [13]:
# Strip references from the cell
references = index.findAll("sup", {"class": "reference"})
if references:
    for ref in references:
        ref.extract()

# Strip sortkeys from the cell
sortkeys = index.findAll("span", {"class": "sortkey"})
if sortkeys:
    for ref in sortkeys:
        ref.extract()

# Strip footnotes from text and join into a single string
text_items = index.findAll(text=True)
no_footnotes = [text for text in text_items if text[0] != '[']
puretext = ''.join(no_footnotes)

# Replace non-breaking spaces with regular spaces and add quotes
puretext = puretext.replace('\xa0', ' ')

NameError: name 'index' is not defined

In [1]:
# Remove trailing footnote markers such as "[12]" from every string value.
# Raw string: '\[' and '\d' are regex escapes, not Python string escapes.
footnote_pattern = re.compile(r'(\[\d+\])+$')

# Work on an independent copy so the column assignments below do not write
# into a filtered view of `df` (SettingWithCopyWarning).
districts = districts.copy()

for col in districts.columns:
    print(col)
    # Columns reloaded from CSV can hold numbers or NaN; only strings are
    # passed to the regex, everything else is returned untouched.
    districts[col] = districts[col].map(
        lambda x: footnote_pattern.sub('', x) if isinstance(x, str) else x
    )

NameError: name 'districts' is not defined

In [14]:
# Inspect the districts frame again (up to 50 rows).
districts.head(50)

Unnamed: 0,district[12],area_(counties),headquarters,table
13,1,"Del Norte, Humboldt, Lake, Mendocino",Eureka,1
14,2,"Lassen, Modoc, Plumas, Shasta, Siskiyou, Teham...",Redding,1
15,3,"Butte, Colusa, El Dorado, Glenn, Nevada, Place...",Marysville,1
16,4,"Alameda, Contra Costa, Marin, Napa, San Franci...",Oakland,1
17,5,"Monterey, San Benito, San Luis Obispo, Santa B...",San Luis Obispo,1
18,6,"Madera, Fresno, Tulare, Kings, Kern",Fresno,1
19,7,"Los Angeles, Ventura",Los Angeles,1
20,8,"Riverside, San Bernardino",San Bernardino,1
21,9,"Inyo, Mono",Bishop,1
22,10,"Alpine, Amador, Calaveras, Mariposa, Merced, S...",Stockton,1


In [15]:
# Save the district rows only. NOTE: this is the same path the scraping cell
# wrote the all-tables scrape to, so that file is overwritten here.
districts.to_csv('./Data/districts.csv',index=False)

### SUMMARY

- This notebook scrapes Caltrans district information from a Wikipedia page. The columns include district number, counties, and headquarters.