In [1]:
#import required libraries
import requests
from bs4 import BeautifulSoup

import pandas as pd

In [2]:
#set the webpage link to the variable "url"
url = "https://en.wikipedia.org/wiki/List_of_counties_in_Maryland#List_of_counties"

In [3]:
#connect to the website
webpage = requests.get(url)

In [4]:
#check verification & permission status
webpage.status_code

200

In [5]:
#save HTML content (string type) to a variable
HTML_str = webpage.text

In [6]:
#parse the string text into HTML/XML using BeautifulSoup function
soup = BeautifulSoup(HTML_str, "lxml")

In [7]:
#search for the <table> tag that has the class "wikisortable"
table = soup.find('table', class_='wikitable sortable')

In [8]:
#find all the <th> tags in the <table> tag that was found
#this is a list
table.find_all('th')

[<th scope="col" style="vertical-align: top; width: 90px;">County<br/>
 </th>,
 <th scope="col" style="vertical-align: top; width:75px;">FIPS code<sup class="reference" id="cite_ref-FIPS_5-0"><a href="#cite_note-FIPS-5">[5]</a></sup>
 </th>,
 <th scope="col" style="vertical-align: top; width: ;"><a href="/wiki/County_seat" title="County seat">County seat</a><sup class="reference" id="cite_ref-Manual_2-1"><a href="#cite_note-Manual-2">[2]</a></sup><sup class="reference" id="cite_ref-NACO_6-0"><a href="#cite_note-NACO-6">[6]</a></sup>
 </th>,
 <th scope="col" style="vertical-align: top; width: ;">Established<sup class="reference" id="cite_ref-Manual_2-2"><a href="#cite_note-Manual-2">[2]</a></sup><sup class="reference" id="cite_ref-NACO_6-1"><a href="#cite_note-NACO-6">[6]</a></sup>
 </th>,
 <th scope="col" style="vertical-align: top;">Origin<sup class="reference" id="cite_ref-Manual_2-3"><a href="#cite_note-Manual-2">[2]</a></sup>
 </th>,
 <th class="unsortable" scope="col" style="verti

In [9]:
#empty list to hold the names of counties
cnty_namels = []

#dictionary with empty lists as values to hold other table info
cnty_info = {'FIPSCode':[],
             'CountySeat':[],
             'YearEstablished':[], 
             'Origin':[], 
             'Etymology':[],
             'Population':[], 
             'Area':[]}

#for each <tr> (row) in all of the rows that exist in the table
for row in table.find_all('tr'):
    
    #get all <th> tags, save to variable
    th_tags = row.find_all('th')
    
    #get data from rows that only have 1 <th> tag
    #NOTE: column header row on table has 11 <th> tags, so it doesn't meet the check
    if len(th_tags) == 1: 
        cnty_namels.append(th_tags[0].find(text=True))
    
    #get all the <td> tags for the row
    td_tags = row.find_all('td')
    
    #check to see if the row 10 <td> tags
    if len(td_tags) == 10:
    
        #get data from each <td> tag needed; append to list in dict
        cnty_info['FIPSCode'].append(td_tags[0].find(text=True))
        cnty_info['CountySeat'].append(td_tags[1].find(text=True))
        cnty_info['YearEstablished'].append(td_tags[2].find(text=True))
        cnty_info['Origin'].append(td_tags[3].find(text=True))
        cnty_info['Etymology'].append(td_tags[4].find(text=True))
        cnty_info['Population'].append(td_tags[7].find(text=True))
        cnty_info['Area'].append(td_tags[8].find(text=True))

In [10]:
#check for data in "cnty_namels" variable
cnty_namels

['Allegany County',
 'Anne Arundel County',
 'Baltimore County',
 'Baltimore City',
 'Calvert County',
 'Caroline County',
 'Carroll County',
 'Cecil County',
 'Charles County',
 'Dorchester County',
 'Frederick County',
 'Garrett County',
 'Harford County',
 'Howard County',
 'Kent County',
 'Montgomery County',
 "Prince George's County",
 "Queen Anne's County",
 "Saint Mary's County",
 'Somerset County',
 'Talbot County',
 'Washington County',
 'Wicomico County',
 'Worcester County']

In [11]:
#check data in "Area" key in "cnty_info" dictionary
cnty_info['Area']

['430',
 '588',
 '682',
 '92',
 '345',
 '326',
 '452',
 '418',
 '643',
 '540',
 '667',
 '656',
 '527',
 '254',
 '414',
 '507',
 '498',
 '510',
 '611',
 '611',
 '477',
 '468',
 '400',
 '695']

In [12]:
#start a dataframe with one column made from "cnty_namels" variable
MDcounties_df = pd.DataFrame({'CountyName':cnty_namels})

In [13]:
#check info in dataframe
MDcounties_df

Unnamed: 0,CountyName
0,Allegany County
1,Anne Arundel County
2,Baltimore County
3,Baltimore City
4,Calvert County
5,Caroline County
6,Carroll County
7,Cecil County
8,Charles County
9,Dorchester County


In [14]:

# the .items() function gives back the key
# and value from a dictionary
for key, value in cnty_info.items():
    
    # use the key to create the column name 
    # on the dataframe (or update info in a column)
    #use the value to add the info to a new row
    MDcounties_df[key] = value

In [15]:

#check the dataframe contents
MDcounties_df

Unnamed: 0,CountyName,FIPSCode,CountySeat,YearEstablished,Origin,Etymology,Population,Area
0,Allegany County,1,Cumberland,1789,Formed from part of Washington County.,From the Lenape Indian word,74012,430
1,Anne Arundel County,3,Annapolis,1650,Formed from part of St. Mary's County.,Anne Arundell,550488,588
2,Baltimore County,5,Towson,1659,Formed from unorganized territory,"Cecil Calvert, 2nd Baron Baltimore",817455,682
3,Baltimore City,510,Baltimore City,1851,Founded in 1729. Detached in 1851 from Baltimo...,"Cecil Calvert, 2nd Baron Baltimore",621342,92
4,Calvert County,9,Prince Frederick,1654,Formed as Patuxent County from unorganized ter...,The,89628,345
5,Caroline County,11,Denton,1773,From parts of Dorchester County and Queen Anne...,"Lady Caroline Eden, daughter of",32718,326
6,Carroll County,13,Westminster,1837,From parts of Baltimore County and Frederick C...,Charles Carroll of Carrollton,167217,452
7,Cecil County,15,Elkton,1672,From parts of Baltimore County and Kent County,Cecil is an Anglicized form of the first name of,101696,418
8,Charles County,17,La Plata,1658,From unorganized territory,"Charles Calvert, 3rd Baron Baltimore",150592,643
9,Dorchester County,19,Cambridge,1668,From unorganized territory,Dorchester,32551,540
