In [4]:
# 1) Importing libraries

import pandas as pd
import numpy as np
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [5]:
# 2) Creating a function to get data in HTML format from website.

def getHTML(url):
    """Download `url` and return its content parsed with BeautifulSoup.

    Args:
        url: Address handed unchanged to `urlopen`.

    Returns:
        A `BeautifulSoup` object built with the 'html.parser' backend.

    Raises:
        Nothing is caught here: any exception raised by `urlopen` or while
        reading the response propagates to the caller.
    """
    # The context manager closes the HTTP response even when reading fails.
    # The body is read eagerly so parsing never depends on an open connection.
    with urlopen(url) as response:
        htmlcontent = response.read()
    return BeautifulSoup(htmlcontent, 'html.parser')

In [6]:
# 3) Fetch and parse the Wikipedia "List of United States cities by population" page
#    with 'getHTML'. This performs a live network request; the parsed page is kept in
#    `uscities`, which the table-extraction step searches for the city table.

uscities = getHTML('https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population')


In [7]:
# 4) Get the whole table as well as the link to each city's page.

# Locate the city table on the parsed page and collect all of its <tr> rows.
links_table = uscities.find('table', {'class': 'wikitable sortable'})
rows = links_table.find_all('tr')

# Dry run over every row after the first one: extract the link from the second
# cell and the cleaned text of each cell. Nothing is stored here; uncomment the
# prints below to inspect what the later cells will work with.
for row in rows[1:]:
    tds = row.find_all('td')
    if not tds:
        # Rows without any <td> cells carry no data.
        continue
    city_link = tds[1].find('a')
    city_info = [
        td.text.replace('\n', '').replace('\xa0', '').replace('\ufeff', '')
        for td in tds
    ]
    #print(city_link.get('href'))
    #print(city_info)

In [8]:
# 5) Define a function to get data from individual city pages. Here we extract only the name of the 'Mayor' for each city.

def getAdditionalDetails(url):
    """Fetch a city's Wikipedia page and extract the mayor entry from its infobox.

    Args:
        url: Site-relative link (as read from an ``href``); it is appended to
            'https://en.wikipedia.org' before being fetched with `getHTML`.

    Returns:
        list: Text of the value cell of every '• Mayor' row (class ``mergedrow``)
        that appears after the infobox's 'Government' header row (class
        ``mergedtoprow``). The list is empty when the infobox or a matching row
        is missing, or when any exception occurs (the error is printed, not raised).
    """
    try:
        city_page = getHTML('https://en.wikipedia.org' + url)
        table = city_page.find('table', {'class': 'infobox geography vcard'})
        if table is None:
            # Report the missing infobox explicitly instead of letting it surface
            # as an opaque AttributeError on None.
            print('Error occured: {}'.format('no infobox table found for ' + url))
            return []

        additional_details = []
        read_content = False  # becomes True once the 'Government' header row was seen
        for tr in table.find_all('tr'):
            header = tr.find('th')
            if header is None:
                # A row without a <th> can be neither the section header nor the
                # Mayor row. Skipping it keeps mayors that were already collected
                # (previously this raised and the whole result was thrown away).
                continue
            row_class = tr.get('class')
            label = header.get_text().strip()
            if not read_content:
                if row_class == ['mergedtoprow'] and label == 'Government':
                    read_content = True
            elif row_class == ['mergedrow'] and label == '•\xa0Mayor':
                value_cell = tr.find('td')
                if value_cell is not None:
                    additional_details.append(value_cell.get_text().strip('\n'))

        return additional_details
    except Exception as error:
        # Best-effort scraping: a failure on one city page must not abort the run.
        print('Error occured: {}'.format(error))
        return []


In [10]:
# 6) Combining all the collected data and defining a dataframe.

# Build one record per data row: the cleaned text of every cell plus exactly one
# 'Mayor' field, so all records have a consistent trailing column.
data_final = []
for row in rows[1:]:  # rows[0] is the header row (its <th> cells become the column names)
    tds = row.find_all('td')
    if not tds:
        continue

    city_info = [td.text.replace('\n', '').replace('\xa0', '').replace('\ufeff', '') for td in tds]

    # The link to the city's page is read from the second cell. Guard against a
    # short row or a cell without a link instead of crashing the whole run.
    city_link = tds[1].find('a') if len(tds) > 1 else None
    href = city_link.get('href') if city_link is not None else None
    city_details = getAdditionalDetails(href) if href else []

    # Always append a single value: None when no mayor was found, and a joined
    # string when several were found. A varying number of extra values would give
    # ragged rows that no longer line up with the single 'Mayor' heading.
    city_info.append('; '.join(city_details) if city_details else None)
    data_final.append(city_info)

df = pd.DataFrame(data_final)

In [11]:
# 7) Define column headings

# Column names start from the text of the header row's <th> cells ...
header_cells = rows[0].find_all('th')
headers = [cell.get_text().strip('\n') for cell in header_cells]

# ... followed by the extra field appended to every record while scraping.
headers.append('Mayor')

# Two additional labels are inserted at fixed positions for the km2 columns
# (presumably because the area and density headings each cover two data
# columns - verify against the current page layout).
headers.insert(7, '2016 Land Area(km2)')
headers.insert(9, '2016 Population Density(km2)')

df.columns = headers

# Normalise the scraped headings in a single rename pass.
df.rename(
    columns={
        'State[c]': 'State',
        '2016 land area': '2016 Land area(sqmi)',
        '2016 population density': '2016 Population Density(sqmi)',
    },
    inplace=True,
)

In [12]:
# 8) Data Cleaning

# Remove parenthetical notes "(...)" and bracketed footnote markers "[...]" from
# every column. regex=True is passed explicitly: since pandas 2.0 str.replace
# treats the pattern as a literal string by default, which would leave the data
# untouched.
for column in df.columns:
    df[column] = (
        df[column]
        .str.replace(r"\(.*\)", "", regex=True)
        .str.replace(r"\[.*\]", "", regex=True)
    )

# The per-column clean-ups below do not depend on the loop variable, so they run
# once here instead of once per column.
df['Change'] = df['Change'].str.strip('%')

# Drop the trailing unit from the area/density columns. str.strip() removes a
# *set of characters*, so strip("/km") could never remove a value ending in
# "km2"; an anchored suffix pattern removes the unit (with or without the
# leading "/") and nothing else.
unit_suffixes = {
    '2016 Land area(sqmi)': r"\s*/?\s*sq\s*mi$",
    '2016 Land Area(km2)': r"\s*/?\s*km2?$",
    '2016 Population Density(sqmi)': r"\s*/?\s*sq\s*mi$",
    '2016 Population Density(km2)': r"\s*/?\s*km2?$",
}
for column, suffix in unit_suffixes.items():
    df[column] = df[column].str.replace(suffix, "", regex=True)

df.to_csv("Final_Dataset.csv", index = False)