In [1]:
# Import Libraries
from io import StringIO

import requests
import pandas as pd
from bs4 import BeautifulSoup

In [3]:
# Target page: Wikipedia's "List of largest cities" article
url = 'https://en.wikipedia.org/wiki/List_of_largest_cities'

In [5]:
# Download the page and parse it.
# - timeout: without one, requests can wait indefinitely on a stalled connection.
# - raise_for_status(): stop here on an HTTP error instead of parsing an error
#   page and failing later with a confusing "table not found" problem.
# - User-Agent: identifies this client to the server.
#   NOTE(review): replace the string with one that identifies your project/contact.
response = requests.get(
    url,
    headers={"User-Agent": "largest-cities-notebook/1.0 (educational scraping example)"},
    timeout=30,
)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

In [7]:
# Locate the first <table> carrying the 'wikitable' class.
# soup.find() returns None when nothing matches; fail here with a clear message
# rather than letting the next cell try to parse the string "None".
table = soup.find('table', class_='wikitable')
if table is None:
    raise ValueError("No <table class='wikitable'> element was found in the downloaded page")

In [9]:
# Parse the table's HTML into a DataFrame.
# The markup is wrapped in StringIO because passing a literal HTML string to
# pd.read_html is deprecated (it emits a FutureWarning); a file-like object is
# the supported input. read_html returns a list of frames; the single <table>
# passed in is element 0.
df = pd.read_html(StringIO(str(table)))[0]

  df = pd.read_html(str(table))[0]


In [15]:
# Flatten the MultiIndex header into single strings by joining the levels of
# each column with a space (e.g. ('Urban area[8]', 'Population') ->
# 'Urban area[8] Population').
# The isinstance guard makes the cell safe to re-run: once the columns are plain
# strings, joining would iterate over characters and turn 'City' into 'C i t y'.
if isinstance(df.columns, pd.MultiIndex):
    df.columns = [' '.join(map(str, levels)).strip() for levels in df.columns.values]

In [17]:
# Show the flattened header so the exact names to rename can be read off
print("Flattened column names:", df.columns, sep="\n")

Flattened column names:
Index(['City[a] City[a]', 'Country Country',
       'UN 2024 population estimates[b] UN 2024 population estimates[b]',
       'City proper[c] Definition', 'City proper[c] Population',
       'City proper[c] Area (km2)', 'City proper[c] Density (/km2)',
       'Urban area[8] Population', 'Urban area[8] Area (km2)',
       'Urban area[8] Density (/km2)', 'Metropolitan area[d] Population',
       'Metropolitan area[d] Area (km2)',
       'Metropolitan area[d] Density (/km2)'],
      dtype='object')


In [23]:
# Give the city, country and urban-area columns short names (modifies df in place).
# Labels not present in df are left untouched by rename's default behaviour.
df.rename(
    {
        'City[a] City[a]': 'City.a.',
        'Country Country': 'Country',
        'Urban area[8] Population': 'Population',
        'Urban area[8] Area (km2)': 'Area.km2.',
        'Urban area[8] Density (/km2)': 'Density..km2.',
    },
    axis='columns',
    inplace=True,
)

In [27]:
# Select only the city, country and urban-area columns (all rows are kept);
# .copy() gives an independent frame so later edits do not touch df.
urban_area_df = df.loc[
    :, ['City.a.', 'Country', 'Population', 'Area.km2.', 'Density..km2.']
].copy()

In [29]:
# Convert the three urban-area measures to numeric dtypes.
# Scraped cells may arrive as text carrying footnote markers ("1,234[9]") or
# thousands separators; those are stripped first so that errors='coerce' only
# turns genuinely non-numeric values into NaN instead of discarding otherwise
# valid figures. Columns that are already numeric are passed through unchanged.
for measure in ['Population', 'Area.km2.', 'Density..km2.']:
    values = urban_area_df[measure]
    if not pd.api.types.is_numeric_dtype(values):
        values = (
            values.astype(str)
            .str.replace(r'\[[^\]]*\]', '', regex=True)  # footnote markers such as "[9]"
            .str.replace(',', '', regex=False)           # thousands separators
            .str.strip()
        )
    urban_area_df[measure] = pd.to_numeric(values, errors='coerce')

In [31]:
# Remove, in place, every row in which any of the three numeric measures is NaN
urban_area_df.dropna(
    axis='index',
    how='any',
    subset=['Population', 'Area.km2.', 'Density..km2.'],
    inplace=True,
)

In [33]:
# Renumber the remaining rows with a fresh 0-based index; drop=True discards
# the old labels instead of keeping them as a column
urban_area_df.reset_index(inplace=True, drop=True)

In [35]:
# Print the top five rows of the cleaned urban-area table
print(urban_area_df.head(n=5))

       City.a.      Country  Population  Area.km2.  Density..km2.
0    Chongqing        China  12135000.0     1580.0         7680.0
1        Seoul  South Korea  23016000.0     2769.0         8312.0
2        Dhaka   Bangladesh  18627000.0      619.0        30092.0
3        Cairo        Egypt  20296000.0     2010.0        10098.0
4  Mexico City       Mexico  21804000.0     2530.0         8618.0
