# Scrape Table Data for U.S. Air Force Bases and Installations for Active Duty, National Guard, and Reserve from Wikipedia

In [1]:
# Import Dependencies
from bs4 import BeautifulSoup as bs
from splinter import Browser
import pandas as pd
import json
import re
from pprint import pprint

In [2]:
# Launch browser
browser = Browser ('chrome')

The chromedriver version (125.0.6422.78) detected in PATH at /opt/homebrew/bin/chromedriver might not be compatible with the detected chrome version (126.0.6478.127); currently, chromedriver 126.0.6478.126 is recommended for chrome 126.*, so it is advised to delete the driver in PATH and retry


## Step 1: Visit the Website

In [3]:
# Visit the website
url = 'https://en.wikipedia.org/wiki/List_of_United_States_Air_Force_installations'
browser.visit(url)

## Step 2: Scrape the Website

In [4]:
# Create Beautiful Soup
html = browser.html
soup = bs(html, 'html.parser')
tables = soup.find_all('table', class_='wikitable sortable jquery-tablesorter')

## Step 3: Store the Results

In [5]:
# Initialize an empty list to hold data from both tables
all_data = []

In [6]:
# Standardize header names
def standardize_header_name(header):
    return header.replace(' ', '_').replace('-', '_').lower()

In [7]:
# Process each table and append the data
for index, table in enumerate(tables[:2]): # Limit to the first two tables
    
    # Extract header row
    header_row = table.find('tr')
    headers = [standardize_header_name(th.text.strip()) for th in header_row.find_all('th')]
    headers.extend(['geo_coordinates'])  # Add geo_coordinates to headers
    
    # Extract data rows and append to all_data with a table identifier
    rows = []
    for tr in table.find_all('tr')[1:]:  # skip the first row as it contains headers
        cells = [td.get_text(strip=True) for td in tr.find_all('td')]

        # Extract geographic coordinates
        geo_span = tr.find('span', class_='geo')
        geo_coordinates = geo_span.get_text(strip=True) if geo_span else "Not Found"
        cells.append(geo_coordinates)  # Append geo coordinates to the cell data
        
        if cells:
            rows.append(cells)

    # Save each table to DataFrame then append to all_data list
    df = pd.DataFrame(rows, columns=headers)
    all_data.append(df)

In [8]:
# Verify table data for Table 1 - AFB, Active Duty Locations
active_duty_df = all_data[0]
active_duty_df.head()

Unnamed: 0,name,location,state_or_area,coordinates,commanding_organization,wing_or_unit_emblem,host_wing_or_primary_unit,primary_missions_and_units,geo_coordinates
0,Altus Air Force Base,Altus,Oklahoma,34°39′59″N099°16′05″W﻿ / ﻿34.66639°N 99.26806°...,Air Education and Training Command,,97th Air Mobility Wing,The97th Air Mobility Wingtrains crews to opera...,34.66639; -99.26806
1,Joint Base Anacostia-Bolling,Southwest,"Washington, D.C.",38°50′34″N077°00′58″W﻿ / ﻿38.84278°N 77.01611°...,Air Force District of Washington,,11th Wing,USAF operatedjoint base. The11th Wingprovides ...,38.84278; -77.01611
2,Joint Base Andrews-Naval Air Facility Washington,Camp Springs,Maryland,38°48′39″N076°52′01″W﻿ / ﻿38.81083°N 76.86694°...,Air Force District of Washington,,316th Wing,USAF operatedjoint base. The316th Wingprovides...,38.81083; -76.86694
3,Arnold Air Force Base,Tullahoma,Tennessee,35°23′33″N086°05′09″W﻿ / ﻿35.39250°N 86.08583°...,Air Force Materiel Command,,Arnold Engineering Development Complex,"Non-flying installation, part of theAir Force ...",35.39250; -86.08583
4,Barksdale Air Force Base,Bossier City,Louisiana,32°30′07″N093°39′46″W﻿ / ﻿32.50194°N 93.66278°...,Air Force Global Strike Command,,2nd Bomb Wing,The2nd Bomb Wingand307th Bomb Wingoperate theB...,32.50194; -93.66278


In [9]:
# Verify table data for Table 2 - AFB, National Guard and Reserve Locations
reserve_df = all_data[1]
reserve_df.head()

Unnamed: 0,name,location,state,coordinates,commanding_organization,wing_or_unit_emblem,host_wing_or_primary_unit,primary_missions_and_units,geo_coordinates
0,Abston Air National Guard Station,Montgomery,Alabama,32°21′22″N086°20′48″W﻿ / ﻿32.35611°N 86.34667°...,Alabama Air National Guard,,226th Combat Communications Group,Non-flying installation. The226th Combat Commu...,32.35611; -86.34667
1,Atlantic City Air National Guard Base,Atlantic City,New Jersey,39°26′53″N074°34′54″W﻿ / ﻿39.44806°N 74.58167°...,New Jersey Air National Guard,,177th Fighter Wing,Airfield shared withAtlantic City Internationa...,39.44806; -74.58167
2,Bangor Air National Guard Base,Bangor,Maine,44°48′51″N068°49′51″W﻿ / ﻿44.81417°N 68.83083°...,Maine Air National Guard,,101st Air Refueling Wing,Airfield shared withBangor International Airpo...,44.81417; -68.83083
3,Barnes Air National Guard Base,Westfield,Massachusetts,42°09′56″N072°43′14″W﻿ / ﻿42.16556°N 72.72056°...,Massachusetts Air National Guard,,104th Fighter Wing,Airfield shared withWestfield-Barnes Regional ...,42.16556; -72.72056
4,Battle Creek Air National Guard Base,Springfield,Michigan,42°18′26.2″N85°15′05.3″W﻿ / ﻿42.307278°N 85.25...,Michigan Air National Guard,,110th Wing,Airfield shared withW. K. Kellogg Airport. The...,42.307278; -85.251472


In [10]:
# Rename columns without reassigning to a new variable
reserve_df.rename(columns={'state': 'state_or_area'}, inplace=True)
reserve_df.head()

Unnamed: 0,name,location,state_or_area,coordinates,commanding_organization,wing_or_unit_emblem,host_wing_or_primary_unit,primary_missions_and_units,geo_coordinates
0,Abston Air National Guard Station,Montgomery,Alabama,32°21′22″N086°20′48″W﻿ / ﻿32.35611°N 86.34667°...,Alabama Air National Guard,,226th Combat Communications Group,Non-flying installation. The226th Combat Commu...,32.35611; -86.34667
1,Atlantic City Air National Guard Base,Atlantic City,New Jersey,39°26′53″N074°34′54″W﻿ / ﻿39.44806°N 74.58167°...,New Jersey Air National Guard,,177th Fighter Wing,Airfield shared withAtlantic City Internationa...,39.44806; -74.58167
2,Bangor Air National Guard Base,Bangor,Maine,44°48′51″N068°49′51″W﻿ / ﻿44.81417°N 68.83083°...,Maine Air National Guard,,101st Air Refueling Wing,Airfield shared withBangor International Airpo...,44.81417; -68.83083
3,Barnes Air National Guard Base,Westfield,Massachusetts,42°09′56″N072°43′14″W﻿ / ﻿42.16556°N 72.72056°...,Massachusetts Air National Guard,,104th Fighter Wing,Airfield shared withWestfield-Barnes Regional ...,42.16556; -72.72056
4,Battle Creek Air National Guard Base,Springfield,Michigan,42°18′26.2″N85°15′05.3″W﻿ / ﻿42.307278°N 85.25...,Michigan Air National Guard,,110th Wing,Airfield shared withW. K. Kellogg Airport. The...,42.307278; -85.251472


In [11]:
# Verify that column headers match in both tables
print("Active Duty DF Columns:", active_duty_df.columns)
print("Reserve DF Columns:", reserve_df.columns)

Active Duty DF Columns: Index(['name', 'location', 'state_or_area', 'coordinates',
       'commanding_organization', 'wing_or_unit_emblem',
       'host_wing_or_primary_unit', 'primary_missions_and_units',
       'geo_coordinates'],
      dtype='object')
Reserve DF Columns: Index(['name', 'location', 'state_or_area', 'coordinates',
       'commanding_organization', 'wing_or_unit_emblem',
       'host_wing_or_primary_unit', 'primary_missions_and_units',
       'geo_coordinates'],
      dtype='object')


In [12]:
# Concatenate the DataFrames
combined_df = pd.concat([active_duty_df, reserve_df], ignore_index=True)
combined_df

Unnamed: 0,name,location,state_or_area,coordinates,commanding_organization,wing_or_unit_emblem,host_wing_or_primary_unit,primary_missions_and_units,geo_coordinates
0,Altus Air Force Base,Altus,Oklahoma,34°39′59″N099°16′05″W﻿ / ﻿34.66639°N 99.26806°...,Air Education and Training Command,,97th Air Mobility Wing,The97th Air Mobility Wingtrains crews to opera...,34.66639; -99.26806
1,Joint Base Anacostia-Bolling,Southwest,"Washington, D.C.",38°50′34″N077°00′58″W﻿ / ﻿38.84278°N 77.01611°...,Air Force District of Washington,,11th Wing,USAF operatedjoint base. The11th Wingprovides ...,38.84278; -77.01611
2,Joint Base Andrews-Naval Air Facility Washington,Camp Springs,Maryland,38°48′39″N076°52′01″W﻿ / ﻿38.81083°N 76.86694°...,Air Force District of Washington,,316th Wing,USAF operatedjoint base. The316th Wingprovides...,38.81083; -76.86694
3,Arnold Air Force Base,Tullahoma,Tennessee,35°23′33″N086°05′09″W﻿ / ﻿35.39250°N 86.08583°...,Air Force Materiel Command,,Arnold Engineering Development Complex,"Non-flying installation, part of theAir Force ...",35.39250; -86.08583
4,Barksdale Air Force Base,Bossier City,Louisiana,32°30′07″N093°39′46″W﻿ / ﻿32.50194°N 93.66278°...,Air Force Global Strike Command,,2nd Bomb Wing,The2nd Bomb Wingand307th Bomb Wingoperate theB...,32.50194; -93.66278
...,...,...,...,...,...,...,...,...,...
163,Warfield Air National Guard Base,Middle River,Maryland,39°19′32.38″N76°24′49.55″W﻿ / ﻿39.3256611°N 76...,Maryland Air National Guard,,175th Wing,Airfield shared withMartin State Airport. The1...,39.3256611; -76.4137639
164,Westover Air Reserve Base,Chicopee,Massachusetts,42°11′38″N072°32′05″W﻿ / ﻿42.19389°N 72.53472°...,Air Force Reserve Command,,439th Airlift Wing,The439th Airlift Wingoperates theC-5M Super Ga...,42.19389; -72.53472
165,Will Rogers Air National Guard Base,Oklahoma City,Oklahoma,35°23′35″N097°36′03″W﻿ / ﻿35.39306°N 97.60083°...,Oklahoma Air National Guard,,137th Special Operations Wing,Airfield shared withWill Rogers World Airport....,35.39306; -97.60083
166,Wyoming Air National Guard Base,Cheyenne,Wyoming,41°09′41″N104°49′10″W﻿ / ﻿41.16139°N 104.81944...,Wyoming Air National Guard,,153d Airlift Wing,Airfield shared withCheyenne Regional Airport....,41.16139; -104.81944


## Step 4: Clean the DataFrame

### Step 4a: Split the 'geo_coordinates' column into two new columns

In [13]:
# Split the 'geo_coordinates' column into 'latitude' and 'longitude'
combined_df[['latitude', 'longitude']] = combined_df['geo_coordinates'].str.split(';', expand=True)

In [14]:
# Ensure that both columns are free of any unwanted whitespace(s)
combined_df['latitude'] = combined_df['latitude'].str.strip()
combined_df['longitude'] = combined_df['longitude'].str.strip()

### Step 4b: Drop unnecessary columns

In [15]:
# Drop the columns that are no longer needed
combined_df = combined_df.drop(columns=[
    'coordinates',
    'geo_coordinates',
    'wing_or_unit_emblem'
])

combined_df.head()

Unnamed: 0,name,location,state_or_area,commanding_organization,host_wing_or_primary_unit,primary_missions_and_units,latitude,longitude
0,Altus Air Force Base,Altus,Oklahoma,Air Education and Training Command,97th Air Mobility Wing,The97th Air Mobility Wingtrains crews to opera...,34.66639,-99.26806
1,Joint Base Anacostia-Bolling,Southwest,"Washington, D.C.",Air Force District of Washington,11th Wing,USAF operatedjoint base. The11th Wingprovides ...,38.84278,-77.01611
2,Joint Base Andrews-Naval Air Facility Washington,Camp Springs,Maryland,Air Force District of Washington,316th Wing,USAF operatedjoint base. The316th Wingprovides...,38.81083,-76.86694
3,Arnold Air Force Base,Tullahoma,Tennessee,Air Force Materiel Command,Arnold Engineering Development Complex,"Non-flying installation, part of theAir Force ...",35.3925,-86.08583
4,Barksdale Air Force Base,Bossier City,Louisiana,Air Force Global Strike Command,2nd Bomb Wing,The2nd Bomb Wingand307th Bomb Wingoperate theB...,32.50194,-93.66278


## Step 6: Save the DataFrame as a JSON file

In [16]:
# Convert DataFrame to a JSON string
json_str = combined_df.to_json(orient='records')

# Parse the JSON string back into a Python list of dictionaries
data = json.loads(json_str)

# Define the file path
file_path = '../output/all_us_air_force_bases.json'

# Write the JSON file with indentation for better readability
with open(file_path, 'w') as f:
    json.dump(data, f, indent=4)

# Optionally, print the JSON string to the console for verification
# print(json.dumps(data, indent=4))

In [17]:
browser.quit()