In [7]:
import os
import urllib.request
import time

import pandas as pd
from bs4 import BeautifulSoup
import requests
from geopy.distance import geodesic
from geopy.geocoders import ArcGIS

# Page holding the visa wait-time table; it is downloaded once and parsed from a local copy.
URL = "https://travel.state.gov/content/travel/en/us-visas/visa-information-resources/global-visa-wait-times.html"

In [8]:
# All downloaded and derived files live in ./data under the current working directory.
data_dir = os.path.join(os.getcwd(), "data")
# exist_ok avoids the check-then-create race and keeps the cell safely re-runnable.
os.makedirs(data_dir, exist_ok=True)

# The page is cached under its own basename so re-running the notebook does
# not hit the network again.
html_path = os.path.join(data_dir, os.path.basename(URL))
if not os.path.exists(html_path):
    # Download to a temporary name and rename only on success: an interrupted
    # transfer must not leave a truncated file that later runs would treat as
    # a valid cache.
    partial_path = html_path + ".part"
    urllib.request.urlretrieve(URL, partial_path)
    os.replace(partial_path, html_path)

# Destination path for the tabular data as CSV.
csv_path = os.path.join(data_dir, "visa_wait_times.csv")

In [9]:
# Load the locally cached page into a parse tree.
with open(html_path, "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")

# Parse the table: only the first <table> element on the page is used.
table = soup.find("table")
rows = table.find_all("tr")

# Extract table headers from the first row. Header cells that begin with an
# "Interview Required" line have that line dropped; the remaining text is kept
# verbatim (so characters such as non-breaking spaces stay in the names).
prefix_to_remove = "Interview Required\n"
headers = [
    text[len(prefix_to_remove):] if text.startswith(prefix_to_remove) else text
    for text in (cell.text.strip() for cell in rows[0].find_all("th"))
]


In [10]:
# Inspect the cleaned column headers.
headers

['City/Post',
 'Student/Exchange Visitors (F, M, J)',
 'Petition-Based Temporary Workers (H, L, O, P, Q)',
 'Crew and Transit\xa0(C, D, C1/D)',
 'Visitors (B1/B2)']

In [11]:
# Collect the stripped text of every <td> cell, one list per row after the header row.
data = []
for row in rows[1:]:
    cols = row.find_all("td")
    cols = [col.text.strip() for col in cols]
    data.append(cols)

# Create DataFrame
df = pd.DataFrame(data, columns=headers)

# Clean city names: keep only the text before the first "(".
df["City/Post"] = df["City/Post"].str.split("(").str[0].str.strip()

# Keep only rows whose B1/B2 wait is reported as "<N> Day" / "<N> Days".
days_re = r'^(\d+) Days?$'
# na=False makes missing cells count as "no match" instead of yielding an NA
# mask, which boolean indexing rejects. .copy() gives an independent frame so
# the column assignments below do not write into a slice of the unfiltered one
# (the source of SettingWithCopyWarning).
df = df[df['Visitors (B1/B2)'].str.match(days_re, na=False)].copy()

# Convert every wait-time column (everything after City/Post) to float days.
for col in df.columns[1:]:
    # "<N> Day(s)" -> "<N>"
    df[col] = df[col].replace(days_re, r'\1', regex=True)
    # replace empty values with inf
    df[col] = df[col].replace("", float("inf"))
    df[col] = df[col].replace("Same Day", 0.)
    # Any other text left in the column makes this conversion raise.
    df[col] = df[col].astype(float)

In [12]:
# Number of rows left after keeping only posts with a B1/B2 wait given in days.
len(df)

205

In [13]:
# Preview the cleaned table: waits are float days, inf marks a blank cell, 0 is "Same Day".
df.head()

Unnamed: 0,City/Post,"Student/Exchange Visitors (F, M, J)","Petition-Based Temporary Workers (H, L, O, P, Q)","Crew and Transit (C, D, C1/D)",Visitors (B1/B2)
0,Abidjan,78.0,1.0,2.0,266.0
1,Abu Dhabi,35.0,43.0,inf,416.0
2,Abuja,190.0,28.0,521.0,521.0
3,Accra,390.0,75.0,117.0,440.0
5,Addis Ababa,93.0,156.0,54.0,156.0


In [14]:
# Get coordinates for each city
# A single ArcGIS geocoder instance is created here and shared by get_coordinates.
geolocator = ArcGIS()
def get_coordinates(city):
    """Look up ``city`` with the module-level ``geolocator``.

    Args:
        city: Place name passed unchanged to ``geolocator.geocode``.

    Returns:
        A ``(latitude, longitude)`` tuple, or ``(None, None)`` when the lookup
        raises an error or finds no match.
    """
    try:
        location = geolocator.geocode(city)
    except Exception:
        # Best-effort lookup: a failed request simply yields no coordinates.
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt/SystemExit stop a long geocoding run.
        location = None
    # Pause after every request, failed ones included, so calls stay evenly spaced.
    time.sleep(1)
    if location is None:
        # geocode returned no result for this name (or the request failed).
        return None, None
    return location.latitude, location.longitude

# Geocode every post name; each result tuple is expanded into a two-column row.
coords = df['City/Post'].apply(lambda city: pd.Series(get_coordinates(city)))
df[['Latitude', 'Longitude']] = coords


In [None]:
# Preview the table with the Latitude/Longitude columns added.
df.head()