<a href="https://colab.research.google.com/github/gillbatesiii/m2m-capstone1-borderdata/blob/master/capstone1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Analysis on land border crossings from Canada to the US

## Initial setup

In [2]:
import pandas as pd
from sodapy import Socrata
from typing import Final
from google.colab import userdata


In [3]:
# Constants
APP_TOKEN: Final = userdata.get('CAPSTONE1_SOCRATA_APP_TOKEN')
WHERE_CLAUSE: Final = "border = 'US-Canada Border' AND date >= '2017-01-01'"
DATASET_IDENTIFIER: Final = "keg4-3bc2"

In [5]:
app_token_status = ""
if APP_TOKEN:
    app_token_status = "Successfully loaded app token."
else:
    app_token_status = "Warning: APP_TOKEN not found."
print(app_token_status)

Successfully loaded app token.


In [6]:
client = Socrata("data.bts.gov", APP_TOKEN)


## Data Fetching

In [8]:
# Fetch number of expected results for desired Sodapy query
row_count = client.get(
    DATASET_IDENTIFIER,
    query=f"SELECT count(*) WHERE {WHERE_CLAUSE}",
)[0]["count"]
print("row_count", row_count)

row_count 57534


In [9]:
row_count = int(row_count)
results = client.get(DATASET_IDENTIFIER, limit=row_count, where=WHERE_CLAUSE)
client.close()
print("length", len(results))


length 57534


In [17]:
# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)
results_df

Unnamed: 0,port_name,state,port_code,border,date,measure,value,latitude,longitude,point
0,International Falls,Minnesota,3604,US-Canada Border,2017-01-01T00:00:00.000,Buses,18,48.6078,-93.401355,"{'type': 'Point', 'coordinates': [-93.401355, ..."
1,Metaline Falls,Washington,3025,US-Canada Border,2017-01-01T00:00:00.000,Personal Vehicles,1857,48.999972,-117.299444,"{'type': 'Point', 'coordinates': [-117.299444,..."
2,Alcan,Alaska,3104,US-Canada Border,2017-01-01T00:00:00.000,Bus Passengers,0,62.614961,-141.001444,"{'type': 'Point', 'coordinates': [-141.001444,..."
3,Westhope,North Dakota,3419,US-Canada Border,2017-01-01T00:00:00.000,Personal Vehicles,490,48.999611,-101.017277,"{'type': 'Point', 'coordinates': [-101.017277,..."
4,Calais,Maine,0115,US-Canada Border,2017-01-01T00:00:00.000,Rail Containers Loaded,60,45.188548,-67.275381,"{'type': 'Point', 'coordinates': [-67.275381, ..."
...,...,...,...,...,...,...,...,...,...,...
57529,Point Roberts,Washington,3017,US-Canada Border,2025-06-01T00:00:00.000,Buses,28,49.0020555547,-123.068055556,"{'type': 'Point', 'coordinates': [-123.0680555..."
57530,Ogdensburg,New York,0701,US-Canada Border,2025-06-01T00:00:00.000,Personal Vehicle Passengers,29047,44.7330898624,-75.4577501759,"{'type': 'Point', 'coordinates': [-75.45775017..."
57531,Limestone,Maine,0118,US-Canada Border,2025-06-01T00:00:00.000,Personal Vehicle Passengers,1228,46.924555,-67.789597,"{'type': 'Point', 'coordinates': [-67.789597, ..."
57532,Van Buren,Maine,0108,US-Canada Border,2025-06-01T00:00:00.000,Truck Containers Empty,584,47.159645,-67.930799,"{'type': 'Point', 'coordinates': [-67.930799, ..."


## Data cleaning

### Remove unneeded columns

In [18]:
# remove unneeded columns
results_df.drop(columns=["point", "latitude", "longitude"], inplace=True)
results_df

Unnamed: 0,port_name,state,port_code,border,date,measure,value
0,International Falls,Minnesota,3604,US-Canada Border,2017-01-01T00:00:00.000,Buses,18
1,Metaline Falls,Washington,3025,US-Canada Border,2017-01-01T00:00:00.000,Personal Vehicles,1857
2,Alcan,Alaska,3104,US-Canada Border,2017-01-01T00:00:00.000,Bus Passengers,0
3,Westhope,North Dakota,3419,US-Canada Border,2017-01-01T00:00:00.000,Personal Vehicles,490
4,Calais,Maine,0115,US-Canada Border,2017-01-01T00:00:00.000,Rail Containers Loaded,60
...,...,...,...,...,...,...,...
57529,Point Roberts,Washington,3017,US-Canada Border,2025-06-01T00:00:00.000,Buses,28
57530,Ogdensburg,New York,0701,US-Canada Border,2025-06-01T00:00:00.000,Personal Vehicle Passengers,29047
57531,Limestone,Maine,0118,US-Canada Border,2025-06-01T00:00:00.000,Personal Vehicle Passengers,1228
57532,Van Buren,Maine,0108,US-Canada Border,2025-06-01T00:00:00.000,Truck Containers Empty,584


In [19]:
# Null values
nulls_df = results_df[results_df.isnull().any(axis=1)]
print("Null values")
print(nulls_df)

# As of 8/17/2025, the "state" field of the new port of entry "Chief Mountain Mt Poe" hasn't been populated yet.
# Will set "state" to "MT" for all records with port_code == "3315"
results_df["port_code"] = results_df["port_code"].astype(str)
results_df.loc[results_df["port_code"] == "3315", "state"] = "MT"

# Check null values again
print("Check null values again")
print(results_df[results_df.isnull().any(axis=1)])

print("Check rows with port code 3315")
print(results_df.loc[results_df["port_code"] == "3315"])

Null values
                   port_name state port_code            border  \
57091  Chief Mountain Mt Poe   NaN      3315  US-Canada Border   
57199  Chief Mountain Mt Poe   NaN      3315  US-Canada Border   

                          date                      measure value  
57091  2025-06-01T00:00:00.000  Personal Vehicle Passengers  8844  
57199  2025-06-01T00:00:00.000            Personal Vehicles  3554  
Check null values again
Empty DataFrame
Columns: [port_name, state, port_code, border, date, measure, value]
Index: []
Check rows with port code 3315
                   port_name state port_code            border  \
57091  Chief Mountain Mt Poe    MT      3315  US-Canada Border   
57199  Chief Mountain Mt Poe    MT      3315  US-Canada Border   

                          date                      measure value  
57091  2025-06-01T00:00:00.000  Personal Vehicle Passengers  8844  
57199  2025-06-01T00:00:00.000            Personal Vehicles  3554  


In [None]:
# Sanity check
print(results_df)

# New Section