In [175]:
import pandas as pd
from data_mountain_query.query import get_ambient_tweets
from data_mountain_query.connection import get_connection
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
from datetime import timedelta
import plotly.express as px
from shapely import Point
import os
import warnings
# NOTE: this silences every warning category for the rest of the session,
# so diagnostics raised by later cells will not be shown.
warnings.filterwarnings("ignore")

Get all the games from the 2011, 2012, 2013, 2014 seasons (September-February) where the team is playing

In [176]:
# Team specific inputs
# Abbreviation compared against the home_team / away_team columns of the
# schedule and interpolated into the output file names.
TEAM_ABBR = "ARI"                     
# Full team name; shown in the map title.
TEAM_NAME = "Arizona Cardinals"       
# Home city label; stored in the results table and used as the map legend entry.
CITY_NAME = "Phoenix"                 

# Home city center in decimal degrees; wrapped in an EPSG:4326 point before
# being projected for distance calculations.
CITY_CENTER_LAT = 33.4483
CITY_CENTER_LON = -112.0725

# Share of cumulative metro population (metros ordered by distance from the
# city center) at which the outer "baseline" region starts.
POP_BASELINE_FRAC = 0.7

In [177]:
# Read the full schedule and parse game dates (month/day/two-digit-year strings)
games = pd.read_csv("/Users/elisabethkollrack/Thesis/EK-thesis/games.csv")
games['gameday'] = pd.to_datetime(games['gameday'], format='%m/%d/%y')

# Keep 2011-2014 games in which the team is either the home or the away side,
# ordered chronologically within each season
in_study_seasons = games['season'].between(2011, 2014)
team_is_playing = games[['home_team', 'away_team']].eq(TEAM_ABBR).any(axis=1)
team_games = games.loc[in_study_seasons & team_is_playing].sort_values(['season', 'gameday'])

In [178]:
# Open the tweet database connection. `collection` is what the tweet queries
# below are run against; `client` is not used again in the cells shown here.
# NOTE(review): geotweets=True presumably selects the geotagged collection —
# confirm against the data_mountain_query documentation.
collection, client = get_connection(geotweets=True)

Connecting on mgmt1.vacc.uvm.edu


Find all the tweets (and their corresponding seasons) that mentioned a team-related anchor within 3 days before or after a game

In [179]:
%%capture

all_tweets = []
for index, game in team_games.iterrows():
    gameday = game['gameday']
    
    # Determine opponent
    if game['home_team'] == TEAM_ABBR:
        opponent = game['away_team']
    else:
        opponent = game['home_team']

    anchors = [
        "#ArizonaCardinals", 
        "#Cardinals",
        f"#ARIvs{opponent}",
        f"#{opponent}vsARI"
    ]

    # 3 days before and after game
    start_date = gameday - timedelta(days=3)
    end_date = gameday + timedelta(days=3)
    dates = pd.date_range(start_date, end_date, freq='D')


    # go through all the anchors and get tweets and season
    for anchor in anchors:
        tweets_list1 = [t for t in get_ambient_tweets(anchor, dates, collection)]
        for t in tweets_list1:
            t["season"] = game["season"]
        all_tweets.extend(tweets_list1)

Save the tweets to a dataframe, remove missing values and duplicates (tweets with multiple anchors) and extract the geo coordinates

In [180]:
# One row per tweet: a tweet fetched under several anchors is kept once, and
# rows lacking a timestamp or a geo payload are discarded.
geo_df = (
    pd.DataFrame(all_tweets)
    .drop_duplicates(subset="_id")
    .dropna(subset=["tweet_created_at", "geo"])
)

# Parse timestamps; values that cannot be parsed become NaT
geo_df["tweet_created_at"] = pd.to_datetime(geo_df["tweet_created_at"], errors="coerce")

# Pull lon (position 0) and lat (position 1) out of dict-valued geo entries;
# anything that is not a dict yields None and is dropped right after.
for column, position in (("lon", 0), ("lat", 1)):
    geo_df[column] = geo_df["geo"].apply(
        lambda geo, i=position: geo["coordinates"][i] if isinstance(geo, dict) else None
    )
geo_df = geo_df.dropna(subset=["lon", "lat"])

# Attach point geometries; the coordinates are declared as WGS84 (EPSG:4326)
tweet_points = gpd.points_from_xy(geo_df.lon, geo_df.lat)
tweets_gdf = gpd.GeoDataFrame(geo_df, geometry=tweet_points, crs="EPSG:4326")

Load U.S. metropolitan area (CBSA) boundaries and merge them with Census population estimates for Metropolitan Statistical Areas (2011–2014). Population is also averaged across these years for the overall radius calculation. The boundaries are then restricted to Metropolitan Statistical Areas (LSAD = “M1”), which excludes micropolitan areas. The tweet GeoDataFrame built in the previous cell is spatially joined to these metropolitan areas in the cells that follow.

In [181]:
# CBSA boundary polygons (2014 TIGER/Line shapefile)
cities = gpd.read_file(
    "/Users/elisabethkollrack/Thesis/EK-Thesis/tl_2014_us_cbsa/tl_2014_us_cbsa.shp"
)

# Census population estimates; CBSA codes are read as strings so they can be
# matched against the shapefile's CBSAFP column
pop_data = pd.read_csv(
    "/Users/elisabethkollrack/Thesis/EK-Thesis/cbsa_population.csv",
    encoding="latin1",
    dtype={"CBSA": str}
)

# Source estimate column -> per-season population column
population_columns = {
    "POPESTIMATE2011": "population_2011",
    "POPESTIMATE2012": "population_2012",
    "POPESTIMATE2013": "population_2013",
    "POPESTIMATE2014": "population_2014",
}

# Metropolitan Statistical Area rows only, with the estimate columns renamed
is_msa_row = pop_data["LSAD"] == "Metropolitan Statistical Area"
pop_metro = (
    pop_data.loc[is_msa_row, ["CBSA", *population_columns]]
    .copy()
    .rename(columns=population_columns)
)

# Row-wise mean of the four yearly estimates, used for the overall radius
pop_metro["population_avg_2011_2014"] = pop_metro[
    list(population_columns.values())
].mean(axis=1)

# Left-join population onto the polygons (CBSAFP <-> CBSA), then keep only
# polygons whose LSAD code is "M1"
cities = cities.merge(pop_metro, left_on="CBSAFP", right_on="CBSA", how="left")
cities = cities[cities["LSAD"] == "M1"].copy()


For each NFL season, the game-window tweets collected above (within 3 days before or after each of the team’s games) are selected by their season label and spatially joined to metropolitan areas. Distances from the team’s home city center are calculated after projecting to a meters-based coordinate system (EPSG:5070). Tweet activity is aggregated by metro area and normalized by that season’s population (tweets per 100,000 residents). Metropolitan areas are ordered by mean tweet distance, and cumulative population is used to define a baseline outer region: the metros at or beyond the distance at which cumulative population first reaches a fixed fraction (`POP_BASELINE_FRAC`) of the total. The fandom radius for each season is defined as the minimum distance at which tweet activity falls to or below the mean activity of this baseline region.

In [182]:
# Project city center to meters once (EPSG:5070) so every distance below is
# measured against the same point
city_proj = gpd.GeoSeries(
    [Point(CITY_CENTER_LON, CITY_CENTER_LAT)],
    crs="EPSG:4326"
).to_crs(epsg=5070).iloc[0]

# Ensure tweets and metro polygons share CRS for spatial join
if tweets_gdf.crs != cities.crs:
    tweets_gdf = tweets_gdf.to_crs(cities.crs)

results = []

# Compute fandom radius per season (game-window tweets)
for season in sorted(tweets_gdf["season"].dropna().unique()):

    season = int(season)
    pop_col = f"population_{season}"  # season-specific population column
    season_tweets = tweets_gdf[tweets_gdf["season"] == season].copy()

    # Assign tweets to metropolitan areas (tweets outside every metro are dropped)
    tweets_with_city = gpd.sjoin(
        season_tweets,
        cities,
        how="inner",
        predicate="within"
    )

    # Project to meters for distance calculations
    tweets_proj = tweets_with_city.to_crs(epsg=5070)

    # Distance from team city center (km)
    tweets_proj["distance_km"] = tweets_proj.geometry.distance(city_proj) / 1000

    # Aggregate by metro area; metros without a population estimate are dropped
    metro_dist = (
        tweets_proj.groupby(["CBSAFP", "NAME"])
        .agg(
            tweet_count=("geometry", "count"),
            mean_distance_km=("distance_km", "mean"),
            population=(pop_col, "first")
        )
        .reset_index()
        .dropna(subset=["population"])
    )

    # Normalize tweet activity by population
    metro_dist["tweets_per_100k"] = (
        metro_dist["tweet_count"] / metro_dist["population"] * 100_000
    )

    # Order by distance and compute cumulative population
    metro_dist = metro_dist.sort_values("mean_distance_km")
    metro_dist["cum_population"] = metro_dist["population"].cumsum()

    # Baseline region starts at the nearest metro whose cumulative population
    # reaches POP_BASELINE_FRAC of the total
    total_pop = metro_dist["population"].sum()
    pop_baseline = POP_BASELINE_FRAC * total_pop

    baseline_start_dist = metro_dist.loc[
        metro_dist["cum_population"] >= pop_baseline, "mean_distance_km"
    ].min()

    # Baseline activity = mean normalized activity of the outer region
    baseline_activity = metro_dist.loc[
        metro_dist["mean_distance_km"] >= baseline_start_dist, "tweets_per_100k"
    ].mean()

    # Fandom radius = first distance where activity drops to baseline or below
    R_km = metro_dist.loc[
        metro_dist["tweets_per_100k"] <= baseline_activity, "mean_distance_km"
    ].min()

    # If no drop to baseline, set radius to max distance
    if pd.isna(R_km):
        R_km = metro_dist["mean_distance_km"].max()
        print("No drop to baseline; setting radius to max distance.")

    # Store results (n_tweets counts all season tweets, including those that
    # did not fall inside any metro polygon)
    results.append({
        "team": TEAM_ABBR,
        "city": CITY_NAME,
        "season": season,
        "radius_km": R_km,
        "n_tweets": len(season_tweets),
        "n_metros": len(metro_dist)
    })

    # R_km is still NaN when no metro with a population estimate received any
    # tweets this season; int(NaN) would raise, so report it as undefined
    radius_label = "undefined" if pd.isna(R_km) else f"{int(R_km)} km"
    print(f"Season {season}: radius = {radius_label}, tweets = {len(season_tweets)}")

# Write the per-season table once, after all seasons have been processed
# (skipped when there were no seasons, so no empty file is created)
results_df = pd.DataFrame(results)
if results:
    results_df.to_csv(f"{TEAM_ABBR}_fandom_radius_by_season.csv", index=False)

Season 2011: radius = 410 km, tweets = 369
Season 2012: radius = 408 km, tweets = 463
Season 2013: radius = 185 km, tweets = 732
Season 2014: radius = 419 km, tweets = 609


This block computes an overall fandom radius using all tweets collected within 3 days of games across all seasons. Tweets are spatially joined to metropolitan areas, projected to a meters-based coordinate system, and distances from the team’s home city center are calculated. Tweet activity is aggregated by metro area and normalized by population using average metropolitan populations (2011–2014). Metropolitan areas are ordered by distance and cumulative population is used to define an outer baseline region. The overall fandom radius is defined as the minimum distance at which normalized tweet activity falls to or below this baseline level.

In [183]:
# Overall fandom radius using all game-window tweets
tweets_overall = tweets_gdf.copy()


# Assign tweets to metropolitan areas (tweets outside every metro are dropped)
tweets_with_city = gpd.sjoin(
    tweets_overall,
    cities,
    how="inner",
    predicate="within"
)

# Project to meters for distance calculation
tweets_proj = tweets_with_city.to_crs(epsg=5070)

# Distance from team city center (km)
tweets_proj["distance_km"] = tweets_proj.geometry.distance(city_proj) / 1000

# Aggregate by metro using average population; metros without a population
# estimate are dropped
metro_dist = (
    tweets_proj.groupby(["CBSAFP", "NAME"])
    .agg(
        tweet_count=("geometry", "count"),
        mean_distance_km=("distance_km", "mean"),
        population=("population_avg_2011_2014", "first")
    )
    .reset_index()
    .dropna(subset=["population"])
)

# Normalize tweet activity by population
metro_dist["tweets_per_100k"] = (
    metro_dist["tweet_count"] / metro_dist["population"] * 100_000
)

# Order by distance and compute cumulative population
metro_dist = metro_dist.sort_values("mean_distance_km")
metro_dist["cum_population"] = metro_dist["population"].cumsum()

# Baseline region starts at the nearest metro whose cumulative population
# reaches POP_BASELINE_FRAC of the total
total_pop = metro_dist["population"].sum()
pop_baseline = POP_BASELINE_FRAC * total_pop

baseline_start_dist = metro_dist.loc[
    metro_dist["cum_population"] >= pop_baseline, "mean_distance_km"
].min()

# Baseline activity = mean normalized activity of the outer region
baseline_activity = metro_dist.loc[
    metro_dist["mean_distance_km"] >= baseline_start_dist, "tweets_per_100k"
].mean()

# Overall fandom radius = first distance where activity drops to baseline or below
R_km_overall = metro_dist.loc[
    metro_dist["tweets_per_100k"] <= baseline_activity, "mean_distance_km"
].min()

# If no drop to baseline, set radius to max distance.
# This must test and assign R_km_overall: R_km is only the value left over
# from the last iteration of the per-season loop.
if pd.isna(R_km_overall):
    R_km_overall = metro_dist["mean_distance_km"].max()
    print("No drop to baseline; setting radius to max distance.")


# R_km_overall is still NaN when no metro with a population estimate received
# any tweets; int(NaN) would raise, so report it as undefined
radius_label = "undefined" if pd.isna(R_km_overall) else f"{int(R_km_overall)} km"
print(f"Overall: radius = {radius_label}, tweets = {len(tweets_overall)}")


Overall: radius = 411 km, tweets = 2173


This block visualizes the overall fandom radius on a U.S. map. A geodesic circle is generated around the team’s home city using the estimated fandom radius, accounting for Earth’s curvature. Metropolitan areas are plotted at their mean tweet coordinates, with marker size scaled by population-normalized tweet activity. The team’s home city and the estimated fandom boundary are overlaid to illustrate the spatial extent of fan engagement across the United States.

In [184]:
def geodesic_circle(lat, lon, radius_km, n_points=360):
    """
    Return the outline of a circle drawn on a spherical Earth.

    Parameters
    ----------
    lat, lon : float
        Circle center in decimal degrees.
    radius_km : float
        Radius measured along the sphere's surface, in kilometres.
    n_points : int, default 360
        Number of outline points. Bearings run from 0 to 2*pi inclusive,
        so the last point coincides with the first and the ring is closed.

    Returns
    -------
    (lats, lons) : tuple of numpy.ndarray
        Outline coordinates in decimal degrees. Longitudes are not wrapped
        into [-180, 180].
    """
    earth_radius_km = 6371.0

    # Radius expressed as an angle subtended at the Earth's center
    angular_radius = radius_km / earth_radius_km
    sin_d = np.sin(angular_radius)
    cos_d = np.cos(angular_radius)

    # Center point in radians, with its sine/cosine computed once
    center_lat = np.radians(lat)
    center_lon = np.radians(lon)
    sin_lat = np.sin(center_lat)
    cos_lat = np.cos(center_lat)

    # Bearings swept around the center
    bearings = np.linspace(0, 2 * np.pi, n_points)

    # Destination point reached from the center along each bearing
    ring_lats = np.arcsin(sin_lat * cos_d + cos_lat * sin_d * np.cos(bearings))

    lon_offset = np.arctan2(
        np.sin(bearings) * sin_d * cos_lat,
        cos_d - sin_lat * np.sin(ring_lats),
    )
    ring_lons = center_lon + lon_offset

    return np.degrees(ring_lats), np.degrees(ring_lons)


# Outline of the overall fandom radius around the home city
circle_lat, circle_lon = geodesic_circle(CITY_CENTER_LAT, CITY_CENTER_LON, R_km_overall)

# Mean tweet position per metro (used only to place the markers)
metro_keys = ["CBSAFP", "NAME"]
metro_coords = (
    tweets_with_city.groupby(metro_keys)[["lat", "lon"]]
    .mean()
    .reset_index()
)

# Attach marker positions to the per-metro activity statistics
metro_summary = pd.merge(metro_dist, metro_coords, on=metro_keys)

# Base layer: one marker per metro, sized by tweets per 100k residents
map_title = f"Geographic Radius of {TEAM_NAME} Fandom (2011–2014 Overall)"
fig = px.scatter_geo(
    metro_summary,
    lat="lat",
    lon="lon",
    size="tweets_per_100k",
    hover_name="NAME",
    title=map_title,
    scope="usa",
)

# Home city marker: add the trace, then restyle the copy now held by the figure
home_city_trace = px.scatter_geo(lat=[CITY_CENTER_LAT], lon=[CITY_CENTER_LON]).data[0]
fig.add_trace(home_city_trace)
fig.data[-1].update(
    marker={"size": 8, "color": "red"},
    name=CITY_NAME,
    showlegend=True,
)

# Fandom radius outline, labelled with the radius truncated to whole km
radius_trace = px.line_geo(lat=circle_lat, lon=circle_lon).data[0]
fig.add_trace(radius_trace)
fig.data[-1].update(
    name=f"Fandom radius ≈ {int(R_km_overall)} km",
    showlegend=True,
)

# Center the title and enlarge it, then render
fig.update_layout(title_x=0.5, title_font_size=20)
fig.show()

# Export the interactive figure as a standalone HTML file
fig.write_html(
    f"/Users/elisabethkollrack/Thesis/EK-Thesis/Fandom Radii/Interactive Graphs/{TEAM_ABBR}_fandom_radius.html"
)
