In [1]:
import pandas as pd
from data_mountain_query.query import get_ambient_tweets
from data_mountain_query.connection import get_connection
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
from datetime import timedelta
import plotly.express as px
from shapely import Point

Get all the games from the 2013, 2014, 2015, 2016, 2017 regular seasons where the team is playing

In [2]:
# Team specific inputs
TEAM_ABBR = "NYJ"                     
TEAM_NAME = "New York Jets"      
CITY_NAME = "New York"                 

CITY_CENTER_LAT = 40.7128
CITY_CENTER_LON = -74.0060

MIN_POP = 50_000
POP_BASELINE = 50_000_000

In [3]:
games = pd.read_csv("/Users/elisabethkollrack/Thesis/EK-thesis/games.csv")
games = games[games['game_type'] == 'REG']
games['gameday'] = pd.to_datetime(games['gameday'], format='%m/%d/%y')

team_games = games[
    (games['season'] >= 2013) & (games['season'] <= 2017) &
    ((games['home_team'] == TEAM_ABBR) | (games['away_team'] == TEAM_ABBR))
].sort_values(['season', 'gameday'])


In [4]:
collection, client = get_connection(geotweets=True)

Connecting on mgmt1.vacc.uvm.edu


Find all the tweets that mentioned a team-related anchor within 3 days before or after a game

In [5]:
%%capture

all_tweets = []
for index, game in team_games.iterrows():
    gameday = game['gameday']
    
    # Determine opponent
    if game['home_team'] == TEAM_ABBR:
        opponent = game['away_team']
    else:
        opponent = game['home_team']

    anchors = [
        "#NYJets", 
        "#Jets",
        f"#NYJvs{opponent}",
        f"#{opponent}vsNYJ"
    ]

    start_date = gameday - timedelta(days=3)
    end_date = gameday + timedelta(days=3)
    dates = pd.date_range(start_date, end_date, freq='D')

    for anchor in anchors:
        tweets_list1 = [t for t in get_ambient_tweets(anchor, dates, collection)]
        all_tweets.extend(tweets_list1)



Save the tweets to a dataframe and extract the geo coordinates

In [6]:
geo_df = pd.DataFrame(all_tweets)

# Extract lon and lat from the 'geo' column
geo_df['lon'] = geo_df['geo'].apply(lambda x: x['coordinates'][0] if isinstance(x, dict) else None)
geo_df['lat'] = geo_df['geo'].apply(lambda x: x['coordinates'][1] if isinstance(x, dict) else None)

# Convert tweets to geoframe
tweets_gdf = gpd.GeoDataFrame(
    geo_df,
    geometry=gpd.points_from_xy(geo_df.lon, geo_df.lat),
    crs="EPSG:4326"
)

Load U.S. metropolitan area boundaries and merges them with 2024 metropolitan population estimates, keeping only major metropolitan areas. Then converts tweet coordinates into a GeoDataFrame and spatially joins each tweet to the metro area it falls within.

In [7]:
cities = gpd.read_file(
    "/Users/elisabethkollrack/Thesis/EK-Thesis/tl_2024_us_cbsa/tl_2024_us_cbsa.shp"
)

pop_data = pd.read_csv(
    "/Users/elisabethkollrack/Thesis/EK-Thesis/cbsa_population.csv",
    encoding="latin1"
)

# Keep metro-level population only
pop_metro = pop_data.loc[
    pop_data["LSAD"] == "Metropolitan Statistical Area",
    ["CBSA", "POPESTIMATE2024"]
].rename(columns={
    "POPESTIMATE2024": "population_2024"
})

# Ensure GEOID and CBSA are strings for merging
cities["GEOID"] = cities["GEOID"].astype(str)
pop_metro["CBSA"] = pop_metro["CBSA"].astype(str)

# Merge population into CBSA geometries
cities = cities.merge(
    pop_metro,
    left_on="GEOID",
    right_on="CBSA",
    how="left"
)

# Keep only major metros
cities = cities[cities["LSAD"] == "M1"]


# Spatial join tweets to metros
tweets_with_city = gpd.sjoin(
    tweets_gdf,
    cities,
    how="left",
    predicate="within"
)


Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4326
Right CRS: EPSG:4269

  tweets_with_city = gpd.sjoin(


In [8]:
city_center = Point(CITY_CENTER_LON, CITY_CENTER_LAT)

# Project tweets to meters
tweets_proj = tweets_with_city.to_crs(epsg=5070)
city_proj = gpd.GeoSeries([city_center], crs="EPSG:4326").to_crs(epsg=5070).iloc[0]

tweets_proj["distance_km"] = tweets_proj.geometry.distance(city_proj) / 1000

# Collapse tweets to metro level
metro_dist = (
    tweets_proj
    .groupby(["GEOID", "NAME"])
    .agg(
        tweet_count=("geometry", "count"),
        mean_distance_km=("distance_km", "mean"),
        population_2024=("population_2024", "first")
    )
    .reset_index()
)

metro_dist = metro_dist.dropna(subset=["population_2024"])
metro_dist = metro_dist[metro_dist["population_2024"] >= MIN_POP].copy()

# Normalize tweets per 100k
metro_dist["tweets_per_100k"] = metro_dist["tweet_count"] / metro_dist["population_2024"] * 100_000

# Sort metros and compute cumulative population
metro_dist = metro_dist.sort_values("mean_distance_km")
metro_dist["cum_population"] = metro_dist["population_2024"].cumsum()


baseline_start_dist = metro_dist.loc[metro_dist["cum_population"] >= POP_BASELINE, "mean_distance_km"].iloc[0]
baseline_activity = metro_dist.loc[metro_dist["mean_distance_km"] >= baseline_start_dist, "tweets_per_100k"].mean()

# Fandom radius
R_km = metro_dist.loc[metro_dist["tweets_per_100k"] <= baseline_activity, "mean_distance_km"].min()

# Geodesic circle
def geodesic_circle(lat, lon, radius_km, n_points=360):
    earth_radius_km = 6371.0
    angles = np.linspace(0, 2*np.pi, n_points)

    lat_rad = np.radians(lat)
    lon_rad = np.radians(lon)

    circle_lats = np.arcsin(
        np.sin(lat_rad) * np.cos(radius_km / earth_radius_km) +
        np.cos(lat_rad) * np.sin(radius_km / earth_radius_km) * np.cos(angles)
    )

    circle_lons = lon_rad + np.arctan2(
        np.sin(angles) * np.sin(radius_km / earth_radius_km) * np.cos(lat_rad),
        np.cos(radius_km / earth_radius_km) - np.sin(lat_rad) * np.sin(circle_lats)
    )

    return np.degrees(circle_lats), np.degrees(circle_lons)

circle_lat, circle_lon = geodesic_circle(CITY_CENTER_LAT, CITY_CENTER_LON, R_km)

# Metro centroids for plotting
metro_coords = (
    tweets_with_city.groupby(["GEOID", "NAME"])
    .agg(lat=("lat", "mean"), lon=("lon", "mean"))
    .reset_index()
)

metro_summary = metro_dist.merge(metro_coords, on=["GEOID", "NAME"])

# Filter for valid metros
metro_summary = metro_summary[
    (metro_summary["tweet_count"] >= 10) &
    (metro_summary["tweets_per_100k"].notna())
].copy()

# Plotly map
fig = px.scatter_geo(
    metro_summary,
    lat="lat",
    lon="lon",
    size="tweets_per_100k",
    hover_name="NAME",
    hover_data={"tweets_per_100k": True},
    title=f"Geographic Radius of {TEAM_NAME} Fandom (2013–2017)",
    scope="usa"
)

# Add geodesic fandom circle
fig.add_trace(px.line_geo(lat=circle_lat, lon=circle_lon).data[0])
fig.data[-1].update(name=f"Fandom radius ≈ {int(R_km)} km", showlegend=True)

# Add city marker
fig.add_trace(px.scatter_geo(lat=[CITY_CENTER_LAT], lon=[CITY_CENTER_LON]).data[0])
fig.data[-1].update(marker=dict(size=14, color='red'), name=CITY_NAME, showlegend=True)

# Map styling
fig.update_geos(
    scope="usa",
    showland=True,
    landcolor="lightgray",
    showocean=True,
    oceancolor="azure",
    showlakes=True,
    lakecolor="azure",
    showrivers=True,
    rivercolor="azure"
)

fig.update_layout(
    title_x=0.5,
    title_font_size=20,
    geo=dict(bgcolor='rgba(0,0,0,0)', landcolor='lightgray', lakecolor='azure')
)


fig.show()

In [9]:
# save figure in its own folder
fig.write_html(f'/Users/elisabethkollrack/Thesis/EK-Thesis/Fandom Radii/Interactive Graphs/{TEAM_ABBR}_fandom_radius.html')
