In [7]:
import pandas as pd
from datetime import datetime
from meteostat import Stations, Daily

In [9]:
# Home ballpark coordinates for every MLB club, keyed by team abbreviation.
# Each value is a (latitude, longitude) pair in decimal degrees.
# NOTE(review): one fixed location is stored per team; if a club changed venues
# during the seasons being analysed (worth confirming for ATL and TEX), the
# earlier seasons would need season-specific coordinates.
STADIUM_LOCATIONS = {
    "ARI": (33.4454, -112.0666),  # Chase Field
    "ATL": (33.8907, -84.4676),  # Truist Park
    "BAL": (39.2837, -76.6216),  # Camden Yards
    "BOS": (42.3464, -71.0970),  # Fenway Park
    "CHC": (41.9481, -87.6555),  # Wrigley Field
    "CHW": (41.8296, -87.6337),  # Rate Field
    "CIN": (39.0972, -84.5069),  # Great American Ballpark
    "CLE": (41.4960, -81.6851),  # Progressive Field
    "COL": (39.7560, -104.9929),  # Coors Field
    "DET": (42.3392, -83.0488),  # Comerica Park
    "HOU": (29.7572, -95.3552),  # Daikin Park
    "KCR": (39.0515, -94.4804),  # Kauffman Stadium
    "LAA": (33.8002, -117.8828),  # Angel Stadium
    "LAD": (34.0736, -118.2398),  # Dodger Stadium
    "MIA": (25.7781, -80.2195),  # LoanDepot Park
    "MIL": (43.0280, -87.9712),  # American Family Field
    "MIN": (44.9816, -93.2778),  # Target Field
    "NYM": (40.7572, -73.8458),  # Citi Field
    "NYY": (40.8295, -73.9265),  # Yankee Stadium
    "OAK": (37.7455, -122.1990),  # Oakland Coliseum
    "PHI": (39.9060, -75.1664),  # Citizens Bank Park
    "PIT": (40.4469, -80.0056),  # PNC Park
    "SDP": (32.7071, -117.1568),  # Petco Park
    "SEA": (47.5913, -122.3325),  # T-Mobile Park
    "SFG": (37.7786, -122.3902),  # Oracle Park
    "STL": (38.6225, -90.1930),  # Busch Stadium
    "TBR": (27.7680, -82.6532),  # Tropicana Field
    "TEX": (32.7476, -97.0841),  # Globe Life Field
    "TOR": (43.6416, -79.389),  # Rogers Centre
    "WSN": (38.8727, -77.0074),  # Nationals Park
}

Gather data for all 30 teams for the specified year range

In [10]:
# Download one season of daily weather per team and year.
# For every (team, year) pair the loop asks Meteostat for the stations nearest to
# the ballpark, then keeps the data from the first station that covers the whole
# season window. Each resulting frame is tagged with its team and year.
team_data = []  # One DataFrame per team/year combination
years = range(2012, 2020)  # 2012-2019

for team, (lat, lon) in STADIUM_LOCATIONS.items():
    for year in years:  # Loop through each year (8 total)
        start = datetime(year, 3, 20)  # March 20th was earliest Opening Day Start
        end = datetime(year, 10, 5)  # October 4th was latest last game of season

        # Number of calendar days in the inclusive window (3/20 through 10/5 -> 200).
        expected_days = (end - start).days + 1

        # Find nearest stations for each Team/Year combo.
        # The inventory filter is the key step: it fixed the LAD issue of 135 observations.
        # Five candidates are fetched because the first station sometimes has no data.
        stations_near_stadium = (
            Stations()
            .nearby(lat, lon)
            .inventory("daily", (start, end))
            .fetch(5)
        )

        station_id = None  # Set once the closest station with actual data is found
        yearly_data = None  # Same situation as above

        # Walk the candidate stations in the order returned and stop at the first
        # one with a complete set of daily rows.
        for potential_station_id in stations_near_stadium.index:
            potential_data = Daily(potential_station_id, start, end).fetch()

            if len(potential_data) >= expected_days:
                station_id = potential_station_id
                yearly_data = potential_data
                break  # Want the data of the station closest to the stadium - don't continue

        # Without this guard a missing station surfaces as an opaque
        # "'NoneType' object does not support item assignment" on the next line.
        if yearly_data is None:
            raise RuntimeError(
                f"No station near {team} ({lat}, {lon}) has at least {expected_days} "
                f"daily observations between {start:%Y-%m-%d} and {end:%Y-%m-%d}"
            )

        yearly_data["year"] = year  # Used for joining tables
        yearly_data["team"] = team  # Used for joining tables
        team_data.append(yearly_data)


# Combine the per-team/per-year frames into one table and convert to imperial units.
# reset_index() turns the index into a regular column; the rename expects that
# column to be called "time" and exposes it as "date".
weather_data = (
    pd.concat(team_data)
    .reset_index()
    .rename(columns={"time": "date"})
)
weather_data["date"] = pd.to_datetime(weather_data["date"])

# Desired columns. The explicit .copy() yields an independent frame, so the
# in-place unit conversions below do not raise SettingWithCopyWarning.
weather_data = weather_data[["team", "year", "date", "tavg", "tmin", "tmax", "prcp"]].copy()

# Convert tavg, tmin, tmax from Celsius to Fahrenheit
temperature_columns = ["tavg", "tmin", "tmax"]
weather_data[temperature_columns] = (weather_data[temperature_columns] * (9/5)) + 32

# Convert prcp from mm to inches
weather_data["prcp"] = weather_data["prcp"] * 0.0393701


Weather Data Head

In [13]:
# Preview the first five rows of the combined table
weather_data.head(n=5)

Unnamed: 0,team,year,date,tavg,tmin,tmax,prcp
0,ARI,2012,2012-03-20,54.50,42.08,66.02,0.0
1,ARI,2012,2012-03-21,61.34,48.02,75.02,0.0
2,ARI,2012,2012-03-22,68.72,53.06,84.02,0.0
3,ARI,2012,2012-03-23,72.50,57.92,87.08,0.0
4,ARI,2012,2012-03-24,74.30,59.00,87.98,0.0
...,...,...,...,...,...,...,...
195,ARI,2012,2012-10-01,90.32,73.94,102.92,0.0
196,ARI,2012,2012-10-02,88.52,73.94,104.00,0.0
197,ARI,2012,2012-10-03,87.44,73.04,102.02,0.0
198,ARI,2012,2012-10-04,85.82,73.94,100.04,0.0


Observing Value Counts and Missing Values

In [5]:
# Overall dimensions of the table as (rows, columns)
table_shape = weather_data.shape
print(table_shape)

# Number of daily observations per team/year, smallest groups listed first
observations_per_team_year = weather_data.groupby(["team", "year"]).size()
observations_per_team_year.sort_values()

(48000, 7)


team  year
ARI   2012    200
OAK   2012    200
      2013    200
      2014    200
      2015    200
             ... 
HOU   2017    200
      2018    200
      2019    200
WSN   2018    200
      2019    200
Length: 240, dtype: int64

In [6]:
# Rows where the precipitation value is missing
prcp_is_missing = weather_data["prcp"].isna()
weather_data.loc[prcp_is_missing]

Unnamed: 0,team,year,date,tavg,tmin,tmax,prcp
1602,ATL,2012,2012-03-22,69.98,66.02,77.00,
1603,ATL,2012,2012-03-23,68.18,64.94,73.04,
1611,ATL,2012,2012-03-31,68.72,62.96,78.08,
1615,ATL,2012,2012-04-04,71.78,62.96,82.94,
1616,ATL,2012,2012-04-05,67.10,60.98,73.04,
...,...,...,...,...,...,...,...
45798,TOR,2016,2016-10-04,63.14,59.00,66.20,
45799,TOR,2016,2016-10-05,63.86,60.80,66.20,
46108,TOR,2018,2018-07-06,70.34,64.58,75.92,
46112,TOR,2018,2018-07-10,77.00,68.36,85.46,


In [12]:
# Write the combined table to disk without the positional index column
output_path = "weather_2012_2019.csv"
weather_data.to_csv(output_path, index=False)