In [13]:
################################################################################                                                                                                                                                              
################################################################################
#
# FILE: baseball-matching.py
#
# BY: Dmitry Sedov 
#
# CREATED: Tue May 19 2020
#
# DESC: This code matches the stadiums to Sname POI dataset.
#
# EXEC:
#      
################################################################################
################################################################################

In [14]:
################################ Libraries #####################################

from bs4 import BeautifulSoup
import requests

import re
import os

import pandas as pd

from sqlalchemy.orm import sessionmaker
import sqlalchemy as db
import psycopg2

import us
from Levenshtein import ratio

################################################################################

In [15]:
################################ Constants #####################################

project_folder = '/home/user/projects/stadiums'
teams_output_folder = os.path.join(project_folder, 'data/baseball/teams/')
games_output_folder = os.path.join(project_folder, 'data/baseball/games/')

################################################################################

In [16]:
############################ PostgreSQL connection #############################

connection = psycopg2.connect(dbname = 'dataname2', 
                              user = 'user')
cursor = connection.cursor()

################################################################################

In [17]:
################################ Constants #####################################

def stadium_search(name, state):
    """ Search for a stadium in the database. """
    
    search_query = f"""
    SELECT 
        sname_place_id,
        location_name,
        naics_code,
        top_category,
        sub_category
    FROM 
        establishments
    WHERE
        state = '{state}'
        AND
        LOWER (location_name) LIKE '%{name.lower()}%'
    ;
    """
    results = pd.read_sql(search_query, con = connection)
    results['stadium'] = name
    
    return results

def visits_exist(sname_place_id):
    visits_exist_query = f"""
    SELECT 
        sname_place_id 
    FROM 
        visits 
    WHERE 
        sname_place_id = '{sname_place_id}'
    """
    cursor.execute(visits_exist_query)
    return (cursor.rowcount)

def visits_total(sname_place_id):
    visits_total_query = f"""
    SELECT 
        SUM(raw_visit_counts) 
    FROM 
        visits 
    WHERE 
        sname_place_id = '{sname_place_id}'
    """
    cursor.execute(visits_total_query)
    return (cursor.fetchone()[0])

################################################################################

In [18]:
def find_potential_matches(row, stadium_column):
    stadium = row[stadium_column]
    state = row['state']
    potential_matches = stadium_search(stadium, state)
    potential_matches['visits_exist'] = potential_matches['sname_place_id'].apply(visits_exist)
    potential_matches['visits_count'] = potential_matches['sname_place_id'].apply(visits_total)
    potential_matches['category_correct'] = potential_matches['naics_code'].apply(lambda x: str(x).startswith('71'))
    potential_matches['name_similarity'] = potential_matches.apply(lambda row: 
                                                               ratio(row['stadium'], row['location_name']), 
                                                               axis = 1)
    potential_matches.sort_values(by = ['visits_exist', 'name_similarity', 'visits_count', 'category_correct'],
                                  ascending = [False, False, False, False],
                                  inplace = True)
    potential_matches.reset_index(inplace = True)
    good_match = potential_matches.loc[0, ['visits_exist', 
                                           'category_correct', 
                                           'name_similarity']].to_list()
    print(potential_matches.loc[0, 'location_name'])
    print(good_match)
    if (good_match[0] >= 1 and good_match[1] and (good_match[2] >= 0.6)):
        return [good_match[0]] + potential_matches.loc[0, ['location_name', 'sname_place_id']].tolist()
    else:
        return None

In [19]:
############################# Import stadiums names ############################

stadiums = pd.read_csv(os.path.join(teams_output_folder,
                                    'baseball_teams.csv')
                      )
towns = pd.read_csv(os.path.join(teams_output_folder,
                                 'baseball_locations.csv')
                   )

towns = towns[['Team', 'City']]

stadiums = pd.merge(stadiums, 
                    towns, how = 'left', 
                    left_on = 'name', 
                    right_on = 'Team', 
                    validate = 'one_to_one')

################################################################################

In [20]:
stadiums['state'] = stadiums['City'].apply(lambda x: x.split(',')[1].strip().lower())
stadiums['city'] = stadiums['City'].apply(lambda x: x.split(',')[0].lower())

In [21]:
stadiums.drop(columns = [x for x in stadiums.columns 
                         if x not in ['name', 
                                      'city', 
                                      'state',
                                      'stadium_2017',
                                      'stadium_2018', 
                                      'stadium_2019'
                                     ]
                        ], 
              inplace = True)

In [22]:
for i, row in stadiums.iterrows():
    try:
        stadiums.loc[i, 'state'] = us.states.lookup(row['state'
                                                       ].replace('.',
                                                                 '')
                                                   ).abbr.lower()
    except AttributeError:
        stadiums.loc[i,'state'] = None

In [23]:
# Find stadium columns
stadium_columns = [x for x in stadiums.columns if 'stadium' in x]
# Fix stadiums names
stadiums[stadium_columns] = stadiums[stadium_columns].replace(to_replace = "Yankee Stadium III",
                                                              value = 'Yankee Stadium')
stadiums[stadium_columns] = stadiums[stadium_columns].replace(to_replace = "Busch Stadium III",
                                                              value = 'Busch Stadium')
stadiums[stadium_columns] = stadiums[stadium_columns].replace(to_replace = "Oakland-Alameda County Coliseum",
                                                              value = 'Oakland Alameda County Coliseum')
# New names to old (as in SG)
stadiums[stadium_columns] = stadiums[stadium_columns].replace(to_replace = "T-Mobile Park",
                                                              value = 'Safeco Field')
stadiums[stadium_columns] = stadiums[stadium_columns].replace(to_replace = "Minute Maid Park and Tropicana Field",
                                                              value = 'Minute Maid Park')
stadiums[stadium_columns] = stadiums[stadium_columns].replace(to_replace = "Oracle Park",
                                                              value = 'AT&T Park')


In [24]:
# Iterate through stadium columns and find matches in SG dataset
for stadium_column in stadium_columns:
    y = stadium_column.split('_')[1]
    m = f'good_match_{y}' 
    stadiums[m] = None
    for i, row in stadiums.iterrows():
        try:
            stadiums.loc[i, m] = find_potential_matches(row, stadium_column)
        except ValueError:
            stadiums.loc[i, m] = None

Chase Field
[26, True, 1.0]
Suntrust Park
[26, True, 0.9230769230769231]
Oriole Park At Camden Yards
[26, True, 0.9629629629629629]
Fenway Park
[26, True, 1.0]
Wrigley Field
[0, True, 1.0]
Guaranteed Rate Field
[26, True, 1.0]
Great American Ball Park
[26, True, 1.0]
Progressive Field
[26, True, 1.0]
Coors Field
[26, True, 1.0]
Comerica Park
[26, True, 1.0]
Minute Maid Park
[26, True, 1.0]
Kauffman Stadium
[26, True, 1.0]
Angel Stadium of Anaheim
[3, True, 1.0]
Dodger Stadium
[26, True, 1.0]
Marlins Park
[26, True, 1.0]
Miller Park
[26, True, 1.0]
Target Field
[26, True, 1.0]
Citi Field
[26, True, 1.0]
Yankee Stadium
[26, True, 1.0]
Oakland Alameda County Coliseum
[0, True, 1.0]
Citizens Bank Park
[26, True, 1.0]
Pnc Park
[26, True, 0.75]
Petco Park
[26, True, 1.0]
At&t Park
[26, True, 0.7777777777777778]
Safeco Field
[26, True, 1.0]
Busch Stadium
[26, True, 1.0]
Tropicana Field
[26, True, 1.0]
Globe Life Park In Arlington
[26, True, 0.9642857142857143]
Nationals Park
[26, True, 1.0]
C

In [25]:
stadiums

Unnamed: 0,name,stadium_2017,stadium_2018,stadium_2019,state,city,good_match_2017,good_match_2018,good_match_2019
0,Arizona Diamondbacks,Chase Field,Chase Field,Chase Field,az,phoenix,"[26, Chase Field, sg:21e64d7375b74fb888bd58477...","[26, Chase Field, sg:21e64d7375b74fb888bd58477...","[26, Chase Field, sg:21e64d7375b74fb888bd58477..."
1,Atlanta Braves,SunTrust Park,SunTrust Park,SunTrust Park,ga,atlanta,"[26, Suntrust Park, sg:2383945c21d348028a91eb9...","[26, Suntrust Park, sg:2383945c21d348028a91eb9...","[26, Suntrust Park, sg:2383945c21d348028a91eb9..."
2,Baltimore Orioles,Oriole Park at Camden Yards,Oriole Park at Camden Yards,Oriole Park at Camden Yards,md,baltimore,"[26, Oriole Park At Camden Yards, sg:faf8ed1be...","[26, Oriole Park At Camden Yards, sg:faf8ed1be...","[26, Oriole Park At Camden Yards, sg:faf8ed1be..."
3,Boston Red Sox,Fenway Park,Fenway Park,Fenway Park,ma,boston,"[26, Fenway Park, sg:d0edd6a67d18439abca845f5d...","[26, Fenway Park, sg:d0edd6a67d18439abca845f5d...","[26, Fenway Park, sg:d0edd6a67d18439abca845f5d..."
4,Chicago Cubs,Wrigley Field,Wrigley Field,Wrigley Field,il,chicago,,,
5,Chicago White Sox,Guaranteed Rate Field,Guaranteed Rate Field,Guaranteed Rate Field,il,chicago,"[26, Guaranteed Rate Field, sg:300bf80a251c461...","[26, Guaranteed Rate Field, sg:300bf80a251c461...","[26, Guaranteed Rate Field, sg:300bf80a251c461..."
6,Cincinnati Reds,Great American Ball Park,Great American Ball Park,Great American Ball Park,oh,cincinnati,"[26, Great American Ball Park, sg:6c574413023a...","[26, Great American Ball Park, sg:6c574413023a...","[26, Great American Ball Park, sg:6c574413023a..."
7,Cleveland Indians,Progressive Field,Progressive Field,Progressive Field,oh,cleveland,"[26, Progressive Field, sg:58884340705b4af194e...","[26, Progressive Field, sg:58884340705b4af194e...","[26, Progressive Field, sg:58884340705b4af194e..."
8,Colorado Rockies,Coors Field,Coors Field,Coors Field,co,denver,"[26, Coors Field, sg:2a7aa0a8aa90416c847d10e62...","[26, Coors Field, sg:2a7aa0a8aa90416c847d10e62...","[26, Coors Field, sg:2a7aa0a8aa90416c847d10e62..."
9,Detroit Tigers,Comerica Park,Comerica Park,Comerica Park,mi,detroit,"[26, Comerica Park, sg:51f13fdec24d456e8fb7b56...","[26, Comerica Park, sg:51f13fdec24d456e8fb7b56...","[26, Comerica Park, sg:51f13fdec24d456e8fb7b56..."


In [26]:
stadiums['sg_id_2017'] = stadiums['good_match_2017'].apply(lambda x: x[2] if x else None)
stadiums['sg_id_2018'] = stadiums['good_match_2018'].apply(lambda x: x[2] if x else None)
stadiums['sg_id_2019'] = stadiums['good_match_2019'].apply(lambda x: x[2] if x else None)

In [27]:
# Export the matching
output_file_path = os.path.join(teams_output_folder, 'baseball_matching.csv')
stadiums[['name', 'sg_id_2017', 'sg_id_2018', 'sg_id_2019']].to_csv(output_file_path, 
                                                                    index = False)