In [1]:
################################################################################                                                                                                                                                              
################################################################################
#
# FILE: football-matching.ipynb
#
# BY: Dmitry Sedov 
#
# CREATED: Tue Jun 2 2020
#
# DESC: This code matches the stadiums to the Sname POI dataset.
#
# EXEC:
#      
################################################################################
################################################################################

In [2]:
################################ Libraries #####################################

from bs4 import BeautifulSoup
import requests

import re
import os

import pandas as pd

from sqlalchemy.orm import sessionmaker
import sqlalchemy as db
import psycopg2

import us
from Levenshtein import ratio

################################################################################

In [3]:
################################ Constants #####################################

# Root of the project; both output folders below are resolved against it.
project_folder = '/home/user/projects/stadiums'
teams_output_folder, games_output_folder = (
    os.path.join(project_folder, subfolder)
    for subfolder in ('data/football/teams/', 'data/football/games/')
)

################################################################################

In [4]:
############################ PostgreSQL connection #############################

# One connection and one cursor are opened here and shared by every query
# helper defined further down in the notebook.
connection = psycopg2.connect(dbname = 'dataname2', user = 'user')
cursor = connection.cursor()

################################################################################

In [5]:
################################ Functions #####################################

def stadium_search(name, state):
    """ Search the establishments table for places matching a stadium name.

    Parameters
    ----------
    name : str
        Stadium name. It is lower-cased and matched as a substring of the
        lower-cased ``location_name`` column.
    state : str
        Value that the ``state`` column must equal.

    Returns
    -------
    pandas.DataFrame
        One row per matching establishment with the columns
        sname_place_id, location_name, naics_code, top_category and
        sub_category, plus a ``stadium`` column holding ``name``.
    """

    # Both values are sent as bound parameters rather than being pasted into
    # the SQL text, so a quote or '$$' inside a name can neither break nor
    # alter the query.
    search_query = """
    SELECT 
        sname_place_id,
        location_name,
        naics_code,
        top_category,
        sub_category
    FROM 
        establishments
    WHERE
        state = %(state)s
        AND
        LOWER (location_name) LIKE %(pattern)s
    ;
    """
    search_parameters = {'state': state,
                         'pattern': f'%{name.lower()}%'}
    results = pd.read_sql(search_query,
                          con = connection,
                          params = search_parameters)
    results['stadium'] = name

    return results

def visits_exist(sname_place_id):
    """Count the rows of the visits table for a sname_place_id.

    Returns the cursor's row count after the SELECT, i.e. the number of
    matching rows; a value of 0 means no visits are recorded for the id.
    """

    # The id is passed as a bound parameter instead of being interpolated
    # into the SQL string.
    visits_exist_query = """
    SELECT 
        sname_place_id 
    FROM 
        visits 
    WHERE 
        sname_place_id = %s
    """
    cursor.execute(visits_exist_query, (sname_place_id,))
    return cursor.rowcount

def visits_total(sname_place_id):
    """Sum raw_visit_counts over all visits rows of a sname_place_id.

    Returns the single aggregated value of the query; this is None when no
    row matches, because SQL SUM over an empty set is NULL.
    """

    # The id is passed as a bound parameter instead of being interpolated
    # into the SQL string.
    visits_total_query = """
    SELECT 
        SUM(raw_visit_counts) 
    FROM 
        visits 
    WHERE 
        sname_place_id = %s
    """
    cursor.execute(visits_total_query, (sname_place_id,))
    return cursor.fetchone()[0]

################################################################################

In [6]:
def find_potential_matches(row, stadium_column, similarity_threshold = 0.53):
    """Pick the best database match for the stadium named in one row.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must provide ``row[stadium_column]`` (the stadium name) and
        ``row['state']``.
    stadium_column : str
        Name of the field holding the stadium name to look up.
    similarity_threshold : float, optional
        Minimum Levenshtein ratio between the stadium name and the
        candidate's location_name. The default of 0.53 keeps
        'Indiana Convention Center & Lucas Oil Stadium' as a match.

    Returns
    -------
    list or None
        ``[visit_rows, location_name, sname_place_id]`` for an accepted
        candidate whose NAICS code starts with '71'; the same list prefixed
        with ``'!'`` when the NAICS code does not start with '71'; ``None``
        when the search returns nothing or the top candidate has no visits
        or is not similar enough.
    """
    stadium = row[stadium_column]
    state = row['state']
    candidates = stadium_search(stadium, state)

    # Nothing found: say so explicitly instead of relying on an exception
    # being raised further down while processing an empty frame.
    if candidates.empty:
        return None

    candidates['visits_exist'] = candidates['sname_place_id'].apply(visits_exist)
    candidates['visits_count'] = candidates['sname_place_id'].apply(visits_total)
    candidates['category_correct'] = candidates['naics_code'].apply(
        lambda code: str(code).startswith('71'))
    candidates['name_similarity'] = candidates.apply(
        lambda candidate: ratio(candidate['stadium'], candidate['location_name']),
        axis = 1)

    # Rank candidates: more visit rows first, then closer name, then larger
    # visit total, then the expected category. Position 0 is the best one.
    candidates = candidates.sort_values(
        by = ['visits_exist', 'name_similarity', 'visits_count', 'category_correct'],
        ascending = [False, False, False, False]
    ).reset_index(drop = True)

    good_match = candidates.loc[0, ['visits_exist',
                                    'category_correct',
                                    'name_similarity']].to_list()
    visit_rows, category_ok, similarity = good_match
    print(candidates.loc[0, 'location_name'])
    print(good_match)

    # Reject the top candidate unless it has visits and a close enough name.
    if not (visit_rows >= 1 and similarity >= similarity_threshold):
        return None

    match = [visit_rows] + candidates.loc[0, ['location_name', 'sname_place_id']].tolist()
    # A leading '!' flags an accepted match whose category is unexpected.
    return match if category_ok else ['!'] + match

In [7]:
############################# Import stadiums names ############################

stadiums = pd.read_csv(os.path.join(teams_output_folder,
                                    'football_teams.csv')
                      )

# Keep only the club name and its 'City, State' string (renamed to 'City').
# The explicit .copy() makes towns an independent frame, so the edits below
# do not operate on a slice and do not raise SettingWithCopyWarning.
towns = (pd.read_csv(os.path.join(teams_output_folder,
                                  'football_locations.csv'))
         .rename(columns = {'City, State': 'City'})
         [['Club', 'City']]
         .copy())

# The locations file spells this club 'San Francisco ers'
# (presumably the digits were lost upstream - TODO confirm).
towns['Club'] = towns['Club'].replace(to_replace = 'San Francisco ers', 
                                      value = 'San Francisco 49ers')
# Manually added club/location row, stored under index label -1.
towns.loc[-1] = ['Oakland Raiders', 'Oakland, California']

# Left join keeps every team; validate raises if a club name is duplicated
# on either side.
stadiums = pd.merge(stadiums, 
                    towns, how = 'left', 
                    left_on = 'name', 
                    right_on = 'Club', 
                    validate = 'one_to_one')

################################################################################

In [8]:
# 'City' is split on commas: the first piece becomes the lower-cased city,
# the second piece (stripped of surrounding spaces) the lower-cased state.
stadiums['state'] = stadiums['City'].map(lambda location: location.split(',')[1].strip().lower())
stadiums['city'] = stadiums['City'].map(lambda location: location.split(',')[0].lower())

In [9]:
# Keep only the team name, its location and the per-season stadium names;
# every other column is removed from the frame in place.
stadiums.drop(columns = stadiums.columns.difference(['name',
                                                      'city',
                                                      'state',
                                                      'stadium_2017',
                                                      'stadium_2018']),
              inplace = True)

In [10]:
# Replace each state name by its lower-cased abbreviation. Dots are removed
# before the lookup; when the lookup yields nothing usable the resulting
# AttributeError is caught and the state is set to None.
for row_label, state_name in list(stadiums['state'].items()):
    try:
        abbreviation = us.states.lookup(state_name.replace('.', '')).abbr.lower()
    except AttributeError:
        abbreviation = None
    stadiums.loc[row_label, 'state'] = abbreviation

In [11]:
# Collect every column whose name contains 'stadium' (one per season)
stadium_columns = [column for column in stadiums.columns if 'stadium' in column]

In [12]:
# Rewrite stadium names in a single pass. Keys are the names as they appear
# in the teams table, values are the spellings to search for.
stadiums[stadium_columns] = stadiums[stadium_columns].replace(to_replace = {
    'Mercedes-Benz Stadium': 'Mercedes Benz Stadium',
    'New Era Field': 'Ralph Wilson Stadium',
    'U.S. Bank Stadium': 'US Bank Stadium',
    'Mercedes-Benz Superdome': 'Mercedes Benz Superdome',
    'Oakland-Alameda County Coliseum': 'Oakland Alameda County Coliseum',
    # Replace new names with old ones to match SG
    'State Farm Stadium': 'University of Phoenix Stadium',
    'Broncos Stadium': 'Sports Authority Field at Mile High',
    'TIAA Bank Stadium': 'EverBank Field',
})

In [13]:
# Iterate through stadium columns and find matches in SG dataset.
# For a column named 'stadium_<year>' the results go to 'good_match_<year>'.
for stadium_column in stadium_columns:
    year = stadium_column.split('_')[1]
    match_column = f'good_match_{year}'
    matches = []
    for _, team_row in stadiums.iterrows():
        # Only the lookup is guarded. Storing the result happens outside the
        # try block, so a ValueError raised while writing a list into the
        # frame can no longer silently turn a found match into None.
        try:
            match = find_potential_matches(team_row, stadium_column)
        except ValueError:
            match = None
        matches.append(match)
    # An object-dtype Series keeps each list as a single cell value.
    stadiums[match_column] = pd.Series(matches,
                                       index = stadiums.index,
                                       dtype = object)

University Of Phoenix Stadium
[26, True, 0.9655172413793104]
Mercedes Benz Stadium
[26, True, 1.0]
M&t Bank Stadium
[26, True, 0.9375]
Ralph Wilson Stadium
[26, True, 1.0]
Bank Of America Stadium
[26, True, 0.9565217391304348]
Soldier Field
[26, True, 1.0]
Paul Brown Stadium
[26, True, 1.0]
Firstenergy Stadium
[26, True, 0.9473684210526315]
At&t Stadium
[26, True, 0.8333333333333334]
Sports Authority Field At Mile High
[26, True, 0.9714285714285714]
Ford Field
[26, False, 1.0]
Lambeau Field
[26, True, 1.0]
Nrg Stadium
[26, True, 0.8181818181818182]
Indiana Convention Center & Lucas Oil Stadium
[26, True, 0.5483870967741935]
Everbank Field
[26, True, 0.9285714285714286]
Arrowhead Stadium
[26, True, 1.0]
Stubhub Center
[26, True, 0.9285714285714286]
Los Angeles Memorial Coliseum
[26, True, 1.0]
Hard Rock Stadium
[26, True, 1.0]
US Bank Stadium
[26, True, 1.0]
Gillette Stadium
[26, True, 1.0]
Mercedes Benz Superdome
[26, False, 1.0]
Metlife Stadium
[26, True, 0.9333333333333333]
Metlife S

In [19]:
# Display the matched table. The sg_id_* columns shown here are created by
# the In [18] cell, so that cell has to be run before this one.
stadiums

Unnamed: 0,name,stadium_2017,stadium_2018,state,city,good_match_2017,good_match_2018,sg_id_2017,sg_id_2018
0,Arizona Cardinals,University of Phoenix Stadium,University of Phoenix Stadium,az,glendale,"[26, University Of Phoenix Stadium, sg:dc29c07...","[26, University Of Phoenix Stadium, sg:dc29c07...",sg:dc29c074ee464cbc9312c26d268696f6,sg:dc29c074ee464cbc9312c26d268696f6
1,Atlanta Falcons,Mercedes Benz Stadium,Mercedes Benz Stadium,ga,atlanta,"[26, Mercedes Benz Stadium, sg:b631de7ea1a3416...","[26, Mercedes Benz Stadium, sg:b631de7ea1a3416...",sg:b631de7ea1a3416a82dd1a1a79f3c098,sg:b631de7ea1a3416a82dd1a1a79f3c098
2,Baltimore Ravens,M&T Bank Stadium,M&T Bank Stadium,md,baltimore,"[26, M&t Bank Stadium, sg:2380e3619e664c6882d7...","[26, M&t Bank Stadium, sg:2380e3619e664c6882d7...",sg:2380e3619e664c6882d7261d08a35a31,sg:2380e3619e664c6882d7261d08a35a31
3,Buffalo Bills,Ralph Wilson Stadium,Ralph Wilson Stadium,ny,orchard park,"[26, Ralph Wilson Stadium, sg:5a3ce52de9fd4df6...","[26, Ralph Wilson Stadium, sg:5a3ce52de9fd4df6...",sg:5a3ce52de9fd4df688531839b7534c1e,sg:5a3ce52de9fd4df688531839b7534c1e
4,Carolina Panthers,Bank of America Stadium,Bank of America Stadium,nc,charlotte,"[26, Bank Of America Stadium, sg:994382f93a404...","[26, Bank Of America Stadium, sg:994382f93a404...",sg:994382f93a404dbf892a95a1be816875,sg:994382f93a404dbf892a95a1be816875
5,Chicago Bears,Soldier Field,Soldier Field,il,chicago,"[26, Soldier Field, sg:a794c5f8495641d29bd92c0...","[26, Soldier Field, sg:a794c5f8495641d29bd92c0...",sg:a794c5f8495641d29bd92c0b65a9ff6d,sg:a794c5f8495641d29bd92c0b65a9ff6d
6,Cincinnati Bengals,Paul Brown Stadium,Paul Brown Stadium,oh,cincinnati,"[26, Paul Brown Stadium, sg:630e269dcbea41db87...","[26, Paul Brown Stadium, sg:630e269dcbea41db87...",sg:630e269dcbea41db87ec58dd20731fd0,sg:630e269dcbea41db87ec58dd20731fd0
7,Cleveland Browns,FirstEnergy Stadium,FirstEnergy Stadium,oh,cleveland,"[26, Firstenergy Stadium, sg:bb2e1588c2044ece8...","[26, Firstenergy Stadium, sg:bb2e1588c2044ece8...",sg:bb2e1588c2044ece869f09e99232db12,sg:bb2e1588c2044ece869f09e99232db12
8,Dallas Cowboys,AT&T Stadium,AT&T Stadium,tx,arlington,"[26, At&t Stadium, sg:be0e81c90d484c25aa9192e2...","[26, At&t Stadium, sg:be0e81c90d484c25aa9192e2...",sg:be0e81c90d484c25aa9192e2be4b05c3,sg:be0e81c90d484c25aa9192e2be4b05c3
9,Denver Broncos,Sports Authority Field at Mile High,Sports Authority Field at Mile High,co,denver,"[26, Sports Authority Field At Mile High, sg:e...","[26, Sports Authority Field At Mile High, sg:e...",sg:e8e360cab0804eacaa3949c67ebc23d2,sg:e8e360cab0804eacaa3949c67ebc23d2


In [None]:
# Display the club / city lookup table built while importing stadium names.
towns

In [18]:
# Deal with incorrect category (non-71, due to car-related names):
# drop the leading '!' marker so every match has the same layout, then take
# the place id, which sits at position 2 of a match list.
for season in ('2017', '2018'):
    stadiums[f'good_match_{season}'] = stadiums[f'good_match_{season}'].apply(
        lambda match: match[1:] if (match and match[0] == '!') else match)
    stadiums[f'sg_id_{season}'] = stadiums[f'good_match_{season}'].apply(
        lambda match: match[2] if match else None)

In [20]:
# Export the matching: team name plus the matched place id for each season
output_file_path = os.path.join(teams_output_folder, 'football_matching.csv')
stadiums.to_csv(output_file_path,
                columns = ['name', 'sg_id_2017', 'sg_id_2018'],
                index = False)