In [39]:
################################################################################                                                                                                                                                              
################################################################################
#
# FILE: basketball-matching.ipynb
#
# BY: Dmitry Sedov 
#
# CREATED: Tue Jun 2 2020
#
# DESC: This code matches basketball stadiums to places in the Sname POI dataset.
#
# EXEC:
#      
################################################################################
################################################################################

In [40]:
################################ Libraries #####################################

from bs4 import BeautifulSoup
import requests

import re
import os

import pandas as pd

from sqlalchemy.orm import sessionmaker
import sqlalchemy as db
import psycopg2

import us
from Levenshtein import ratio

################################################################################

In [41]:
################################ Constants #####################################

# Root of the project on disk. The historical location is kept as the default,
# but it can be overridden through the STADIUMS_PROJECT_FOLDER environment
# variable so the notebook also runs on machines with a different layout.
project_folder = os.environ.get('STADIUMS_PROJECT_FOLDER',
                                '/home/user/projects/stadiums')

# Folders (with trailing separator) holding the team-level and game-level data.
teams_output_folder = os.path.join(project_folder, 'data/basketball/teams/')
games_output_folder = os.path.join(project_folder, 'data/basketball/games/')

################################################################################

In [42]:
############################ PostgreSQL connection #############################

# Connection settings for the local PostgreSQL database queried below.
connection_params = {'dbname': 'dataname2', 'user': 'user'}

# One module-level connection and cursor are shared by all query helpers.
# NOTE(review): nothing visible in this notebook closes the connection.
connection = psycopg2.connect(**connection_params)
cursor = connection.cursor()

################################################################################

In [43]:
################################ Functions #####################################

def stadium_search(name, state):
    """Search the establishments table for places whose name contains `name`.

    Parameters
    ----------
    name : str
        Stadium name. It is lower-cased and matched as a substring of the
        lower-cased `location_name` column.
    state : str
        Value compared for equality against the `state` column.

    Returns
    -------
    pandas.DataFrame
        Columns sname_place_id, location_name, naics_code, top_category and
        sub_category for every matching establishment, plus a `stadium`
        column holding the searched `name`. Empty if nothing matches.
    """

    # The values are sent as bound parameters instead of being formatted into
    # the SQL text: a name containing a quote character no longer breaks the
    # statement, and no input can alter the query structure.
    search_query = """
    SELECT 
        sname_place_id,
        location_name,
        naics_code,
        top_category,
        sub_category
    FROM 
        establishments
    WHERE
        state = %(state)s
        AND
        LOWER (location_name) LIKE %(pattern)s
    ;
    """
    # The LIKE wildcards live in the parameter value, so the query text holds
    # no literal '%' that the driver could mistake for a placeholder.
    query_params = {'state': state, 'pattern': f'%{name.lower()}%'}
    results = pd.read_sql(search_query, con = connection, params = query_params)
    results['stadium'] = name
    
    return results

def visits_exist(sname_place_id):
    """Count the rows of the visits table recorded for a sname_place_id.

    Parameters
    ----------
    sname_place_id : str
        Place identifier to look up.

    Returns
    -------
    int
        The cursor's `rowcount` after selecting the matching rows: 0 when
        the place has no visits rows, a positive number otherwise.
    """
    
    # The id is passed as a bound parameter rather than formatted into the SQL.
    visits_exist_query = """
    SELECT 
        sname_place_id 
    FROM 
        visits 
    WHERE 
        sname_place_id = %s
    """
    cursor.execute(visits_exist_query, (sname_place_id,))
    return cursor.rowcount

def visits_total(sname_place_id):
    """Sum `raw_visit_counts` over all visits rows of a sname_place_id.

    Parameters
    ----------
    sname_place_id : str
        Place identifier to look up.

    Returns
    -------
    The first column of the single result row, i.e. the SQL SUM. This is
    None when the place has no visits rows (SUM over an empty set is NULL).
    """

    # The id is passed as a bound parameter rather than formatted into the SQL.
    visits_total_query = """
    SELECT 
        SUM(raw_visit_counts) 
    FROM 
        visits 
    WHERE 
        sname_place_id = %s
    """
    cursor.execute(visits_total_query, (sname_place_id,))
    return cursor.fetchone()[0]

################################################################################

In [44]:
def find_potential_matches(row, stadium_column, min_name_similarity = 0.6):
    """Pick the best database match for the stadium named in `row`.

    Candidates come from `stadium_search` for the row's stadium name and
    state. They are ranked, in descending order, by: number of visits rows,
    name similarity (Levenshtein `ratio` against the searched name), total
    visit count, and whether the NAICS code starts with '71'. Only the
    top-ranked candidate is then checked against the acceptance criteria.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must provide `row[stadium_column]` (stadium name) and `row['state']`.
    stadium_column : str
        Name of the field in `row` holding the stadium name to search for.
    min_name_similarity : float, optional
        Minimum similarity ratio the top candidate needs to be accepted.
        Defaults to 0.6, the threshold previously hard-coded here.

    Returns
    -------
    list or None
        `[visits_exist, location_name, sname_place_id]` of the top candidate
        when it has at least one visits row, a NAICS code starting with '71'
        and a sufficient name similarity; otherwise None. None is also
        returned when the stadium name or state is missing, or when the
        search yields no candidates.
    """
    stadium = row[stadium_column]
    state = row['state']

    # Without both search keys there is nothing to look up. Previously a
    # missing stadium name crashed inside stadium_search (no .lower()).
    if pd.isna(stadium) or pd.isna(state):
        return None

    potential_matches = stadium_search(stadium, state)

    # Explicit "no candidates" exit. Previously this case only surfaced as a
    # ValueError raised by the row-wise apply on an empty frame.
    if potential_matches.empty:
        return None

    potential_matches['visits_exist'] = potential_matches['sname_place_id'].apply(visits_exist)
    potential_matches['visits_count'] = potential_matches['sname_place_id'].apply(visits_total)
    # NOTE(review): '71' is presumably the NAICS arts/entertainment/recreation
    # sector prefix - confirm.
    potential_matches['category_correct'] = potential_matches['naics_code'].apply(
        lambda code: str(code).startswith('71')
    )
    potential_matches['name_similarity'] = [
        ratio(searched_name, found_name)
        for searched_name, found_name in zip(potential_matches['stadium'],
                                             potential_matches['location_name'])
    ]

    # Best candidate first; drop the old index so position 0 is the top row
    # and no stray 'index' column is added.
    potential_matches = potential_matches.sort_values(
        by = ['visits_exist', 'name_similarity', 'visits_count', 'category_correct'],
        ascending = [False, False, False, False]
    ).reset_index(drop = True)

    good_match = potential_matches.loc[0, ['visits_exist', 
                                           'category_correct', 
                                           'name_similarity']].to_list()
    # Progress log: top candidate's name and the values it is judged on.
    print(potential_matches.loc[0, 'location_name'])
    print(good_match)

    has_visits, category_correct, name_similarity = good_match
    if has_visits >= 1 and category_correct and name_similarity >= min_name_similarity:
        return [has_visits] + potential_matches.loc[0, ['location_name', 'sname_place_id']].tolist()
    return None

In [45]:
############################# Import stadiums names ############################

# Team-level table (one row per team, including the stadium name columns).
teams_csv_path = os.path.join(teams_output_folder, 'basketball_teams.csv')
stadiums = pd.read_csv(teams_csv_path)

# Team-to-location lookup: keep only the team name and its "City, State"
# string, the latter under the shorter column name 'City'.
locations_csv_path = os.path.join(teams_output_folder, 'basketball_locations.csv')
towns = (
    pd.read_csv(locations_csv_path)
    .rename(columns = {'City, State': 'City'})
    .loc[:, ['Team', 'City']]
)

# Attach the location to each team; validate guards against duplicated
# team names on either side of the join.
stadiums = stadiums.merge(towns,
                          how = 'left',
                          left_on = 'name',
                          right_on = 'Team',
                          validate = 'one_to_one')

################################################################################

In [46]:
# Split the "City, State" string into its lower-cased parts. The vectorised
# .str accessor propagates missing values, so a team left without a City by
# the left merge yields NaN here instead of raising on `x.split`.
location_parts = stadiums['City'].str.split(',')
stadiums['state'] = location_parts.str[1].str.strip().str.lower()
stadiums['city'] = location_parts.str[0].str.lower()

In [47]:
# Keep only the team name, its location and the per-season stadium names;
# every other column is discarded (original column order is preserved).
columns_to_keep = {'name',
                   'city',
                   'state',
                   'stadium_2017',
                   'stadium_2018',
                   'stadium_2019'}
unwanted_columns = [column for column in stadiums.columns
                    if column not in columns_to_keep]
stadiums = stadiums.drop(columns = unwanted_columns)

In [48]:
# Inspect the trimmed table: team name, per-season stadium names, state, city.
stadiums

Unnamed: 0,name,stadium_2017,stadium_2018,stadium_2019,state,city
0,Atlanta Hawks,Philips Arena,Philips Arena,State Farm Arena,georgia,atlanta
1,Boston Celtics,TD Garden,TD Garden,TD Garden,massachusetts,boston
2,Brooklyn Nets,Barclays Center,Barclays Center,Barclays Center,new york,new york city
3,Charlotte Hornets,Spectrum Center,Spectrum Center,Spectrum Center,north carolina,charlotte
4,Chicago Bulls,United Center,United Center,United Center,illinois,chicago
5,Cleveland Cavaliers,Quicken Loans Arena,Quicken Loans Arena,Quicken Loans Arena,ohio,cleveland
6,Dallas Mavericks,American Airlines Center,American Airlines Center,American Airlines Center,texas,dallas
7,Denver Nuggets,Pepsi Center,Pepsi Center,Pepsi Center,colorado,denver
8,Detroit Pistons,The Palace of Auburn Hills,Little Caesars Arena,Little Caesars Arena,michigan,detroit
9,Golden State Warriors,Oracle Arena,Oracle Arena,Oracle Arena,california,san francisco


In [49]:
def _state_abbreviation(state_name):
    """Return the lower-cased abbreviation for a state name, or None.

    Dots are removed before the lookup. An AttributeError - raised when the
    input has no `.replace` or the lookup result has no `.abbr` - maps to None.
    """
    try:
        return us.states.lookup(state_name.replace('.', '')).abbr.lower()
    except AttributeError:
        return None

# Replace each state entry with its abbreviation (None where it cannot be resolved).
stadiums['state'] = stadiums['state'].apply(_state_abbreviation)

In [50]:
# Collect the per-season stadium name columns, i.e. every column whose
# label contains 'stadium'.
stadium_columns = [column for column in stadiums.columns if 'stadium' in column]

In [51]:
# Exact-match renames applied to every stadium column so that the names agree
# with the spelling used in the SG database.
stadium_name_replacements = {
    'Spectrum Center': 'Time Warner Cable Arena',
    'STAPLES Center': 'Staples Center',
    'FedEx Forum': 'FedExForum',
    'AmericanAirlines Arena': 'American Airlines Arena',
    'Madison Square Garden (IV)': 'Madison Square Garden',
    # NOTE(review): empty stadium names are mapped to Madison Square Garden -
    # confirm this is intended and not a copy-paste leftover.
    '': 'Madison Square Garden',
    # Replace the new name with the old one - as in SG database
    'State Farm Arena': 'Philips Arena',
    'Capital One Arena': 'Verizon Center',
}
stadiums[stadium_columns] = stadiums[stadium_columns].replace(stadium_name_replacements)

In [52]:
# Iterate through stadium columns and find matches in SG dataset.
# For every 'stadium_<year>' column a 'good_match_<year>' column is created
# holding the result of find_potential_matches (a list, or None for no match).
for stadium_column in stadium_columns:
    year = stadium_column.split('_')[1]
    match_column = f'good_match_{year}'
    matches = []
    for _, row in stadiums.iterrows():
        # Only the search itself is guarded: a ValueError from it means
        # "no usable candidate" and is recorded as None.
        try:
            matches.append(find_potential_matches(row, stadium_column))
        except ValueError:
            matches.append(None)
    # Assign the whole column at once as an object Series. Writing a list
    # into a single cell with .loc can itself raise ValueError in pandas,
    # which the old try/except would have silently turned into "no match".
    stadiums[match_column] = pd.Series(matches, index = stadiums.index, dtype = object)

Philips Arena
[26, True, 1.0]
Td Garden
[26, True, 0.8888888888888888]
Barclays Center
[26, True, 1.0]
Time Warner Cable Arena
[26, True, 1.0]
United Center
[26, True, 1.0]
Quicken Loans Arena
[1, True, 1.0]
American Airlines Center
[26, True, 1.0]
Pepsi Center
[26, True, 1.0]
The Palace Of Auburn Hills
[26, True, 0.9615384615384616]
Oracle Arena
[26, True, 1.0]
toyota center
[26, False, 0.8461538461538461]
Bankers Life Fieldhouse
[26, True, 1.0]
Staples Center
[26, True, 1.0]
Staples Center
[26, True, 1.0]
Fedexforum
[26, True, 0.8]
American Airlines Arena
[26, True, 1.0]
Bmo Harris Bradley Center
[26, True, 0.92]
Target Center
[26, True, 1.0]
Smoothie King Center
[26, True, 1.0]
Madison Square Garden
[26, True, 1.0]
Chesapeake Energy Arena
[26, True, 1.0]
Amway Center
[26, True, 1.0]
Wells Fargo Center
[26, True, 1.0]
Talking Stick Resort Arena
[26, True, 1.0]
Moda Center
[26, True, 1.0]
At&t Center
[26, True, 0.8181818181818182]
Vivint Smart Home Arena
[26, True, 1.0]
Verizon Center

In [14]:
# Inspect the team-to-location lookup.
# NOTE(review): this cell's execution count (14) is out of sequence with the
# surrounding cells, so the output below may come from an earlier session.
towns

Unnamed: 0,Team,City
0,Atlanta Hawks,"Atlanta, Georgia"
1,Boston Celtics,"Boston, Massachusetts"
2,Brooklyn Nets,"New York City, New York"
3,Charlotte Hornets,"Charlotte, North Carolina"
4,Chicago Bulls,"Chicago, Illinois"
5,Cleveland Cavaliers,"Cleveland, Ohio"
6,Dallas Mavericks,"Dallas, Texas"
7,Denver Nuggets,"Denver, Colorado"
8,Detroit Pistons,"Detroit, Michigan"
9,Golden State Warriors,"San Francisco, California"


In [53]:
# Inspect the table after matching: each good_match_<year> cell holds the
# list returned by find_potential_matches, or None when no match was accepted.
stadiums

Unnamed: 0,name,stadium_2017,stadium_2018,stadium_2019,state,city,good_match_2017,good_match_2018,good_match_2019
0,Atlanta Hawks,Philips Arena,Philips Arena,Philips Arena,ga,atlanta,"[26, Philips Arena, sg:c7c2dbe83bfb4a88a26ca9a...","[26, Philips Arena, sg:c7c2dbe83bfb4a88a26ca9a...","[26, Philips Arena, sg:c7c2dbe83bfb4a88a26ca9a..."
1,Boston Celtics,TD Garden,TD Garden,TD Garden,ma,boston,"[26, Td Garden, sg:2a96716e3b8a4f0e9dfc249ac7c...","[26, Td Garden, sg:2a96716e3b8a4f0e9dfc249ac7c...","[26, Td Garden, sg:2a96716e3b8a4f0e9dfc249ac7c..."
2,Brooklyn Nets,Barclays Center,Barclays Center,Barclays Center,ny,new york city,"[26, Barclays Center, sg:a4375f84884541bca5628...","[26, Barclays Center, sg:a4375f84884541bca5628...","[26, Barclays Center, sg:a4375f84884541bca5628..."
3,Charlotte Hornets,Time Warner Cable Arena,Time Warner Cable Arena,Time Warner Cable Arena,nc,charlotte,"[26, Time Warner Cable Arena, sg:be683bc19f2d4...","[26, Time Warner Cable Arena, sg:be683bc19f2d4...","[26, Time Warner Cable Arena, sg:be683bc19f2d4..."
4,Chicago Bulls,United Center,United Center,United Center,il,chicago,"[26, United Center, sg:ae4d4adf1c4a4ace882a3f5...","[26, United Center, sg:ae4d4adf1c4a4ace882a3f5...","[26, United Center, sg:ae4d4adf1c4a4ace882a3f5..."
5,Cleveland Cavaliers,Quicken Loans Arena,Quicken Loans Arena,Quicken Loans Arena,oh,cleveland,"[1, Quicken Loans Arena, sg:e5a38ec18fc44c8593...","[1, Quicken Loans Arena, sg:e5a38ec18fc44c8593...","[1, Quicken Loans Arena, sg:e5a38ec18fc44c8593..."
6,Dallas Mavericks,American Airlines Center,American Airlines Center,American Airlines Center,tx,dallas,"[26, American Airlines Center, sg:f79998a4f42d...","[26, American Airlines Center, sg:f79998a4f42d...","[26, American Airlines Center, sg:f79998a4f42d..."
7,Denver Nuggets,Pepsi Center,Pepsi Center,Pepsi Center,co,denver,"[26, Pepsi Center, sg:e7385ac76b2546c9b905be12...","[26, Pepsi Center, sg:e7385ac76b2546c9b905be12...","[26, Pepsi Center, sg:e7385ac76b2546c9b905be12..."
8,Detroit Pistons,The Palace of Auburn Hills,Little Caesars Arena,Little Caesars Arena,mi,detroit,"[26, The Palace Of Auburn Hills, sg:3c36843e94...","[26, Little Caesars Arena, sg:461065e801484d2e...","[26, Little Caesars Arena, sg:461065e801484d2e..."
9,Golden State Warriors,Oracle Arena,Oracle Arena,Oracle Arena,ca,san francisco,"[26, Oracle Arena, sg:bfae5fecd9b94aeab823761c...","[26, Oracle Arena, sg:bfae5fecd9b94aeab823761c...","[26, Oracle Arena, sg:bfae5fecd9b94aeab823761c..."


In [54]:
# For each season, pull the place id (third element of the match list) into
# its own column; rows without an accepted match get None.
for season in ('2017', '2018', '2019'):
    stadiums[f'sg_id_{season}'] = stadiums[f'good_match_{season}'].apply(
        lambda match: match[2] if match else None
    )

In [55]:
# Export the matching: one row per team with the matched place id per season.
output_file_path = os.path.join(teams_output_folder, 'basketball_matching.csv')
matching_columns = ['name', 'sg_id_2017', 'sg_id_2018', 'sg_id_2019']
stadiums.loc[:, matching_columns].to_csv(output_file_path, index = False)