In [1]:
################################################################################                                                                                                                                                              
################################################################################
#
# FILE: hockey-matching.ipynb
#
# BY: Dmitry Sedov 
#
# CREATED: Tue Jun 2 2020
#
# DESC: This code matches the stadiums to Sname POI dataset.
#
# EXEC:
#      
################################################################################
################################################################################

In [2]:
################################ Libraries #####################################

from bs4 import BeautifulSoup
import requests

import re
import os

import pandas as pd

from sqlalchemy.orm import sessionmaker
import sqlalchemy as db
import psycopg2

import us
from Levenshtein import ratio

################################################################################

In [3]:
################################ Constants #####################################

project_folder = '/home/user/projects/stadiums'
teams_output_folder = os.path.join(project_folder, 'data/hockey/teams/')
games_output_folder = os.path.join(project_folder, 'data/hockey/games/')

################################################################################

In [4]:
############################ PostgreSQL connection #############################

connection = psycopg2.connect(dbname = 'dataname2', 
                              user = 'user')
cursor = connection.cursor()

################################################################################

In [5]:
################################ Constants #####################################

def stadium_search(name, state):
    """ Search for a stadium in the database. """
    
    search_query = f"""
    SELECT 
        sname_place_id,
        location_name,
        naics_code,
        top_category,
        sub_category
    FROM 
        establishments
    WHERE
        state = '{state}'
        AND
        LOWER (location_name) LIKE $$%{name.lower()}%$$
    ;
    """
    results = pd.read_sql(search_query, con = connection)
    results['stadium'] = name
    
    return results

def visits_exist(sname_place_id):
    """Determine whether visits exist for a sname_place_id. """
    
    visits_exist_query = f"""
    SELECT 
        sname_place_id 
    FROM 
        visits 
    WHERE 
        sname_place_id = '{sname_place_id}'
    """
    cursor.execute(visits_exist_query)
    return (cursor.rowcount)

def visits_total(sname_place_id):
    visits_total_query = f"""
    SELECT 
        SUM(raw_visit_counts) 
    FROM 
        visits 
    WHERE 
        sname_place_id = '{sname_place_id}'
    """
    cursor.execute(visits_total_query)
    return (cursor.fetchone()[0])

################################################################################

In [6]:
def find_potential_matches(row, stadium_column):
    stadium = row[stadium_column]
    state = row['state']
    potential_matches = stadium_search(stadium, state)
    potential_matches['visits_exist'] = potential_matches['sname_place_id'].apply(visits_exist)
    potential_matches['visits_count'] = potential_matches['sname_place_id'].apply(visits_total)
    potential_matches['category_correct'] = potential_matches['naics_code'].apply(lambda x: str(x).startswith('71'))
    potential_matches['name_similarity'] = potential_matches.apply(lambda row: 
                                                               ratio(row['stadium'], row['location_name']), 
                                                               axis = 1)
    potential_matches.sort_values(by = ['name_similarity', 'visits_exist', 'visits_count', 'category_correct'],
                                  ascending = [False, False, False, False],
                                  inplace = True)
    potential_matches.reset_index(inplace = True)
    good_match = potential_matches.loc[0, ['visits_exist', 
                                           'category_correct', 
                                           'name_similarity']].to_list()
    print(potential_matches.loc[0, 'location_name'])
    print(good_match)
    if (good_match[0] >= 1 and good_match[1] and (good_match[2] >= 0.6)):
        return [good_match[0]] + potential_matches.loc[0, ['location_name', 'sname_place_id']].tolist()
    elif (not good_match[1] and good_match[0] >= 1 and (good_match[2] >= 0.6)):
        return ['!', good_match[0]] + potential_matches.loc[0, ['location_name', 'sname_place_id']].tolist()
    else:
        return None

In [7]:
############################# Import stadiums names ############################

stadiums = pd.read_csv(os.path.join(teams_output_folder,
                                    'hockey_teams.csv')
                      )

towns = pd.read_csv(os.path.join(teams_output_folder,
                                 'hockey_locations.csv')
                   )
towns.rename(columns = {'City, State': 'City'}, inplace = True)

towns = towns[['Team', 'City']]
towns['Team'] = towns['Team'].replace(to_replace = 'Arizona Coyotesnb ', 
                                      value = 'Arizona Coyotes')
towns['Team'] = towns['Team'].replace(to_replace = 'St Louis Blues', 
                                      value = 'St. Louis Blues')
towns['City'] = towns['City'].replace(to_replace = 'New York City, New YorkUniondale, New York', 
                                      value = 'New York City, New York')

stadiums = pd.merge(stadiums, 
                    towns, how = 'left', 
                    left_on = 'name', 
                    right_on = 'Team', 
                    validate = 'one_to_one')

stadiums.loc[stadiums['name'] == 'Vegas Golden Knights', 'stadium_2017'] = 'T-Mobile Arena'

################################################################################

In [8]:
stadiums['state'] = stadiums['City'].apply(lambda x: x.split(',')[1].strip().lower())
stadiums['city'] = stadiums['City'].apply(lambda x: x.split(',')[0].lower())

In [9]:
stadiums.drop(columns = [x for x in stadiums.columns 
                         if x not in ['name', 
                                      'city', 
                                      'state',
                                      'stadium_2017',
                                      'stadium_2018', 
                                      'stadium_2019'
                                     ]
                        ], 
              inplace = True)

In [10]:
for i, row in stadiums.iterrows():
    try:
        stadiums.loc[i, 'state'] = us.states.lookup(row['state'
                                                       ].replace('.',
                                                                 '')
                                                   ).abbr.lower()
    except AttributeError:
        stadiums.loc[i,'state'] = None

In [11]:
# Find stadium columns
stadium_columns = [x for x in stadiums.columns if 'stadium' in x]

In [12]:
stadiums[stadium_columns] = stadiums[stadium_columns].replace(to_replace = 'STAPLES Center', 
                                                              value = 'Staples Center')
stadiums[stadium_columns] = stadiums[stadium_columns].replace(to_replace = 'KeyBank Center', 
                                                              value = 'First Niagara Center')
stadiums[stadium_columns] = stadiums[stadium_columns].replace(to_replace = 'Madison Square Garden (IV)', 
                                                              value = 'Madison Square Garden')
# Replace the new name with the old one - as in SG database
stadiums[stadium_columns] = stadiums[stadium_columns].replace(to_replace = 'State Farm Arena', 
                                                              value = 'Philips Arena')
stadiums[stadium_columns] = stadiums[stadium_columns].replace(to_replace = 'Capital One Arena', 
                                                              value = 'Verizon Center')
# Trick
stadiums[stadium_columns] = stadiums[stadium_columns].replace(to_replace = 'TD Garden', 
                                                              value = 'Td Garden')
stadiums[stadium_columns] = stadiums[stadium_columns].replace(to_replace = 'BB&T Center', 
                                                              value = 'Bb&t Cente')
stadiums[stadium_columns] = stadiums[stadium_columns].replace(to_replace = 'PNC Arena', 
                                                              value = 'Pnc Arena')
stadiums[stadium_columns] = stadiums[stadium_columns].replace(to_replace = 'SAP Center at San Jose', 
                                                              value = 'Sap Center at San Jose')

In [13]:
# Iterate through stadium columns and find matches in SG dataset
for stadium_column in stadium_columns:
    y = stadium_column.split('_')[1]
    m = f'good_match_{y}' 
    stadiums[m] = None
    for i, row in stadiums.iterrows():
        try:
            stadiums.loc[i, m] = find_potential_matches(row, stadium_column)
        except ValueError:
            stadiums.loc[i, m] = None

Honda Center
[2, True, 1.0]
Gila River Arena
[0, True, 1.0]
Td Garden
[26, True, 1.0]
First Niagara Center
[26, True, 1.0]
Pnc Arena
[26, True, 1.0]
United Center
[26, True, 1.0]
Pepsi Center
[26, True, 1.0]
Nationwide Arena
[26, True, 1.0]
American Airlines Center
[26, True, 1.0]
Joe Louis Arena
[26, True, 1.0]
Bb&t Center
[26, True, 0.9523809523809523]
Staples Center
[26, True, 1.0]
Xcel Energy Center
[26, True, 1.0]
Bridgestone Arena
[26, True, 1.0]
Prudential Center
[26, True, 1.0]
Barclays Center
[26, True, 1.0]
Madison Square Garden
[26, True, 1.0]
Wells Fargo Center
[26, True, 1.0]
PPG Paints Arena
[25, True, 1.0]
Sap Center At San Jose
[26, True, 0.9545454545454546]
Scottrade Center
[26, True, 1.0]
Amalie Arena
[26, True, 1.0]
Verizon Center
[26, True, 1.0]
Honda Center
[2, True, 1.0]
Gila River Arena
[0, True, 1.0]
Td Garden
[26, True, 1.0]
First Niagara Center
[26, True, 1.0]
Pnc Arena
[26, True, 1.0]
United Center
[26, True, 1.0]
Pepsi Center
[26, True, 1.0]
Nationwide Arena

In [14]:
stadiums

Unnamed: 0,name,stadium_2017,stadium_2018,stadium_2019,state,city,good_match_2017,good_match_2018,good_match_2019
0,Anaheim Ducks,Honda Center,Honda Center,Honda Center,ca,anaheim,"[2, Honda Center, sg:1a660f827ee040e5bbb6da664...","[2, Honda Center, sg:1a660f827ee040e5bbb6da664...","[2, Honda Center, sg:1a660f827ee040e5bbb6da664..."
1,Arizona Coyotes,Gila River Arena,Gila River Arena,Gila River Arena,az,glendale,,,
2,Boston Bruins,Td Garden,Td Garden,Td Garden,ma,boston,"[26, Td Garden, sg:2a96716e3b8a4f0e9dfc249ac7c...","[26, Td Garden, sg:2a96716e3b8a4f0e9dfc249ac7c...","[26, Td Garden, sg:2a96716e3b8a4f0e9dfc249ac7c..."
3,Buffalo Sabres,First Niagara Center,First Niagara Center,First Niagara Center,ny,buffalo,"[26, First Niagara Center, sg:74d5a8291f1345b4...","[26, First Niagara Center, sg:74d5a8291f1345b4...","[26, First Niagara Center, sg:74d5a8291f1345b4..."
4,Calgary Flames,Scotiabank Saddledome,Scotiabank Saddledome,Scotiabank Saddledome,,calgary,,,
5,Carolina Hurricanes,Pnc Arena,Pnc Arena,Pnc Arena,nc,raleigh,"[26, Pnc Arena, sg:ac283081b6e44c1cb19feb1a5ce...","[26, Pnc Arena, sg:ac283081b6e44c1cb19feb1a5ce...","[26, Pnc Arena, sg:ac283081b6e44c1cb19feb1a5ce..."
6,Chicago Blackhawks,United Center,United Center,United Center,il,chicago,"[26, United Center, sg:ae4d4adf1c4a4ace882a3f5...","[26, United Center, sg:ae4d4adf1c4a4ace882a3f5...","[26, United Center, sg:ae4d4adf1c4a4ace882a3f5..."
7,Colorado Avalanche,Pepsi Center,Pepsi Center,Pepsi Center,co,denver,"[26, Pepsi Center, sg:e7385ac76b2546c9b905be12...","[26, Pepsi Center, sg:e7385ac76b2546c9b905be12...","[26, Pepsi Center, sg:e7385ac76b2546c9b905be12..."
8,Columbus Blue Jackets,Nationwide Arena,Nationwide Arena,Nationwide Arena,oh,columbus,"[26, Nationwide Arena, sg:496bf83aad094f17b605...","[26, Nationwide Arena, sg:496bf83aad094f17b605...","[26, Nationwide Arena, sg:496bf83aad094f17b605..."
9,Dallas Stars,American Airlines Center,American Airlines Center,American Airlines Center,tx,dallas,"[26, American Airlines Center, sg:f79998a4f42d...","[26, American Airlines Center, sg:f79998a4f42d...","[26, American Airlines Center, sg:f79998a4f42d..."


In [15]:
towns

Unnamed: 0,Team,City
0,Anaheim Ducks,"Anaheim, California"
1,Arizona Coyotes,"Glendale, Arizona"
2,Boston Bruins,"Boston, Massachusetts"
3,Buffalo Sabres,"Buffalo, New York"
4,Calgary Flames,"Calgary, Alberta"
5,Carolina Hurricanes,"Raleigh, North Carolina"
6,Chicago Blackhawks,"Chicago, Illinois"
7,Colorado Avalanche,"Denver, Colorado"
8,Columbus Blue Jackets,"Columbus, Ohio"
9,Dallas Stars,"Dallas, Texas"


In [15]:
stadiums['sg_id_2017'] = stadiums['good_match_2017'].apply(lambda x: x[2] if x else None)
stadiums['sg_id_2018'] = stadiums['good_match_2018'].apply(lambda x: x[2] if x else None)
stadiums['sg_id_2019'] = stadiums['good_match_2019'].apply(lambda x: x[2] if x else None)

In [16]:
# Export the matching
output_file_path = os.path.join(teams_output_folder, 'hockey_matching.csv')
stadiums[['name', 'sg_id_2017', 'sg_id_2018', 'sg_id_2019']].to_csv(output_file_path, 
                                                                    index = False)