In [None]:
################################################################################                                                                                                                                                              
################################################################################
#
# FILE: hockey-matching.ipynb
#
# BY: Dmitry Sedov 
#
# CREATED: Tue Jun 2 2020
#
# DESC: This code matches the hockey teams' stadiums to the Sname POI dataset.
#
# EXEC:
#      
################################################################################
################################################################################

In [None]:
################################ Libraries #####################################

from bs4 import BeautifulSoup
import requests

import re
import os

import pandas as pd

from sqlalchemy.orm import sessionmaker
import sqlalchemy as db
import psycopg2

import us
from Levenshtein import ratio

################################################################################

In [None]:
################################ Constants #####################################

# Root of the project tree. Defaults to the original location, but can be
# redirected with the STADIUMS_PROJECT_FOLDER environment variable so the
# notebook runs on another machine without editing this cell.
project_folder = os.environ.get('STADIUMS_PROJECT_FOLDER',
                                '/home/user/projects/stadiums')
# Folders holding the hockey team and game files, both below project_folder.
teams_output_folder = os.path.join(project_folder, 'data/hockey/teams/')
games_output_folder = os.path.join(project_folder, 'data/hockey/games/')

################################################################################

In [None]:
############################ PostgreSQL connection #############################

# Open the PostgreSQL session used by the queries in this notebook, together
# with a cursor for the statements that are executed directly.
connection_settings = {'dbname': 'dataname2', 'user': 'user'}
connection = psycopg2.connect(**connection_settings)
cursor = connection.cursor()

################################################################################

In [None]:
################################ Functions #####################################

def stadium_search(name, state):
    """Search the establishments table for places whose name contains `name`.

    The match is a case-insensitive substring match on `location_name`,
    restricted to rows whose `state` column equals `state`.

    Parameters
    ----------
    name : str
        Stadium name to look for. It is lower-cased and wrapped in `%`
        wildcards; a `%` or `_` inside the name still acts as a LIKE
        wildcard, as it did before.
    state : str or None
        Value compared with the `state` column. `None` is sent as NULL, so
        the comparison matches no rows.

    Returns
    -------
    pandas.DataFrame
        One row per matching establishment with the selected columns, plus a
        `stadium` column holding the searched `name`.
    """

    # The values are passed as bound parameters instead of being pasted into
    # the SQL text, so quotes or `$$` inside a name cannot alter the statement.
    search_query = """
    SELECT
        sname_place_id,
        location_name,
        naics_code,
        top_category,
        sub_category
    FROM
        establishments
    WHERE
        state = %(state)s
        AND
        LOWER (location_name) LIKE %(pattern)s
    ;
    """
    search_parameters = {'state': state,
                         'pattern': f'%{name.lower()}%'}
    results = pd.read_sql(search_query,
                          con = connection,
                          params = search_parameters)
    results['stadium'] = name

    return results

def visits_exist(sname_place_id):
    """Count the rows of the visits table recorded for one place.

    Parameters
    ----------
    sname_place_id : str
        Identifier compared with `visits.sname_place_id`. It is converted
        with `str()` before being sent, matching the quoted text literal the
        query used previously.

    Returns
    -------
    int
        `cursor.rowcount` after the SELECT. Callers compare this number with
        a threshold, so it is a count rather than a yes/no flag.
    """

    # Bound parameter instead of string interpolation: an identifier that
    # contains a quote can no longer break or change the statement.
    visits_exist_query = """
    SELECT
        sname_place_id
    FROM
        visits
    WHERE
        sname_place_id = %s
    """
    cursor.execute(visits_exist_query, (str(sname_place_id),))
    return cursor.rowcount

################################################################################

In [None]:
def find_potential_matches(row, stadium_column):
    """Judge whether the best database candidate for a stadium is a good match.

    Candidates are fetched with `stadium_search` for the stadium named in
    `row[stadium_column]` within `row['state']`. Each candidate gets a visit
    count (`visits_exist`), a flag for an NAICS code starting with '71', and
    a Levenshtein `ratio` between the searched and the stored name. The
    candidates are ranked by visit count, then name similarity, then the
    category flag (all descending), and only the top one is judged. Its name
    and its three scores are printed.

    Parameters
    ----------
    row : pandas.Series or mapping
        Must provide `stadium_column` and 'state'.
    stadium_column : str
        Key in `row` that holds the stadium name.

    Returns
    -------
    bool-like
        Result of `visits > 10 and category_correct and similarity >= 0.6`
        for the top-ranked candidate.

    Raises
    ------
    ValueError
        If the search returns no candidates.
    """
    stadium = row[stadium_column]
    state = row['state']
    potential_matches = stadium_search(stadium, state)

    # Nothing to rank: raise the ValueError that the matching loop catches,
    # instead of relying on the scoring steps below to fail on an empty frame.
    if potential_matches.empty:
        raise ValueError(
            f'no candidate establishments for {stadium!r} in state {state!r}'
        )

    potential_matches['visits_exist'] = potential_matches['sname_place_id'].apply(visits_exist)
    potential_matches['category_correct'] = potential_matches['naics_code'].apply(lambda x: str(x).startswith('71'))
    potential_matches['name_similarity'] = [
        ratio(searched_name, found_name)
        for searched_name, found_name in zip(potential_matches['stadium'],
                                             potential_matches['location_name'])
    ]

    # Best candidate first, then renumber so that label 0 is the top row.
    potential_matches = (
        potential_matches
        .sort_values(by = ['visits_exist', 'name_similarity', 'category_correct'],
                     ascending = [False, False, False])
        .reset_index(drop = True)
    )
    good_match = potential_matches.loc[0, ['visits_exist',
                                           'category_correct',
                                           'name_similarity']].to_list()
    print(potential_matches.loc[0, 'location_name'])
    print(good_match)
    # good_match is [visit count, category flag, name similarity].
    return(good_match[0] > 10 and good_match[1] and (good_match[2] >= 0.6))

In [None]:
############################# Import stadiums names ############################

# Team table with the stadium name column(s).
stadiums = pd.read_csv(os.path.join(teams_output_folder,
                                    'hockey_teams.csv')
                      )

# Team locations; the 'City, State' column is shortened to 'City'.
towns = pd.read_csv(os.path.join(teams_output_folder,
                                 'hockey_locations.csv')
                   )
towns = towns.rename(columns = {'City, State': 'City'})

# Take an explicit copy of the two needed columns, so the fixes below write to
# an independent frame and do not trigger pandas' SettingWithCopyWarning.
towns = towns[['Team', 'City']].copy()

# Clean up team names before merging on them (presumably so that they equal
# the `name` values in the team table -- verify against the CSV files).
towns['Team'] = towns['Team'].replace(to_replace = 'Arizona Coyotesnb ',
                                      value = 'Arizona Coyotes')
towns['Team'] = towns['Team'].replace(to_replace = 'St Louis Blues',
                                      value = 'St. Louis Blues')
# Two locations are fused into one string for this entry; keep the first.
towns['City'] = towns['City'].replace(to_replace = 'New York City, New YorkUniondale, New York',
                                      value = 'New York City, New York')

# Left join keeps every team; validate raises if a team name is duplicated
# on either side.
stadiums = pd.merge(stadiums,
                    towns, how = 'left',
                    left_on = 'name',
                    right_on = 'Team',
                    validate = 'one_to_one')

# Set the 2017 stadium for Vegas by hand.
stadiums.loc[stadiums['name'] == 'Vegas Golden Knights', 'stadium_2017'] = 'T-Mobile Arena'

################################################################################

In [None]:
# Split "City, State" into lower-cased parts. The vectorised .str accessor
# yields NaN for a row whose City is missing or has no comma, where the
# previous per-row lambdas raised (AttributeError / IndexError).
city_state_parts = stadiums['City'].str.split(',')
stadiums['state'] = city_state_parts.str[1].str.strip().str.lower()
stadiums['city'] = city_state_parts.str[0].str.lower()

In [None]:
# Replace each state name by its lower-cased abbreviation. Dots are removed
# before the lookup; when the lookup (or the value itself) does not support
# the attribute access, the state is set to None.
for row_label, team_row in stadiums.iterrows():
    try:
        cleaned_name = team_row['state'].replace('.', '')
        abbreviation = us.states.lookup(cleaned_name).abbr.lower()
    except AttributeError:
        abbreviation = None
    stadiums.loc[row_label, 'state'] = abbreviation

In [None]:
# Collect every column whose name contains 'stadium'
stadium_columns = [column_name
                   for column_name in stadiums.columns
                   if 'stadium' in column_name]

In [None]:
# Stadium names to rewrite in all stadium columns, as (current, replacement)
# pairs applied one after another.
stadium_renames = (
    ('STAPLES Center', 'Staples Center'),
    ('KeyBank Center', 'First Niagara Center'),
    ('Madison Square Garden (IV)', 'Madison Square Garden'),
)
for current_name, replacement_name in stadium_renames:
    stadiums[stadium_columns] = stadiums[stadium_columns].replace(
        to_replace = current_name,
        value = replacement_name,
    )

In [None]:
# For every stadium column, look each team's stadium up in the POI database
# and store the verdict in a matching good_match_<suffix> column, where
# <suffix> is the part of the column name after the first underscore.
for stadium_column in stadium_columns:
    suffix = stadium_column.split('_')[1]
    match_column = f'good_match_{suffix}'
    stadiums[match_column] = None
    for row_label, team_row in stadiums.iterrows():
        try:
            verdict = find_potential_matches(team_row, stadium_column)
        except ValueError:
            # No usable candidate for this stadium: leave the verdict empty.
            verdict = None
        stadiums.loc[row_label, match_column] = verdict

In [None]:
# Display the team table, including the good_match_* columns filled in above.
stadiums

In [None]:
# Display the cleaned Team / City lookup table used in the merge.
towns