import pandas as pd
import pickle
# ^^^ pyforest auto-imports - don't write above this line
**EDA for NHL Game Data**

# Imports 

In [1]:
import mysql.connector
import os
import json
import requests
import pprint
from config import *
from functions import *

# Getting General Info about database

## Tables

In [2]:
# Ask the database which tables it contains.
test_query = "SHOW TABLES;"
all_tables_query_result = run_query(test_query)
# The table name is the first field of every returned row.
all_tables = [table_row[0] for table_row in all_tables_query_result]
all_tables  # displayed for reference; the 'entries' table is ignored in the analysis

['entries',
 'game',
 'game_goalie_stats',
 'game_plays',
 'game_plays_players',
 'game_shifts',
 'game_skater_stats',
 'game_teams_stats',
 'player_info',
 'team_info']

## Collecting the columns of every table into a DataFrame

In [3]:
# Collect the column metadata of every table except 'entries', which this
# analysis ignores. Each list element is a one-item dict {table_name: rows}.
all_query_results = []
for table in all_tables:
    # Skip 'entries' by name rather than by list position: slicing with [1:]
    # silently drops the wrong table if the table ordering ever changes.
    if table == "entries":
        continue
    query = f"""
                SHOW columns FROM {table}
            """
    query_result = run_query(query) # returns:  Field       | Type     | Null | Key | Default | Extra
    table_w_query = {table: query_result}
    all_query_results.append(table_w_query)

In [4]:
# Flatten the per-table column metadata into one list of rows.
# Each row holds the column's own fields first and the owning table name last.
all_columns = []
for table_columns in all_query_results:
    for table_name, column_rows in table_columns.items():
        all_columns.extend([*column_row, table_name] for column_row in column_rows)

In [5]:
# removing incorrect values
# A well-formed row has 7 items: the six SHOW COLUMNS fields
# (Field, Type, Null, Key, Default, Extra) plus the appended table name.
# The list is rebuilt rather than calling remove() while iterating over it:
# removing during iteration shifts the remaining items, so the element right
# after each removed one is skipped, and remove() deletes the first *equal*
# row rather than necessarily the current one.
# Slice assignment keeps the same list object, as the in-place removal did.
all_columns[:] = [row for row in all_columns if len(row) == 7]

In [6]:
# One DataFrame row per (table, column) pair; the labels follow the order in
# which each row was assembled: column fields first, table name last.
table_info_columns = [
    "Column_name", "Column_type", "Null", "Key", "Default", "Extra", "Table",
]
table_info_df = pd.DataFrame(all_columns, columns=table_info_columns)

In [7]:
# Per the original author's note, Key, Null and Extra hold the same value on
# every row, so they are removed; Default is dropped along with them.
# Only Column_name, Column_type and Table remain afterwards.
redundant_columns = ['Null', "Key", "Extra", "Default"]
table_info_df.drop(columns=redundant_columns, inplace=True)

### Adding the number of unique values per column to the DataFrame

In [8]:
# Run one COUNT(DISTINCT ...) query for every (column, table) pair listed in
# table_info_df, keeping the raw query results in row order.
num_unique_values_list = []
column_table_pairs = zip(table_info_df['Column_name'], table_info_df['Table'])
for col_name, table_name in column_table_pairs:
    query = f"""
            SELECT COUNT(DISTINCT({col_name}))
            FROM {table_name}
    """
    query_result = run_query(query)
    num_unique_values_list.append(query_result)

In [9]:
# Each stored query result is indexed as [first row][first field] to pull out
# the bare count, giving a flat list with one number per column.
flattened_num_unique_values = [result[0][0] for result in num_unique_values_list]

In [10]:
# Attach the distinct-value counts as a new column. The list must have one
# entry per row of table_info_df, in the same row order.
table_info_df['Number_of_unique_values'] = flattened_num_unique_values

In [11]:
# Show the first 60 rows; swap in the commented line below to view the rest.
table_info_df.iloc[:60]
# table_info_df.iloc[60:]

Unnamed: 0,Column_name,Column_type,Table,Number_of_unique_values
0,game_id,int(11),game,5718
1,season,int(11),game,10
2,type,varchar(1),game,3
3,date_time,varchar(10),game,1726
4,date_time_GMT,varchar(20),game,4388
5,away_team_id,int(11),game,34
6,home_team_id,int(11),game,34
7,away_goals,int(11),game,11
8,home_goals,int(11),game,11
9,outcome,varchar(12),game,7


### Saving DF

In [12]:
# table_info_df.to_csv("../data/table_info_df.csv")

## Listing all events

In [13]:
# Which event types are possible? Fetch every distinct value of the `event`
# column in game_plays. all_events holds whatever run_query returns for this
# query (presumably a list of 1-tuples, like the other query outputs recorded
# in this notebook; verify).
events_query = """
        SELECT DISTINCT(event)
        FROM game_plays
"""
all_events = run_query(events_query)

### Saving all event types to a pickle file

In [26]:
# Persist the raw query result so the event types can be reloaded without
# querying the database again.
events_path = '../data/all_event_types'
with open(events_path, mode='wb') as events_file:
    pickle.dump(all_events, events_file)

<IPython.core.display.Javascript object>

In [240]:
# Which seasons does the data cover? Fetch every distinct `season` value from
# the game table.
# NOTE(review): the recorded output includes a season value of 0, presumably
# from a placeholder or malformed row; confirm before relying on this list.
seasons_query = """
        SELECT DISTINCT (season)
        FROM game
"""
distinct_seasons = run_query(seasons_query)
distinct_seasons  # display the result

[(0,),
 (20112012,),
 (20102011,),
 (20122013,),
 (20162017,),
 (20142015,),
 (20152016,),
 (20132014,),
 (20172018,),
 (20182019,)]

In [242]:
# Grab a handful of game ids to use as sample inputs for the NHL API.
# The query has no ORDER BY, so which 10 ids come back is up to the database.
# NOTE(review): the recorded output includes a game_id of 0, presumably a
# placeholder row; skip it when picking an id for the API.
gameid_query = """
        SELECT DISTINCT (game_id)
        FROM game
        LIMIT 10
"""
example_game_ids = run_query(gameid_query)
example_game_ids  # display the result

[(0,),
 (2011030222,),
 (2011030224,),
 (2011030411,),
 (2011030413,),
 (2011030415,),
 (2010030311,),
 (2010030313,),
 (2010030315,),
 (2010030317,)]