## 필요한 라이브러리 불러오기

In [1]:
import pandas as pd

from utils import load_dataset
from config import DATA_DIR

In [2]:
# Load the 'games' and 'ranking' datasets through the project helper.
games = load_dataset('games')
ranking = load_dataset('ranking')
# Preview the first 10 rows of the games table (displayed as the cell result).
games.head(10)

Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,...,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,2022-12-22,22200477,Final,1610612740,1610612759,2022,1610612740,126.0,0.484,0.926,...,25.0,46.0,1610612759,117.0,0.478,0.815,0.321,23.0,44.0,1
1,2022-12-22,22200478,Final,1610612762,1610612764,2022,1610612762,120.0,0.488,0.952,...,16.0,40.0,1610612764,112.0,0.561,0.765,0.333,20.0,37.0,1
2,2022-12-21,22200466,Final,1610612739,1610612749,2022,1610612739,114.0,0.482,0.786,...,22.0,37.0,1610612749,106.0,0.47,0.682,0.433,20.0,46.0,1
3,2022-12-21,22200467,Final,1610612755,1610612765,2022,1610612755,113.0,0.441,0.909,...,27.0,49.0,1610612765,93.0,0.392,0.735,0.261,15.0,46.0,1
4,2022-12-21,22200468,Final,1610612737,1610612741,2022,1610612737,108.0,0.429,1.0,...,22.0,47.0,1610612741,110.0,0.5,0.773,0.292,20.0,47.0,0
5,2022-12-21,22200469,Final,1610612738,1610612754,2022,1610612738,112.0,0.386,0.84,...,26.0,62.0,1610612754,117.0,0.469,0.778,0.462,27.0,47.0,0
6,2022-12-21,22200470,Final,1610612751,1610612744,2022,1610612751,143.0,0.643,0.875,...,42.0,32.0,1610612744,113.0,0.494,0.76,0.364,32.0,36.0,1
7,2022-12-21,22200471,Final,1610612752,1610612761,2022,1610612752,106.0,0.553,0.611,...,25.0,38.0,1610612761,113.0,0.447,0.909,0.265,17.0,38.0,0
8,2022-12-21,22200472,Final,1610612745,1610612753,2022,1610612745,110.0,0.466,0.647,...,22.0,49.0,1610612753,116.0,0.451,0.697,0.297,19.0,45.0,0
9,2022-12-21,22200473,Final,1610612750,1610612742,2022,1610612750,99.0,0.494,0.7,...,23.0,39.0,1610612742,104.0,0.453,0.852,0.333,17.0,39.0,0


## 날짜 분포 확인하기

In [17]:
# Collect the distinct dates that appear in each table.
games_dates = set(games['GAME_DATE_EST'].values)
ranking_dates = set(ranking['STANDINGSDATE'].values)

# Number of distinct dates per table (displayed as the cell result).
len(games_dates), len(ranking_dates)

(4304, 7023)

### 데이터셋 전체 범위 확인

In [76]:
# Earliest and latest GAME_DATE_EST values found in the games table.
min(games_dates), max(games_dates)

(np.datetime64('2003-10-05T00:00:00.000000000'),
 np.datetime64('2022-12-22T00:00:00.000000000'))

In [77]:
# Earliest and latest STANDINGSDATE values found in the ranking table.
min(ranking_dates), max(ranking_dates)

(np.datetime64('2003-10-01T00:00:00.000000000'),
 np.datetime64('2022-12-22T00:00:00.000000000'))

### Games, Preseason, Regular Season별 타임라인

In [16]:
# Split the ranking table by the first character of SEASON_ID.
# NOTE(review): the variable names imply that a leading '1' marks preseason
# and a leading '2' marks the regular season — confirm against the dataset docs.
preseason_cond = ranking['SEASON_ID'].str.startswith('1')
regular_cond = ranking['SEASON_ID'].str.startswith('2')

# Boolean-mask selection: rows of `ranking` matching each prefix.
preseason_ranking = ranking[preseason_cond]
regular_ranking  = ranking[regular_cond]

In [18]:
# Distinct standings dates within each SEASON_ID-prefix subset.
preseason_dates = set(preseason_ranking['STANDINGSDATE'].values)
regular_dates = set(regular_ranking['STANDINGSDATE'].values)

# Number of distinct dates per subset (displayed as the cell result).
len(preseason_dates), len(regular_dates)

(429, 6594)

In [23]:
# Chronologically ordered lists of the distinct dates; sorted() accepts any
# iterable, so the sets are passed to it directly.
games_dates_lst = sorted(games_dates)
preseason_dates_lst = sorted(preseason_dates)
regular_dates_lst = sorted(regular_dates)

In [45]:
import numpy as np
from typing import List

def get_continuous_dates(dates: List[np.datetime64]) -> List[list]:
    """Group sorted dates into runs of consecutive days.

    Two neighbouring dates belong to the same run when they are at most one
    day apart; a gap of more than one day closes the current run and starts a
    new one.

    Args:
        dates: Dates sorted in ascending order.

    Returns:
        A list of ``[start, end]`` pairs (both inclusive), one per run, in
        chronological order. A run made of a single date has
        ``start == end``. An empty input yields an empty list.
    """
    # Nothing to group; also avoids indexing into an empty sequence below.
    if len(dates) == 0:
        return []

    one_day = np.timedelta64(1, 'D')
    continuous_lst = []
    start = dates[0]
    # diff at position i is dates[i + 1] - dates[i].
    for i, diff in enumerate(np.diff(dates)):
        if diff > one_day:
            continuous_lst.append([start, dates[i]])
            start = dates[i + 1]
    # Close the final run with the true last date. Using the loop index here
    # would drop the last date and fail when the loop never executes.
    continuous_lst.append([start, dates[-1]])
    return continuous_lst

In [46]:
# Collapse each sorted date list into [start, end] runs via get_continuous_dates.
games_continuous_dates = get_continuous_dates(games_dates_lst)
preseason_continuous_dates = get_continuous_dates(preseason_dates_lst)
regular_continuous_dates = get_continuous_dates(regular_dates_lst)

In [47]:
# One record per [start, end] run, tagged with the category it came from.
games_dates_data = [
    dict(Category='Games', Start=start, End=end)
    for start, end in games_continuous_dates
]
preseason_dates_data = [
    dict(Category='Preseason', Start=start, End=end)
    for start, end in preseason_continuous_dates
]
regular_dates_data = [
    dict(Category='Regular', Start=start, End=end)
    for start, end in regular_continuous_dates
]

# Single flat list of records: games first, then preseason, then regular.
dates_data = [*games_dates_data, *preseason_dates_data, *regular_dates_data]

In [None]:
import plotly.express as px

# Convert the interval records into a DataFrame for plotting.
dates_data = pd.DataFrame(dates_data)

# Timeline chart: one bar per [Start, End] interval, one row and colour per
# category. update_layout returns the figure, so the calls are chained.
fig = px.timeline(
    dates_data, x_start="Start", x_end="End", y="Category", color="Category",
).update_layout(
    title="카테고리별 기간 타임라인",
    xaxis_title="날짜",
    yaxis_title="카테고리",
    width=900,
    height=400,
)

fig.show()

### 문제 해결 - Start와 End가 같으면 구간 길이가 0이 되어 그래프에 표시되지 않음


In [73]:
# Intervals whose Start equals End do not show up on the chart, so push the
# End of those rows forward by one hour to give them a visible width.
dates_data.loc[dates_data['Start'].eq(dates_data['End']), 'End'] += pd.Timedelta(hours=1)

# Timeline chart: same layout as before, redrawn with the adjusted intervals.
fig = px.timeline(
    dates_data, x_start="Start", x_end="End", y="Category", color="Category",
).update_layout(
    title="카테고리별 기간 타임라인",
    xaxis_title="날짜",
    yaxis_title="카테고리",
    width=900,
    height=400,
)

fig.show()

In [55]:
# First and last STANDINGSDATE recorded for each SEASON_ID.
season_dates = (
    ranking
    .groupby('SEASON_ID')['STANDINGSDATE']
    .agg(Start='min', End='max')
    .reset_index()
)

In [None]:
# Timeline chart: one bar per SEASON_ID spanning its first to last standings
# date. update_layout returns the figure, so the calls are chained.
fig = px.timeline(
    season_dates, x_start="Start", x_end="End", y="SEASON_ID", color="SEASON_ID",
).update_layout(
    title="시즌별 기간 타임라인",
    xaxis_title="날짜",
    yaxis_title="시즌",
    width=900,
    height=400,
)

fig.show()

In [79]:
# Inspect the regular-season standings rows whose STANDINGSDATE equals '2018-06'.
# NOTE(review): the rows displayed are all dated 2018-06-01, so the partial
# string appears to be parsed as the first day of the month rather than
# matching every day in June — confirm this is the intended filter.
regular_ranking[regular_ranking['STANDINGSDATE'] == '2018-06']

Unnamed: 0,TEAM_ID,LEAGUE_ID,SEASON_ID,STANDINGSDATE,CONFERENCE,TEAM,G,W,L,W_PCT,HOME_RECORD,ROAD_RECORD,RETURNTOPLAY
101187,1610612745,0,22017,2018-06-01,West,Houston,82,65,17,0.793,34-7,31-10,
101188,1610612744,0,22017,2018-06-01,West,Golden State,82,58,24,0.707,29-12,29-12,
101189,1610612757,0,22017,2018-06-01,West,Portland,82,49,33,0.598,28-13,21-20,
101190,1610612740,0,22017,2018-06-01,West,New Orleans,82,48,34,0.585,24-17,24-17,
101191,1610612760,0,22017,2018-06-01,West,Oklahoma City,82,48,34,0.585,27-14,21-20,
101192,1610612762,0,22017,2018-06-01,West,Utah,82,48,34,0.585,28-13,20-21,
101193,1610612750,0,22017,2018-06-01,West,Minnesota,82,47,35,0.573,30-11,17-24,
101194,1610612759,0,22017,2018-06-01,West,San Antonio,82,47,35,0.573,33-8,14-27,
101195,1610612743,0,22017,2018-06-01,West,Denver,82,46,36,0.561,31-10,15-26,
101196,1610612746,0,22017,2018-06-01,West,LA Clippers,82,42,40,0.512,22-19,20-21,


In [69]:
# Look up the game rows whose GAME_DATE_EST is 2019-05-23.
games[games['GAME_DATE_EST'] == '2019-05-23']

Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,...,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
19659,2019-05-23,41800305,Final,1610612749,1610612761,2018,1610612749,99.0,0.452,0.722,...,26.0,53.0,1610612761,105.0,0.369,0.806,0.419,19.0,45.0,0
