# 필요한 라이브러리 임포트

In [2]:
# 표준 라이브러리 임포트
from typing import List
import platform

# 서드 파티 라이브러리 임포트
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt

# 로컬 모듈 임포트
from utils import load_dataset

In [3]:
# Detect the operating system and choose a matplotlib font that can render Korean.
system = platform.system()

# Font per platform.system() value:
#   Windows -> Malgun Gothic
#   Darwin  -> AppleGothic (Darwin is what macOS reports)
#   Linux   -> NanumGothic (must be installed beforehand,
#              e.g. `sudo apt-get install -y fonts-nanum`)
korean_font_by_os = {
    'Windows': 'Malgun Gothic',
    'Darwin': 'AppleGothic',
    'Linux': 'NanumGothic',
}

if system in korean_font_by_os:
    plt.rcParams['font.family'] = korean_font_by_os[system]
else:
    # Unknown platform: leave the default font untouched and report it.
    print(f"알 수 없는 OS: {system}")

# Applies on every OS: keep the minus sign from rendering as a broken glyph.
plt.rcParams['axes.unicode_minus'] = False

print(f"OS: {system}, 설정된 폰트: {plt.rcParams['font.family']}")

OS: Linux, 설정된 폰트: ['NanumGothic']


# 배경 이해하기

승률이 제일 낮은 팀을 찾아 개선 방향을 찾자.

In [4]:
# Load the dataset that contains the win percentages (team standings).
ranking = load_dataset('ranking')

In [5]:
# Print the column names, dtypes and non-null counts of the standings data.
ranking.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210342 entries, 0 to 210341
Data columns (total 13 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   TEAM_ID        210342 non-null  int64         
 1   LEAGUE_ID      210342 non-null  int64         
 2   SEASON_ID      210342 non-null  object        
 3   STANDINGSDATE  210342 non-null  datetime64[ns]
 4   CONFERENCE     210342 non-null  object        
 5   TEAM           210342 non-null  object        
 6   G              210342 non-null  int64         
 7   W              210342 non-null  int64         
 8   L              210342 non-null  int64         
 9   W_PCT          210342 non-null  float64       
 10  HOME_RECORD    210342 non-null  object        
 11  ROAD_RECORD    210342 non-null  object        
 12  RETURNTOPLAY   3990 non-null    float64       
dtypes: datetime64[ns](1), float64(2), int64(5), object(5)
memory usage: 20.9+ MB


### 최신 꼴찌를 찾기

In [6]:
# Newest standings date first; within one date, lowest win percentage first,
# so the very first row is the current last-place team.
sorted_ranking = ranking.sort_values(
    by=['STANDINGSDATE', 'W_PCT'],
    ascending=[False, True],
)

# Columns worth showing when eyeballing the standings.
cols = ['STANDINGSDATE', 'TEAM', 'CONFERENCE', 'G', 'W', 'L', 'W_PCT']
sorted_ranking.loc[:, cols]

Unnamed: 0,STANDINGSDATE,TEAM,CONFERENCE,G,W,L,W_PCT
4289,2022-12-22,Detroit,East,34,8,26,0.235
4288,2022-12-22,Charlotte,East,32,8,24,0.250
14,2022-12-22,Houston,West,31,9,22,0.290
13,2022-12-22,San Antonio,West,31,10,21,0.323
4286,2022-12-22,Orlando,East,33,12,21,0.364
...,...,...,...,...,...,...,...
181542,2003-10-01,Detroit,East,82,50,32,0.610
92941,2003-10-01,Minnesota,West,82,51,31,0.622
92940,2003-10-01,Sacramento,West,82,59,23,0.720
92938,2003-10-01,Dallas,West,82,60,22,0.732


In [7]:
# Detroit has the lowest win percentage on the latest standings date,
# so it is the team the rest of the analysis focuses on.
target_team = 'Detroit'

# 날짜 범위 이해하기

In [8]:
# Load the games dataset.
games = load_dataset('games')

In [9]:
# Distinct dates that appear in each dataset.
games_dates = set(games['GAME_DATE_EST'].values)
ranking_dates = set(ranking['STANDINGSDATE'].values)

# Number of distinct dates in games and in ranking, respectively.
len(games_dates), len(ranking_dates)

(4304, 7023)

## 데이터 날짜 전체 범위 확인

In [10]:
# Earliest and latest date present in the games data.
print(f'games의 최소 날짜: {min(games_dates)}')
print(f'games의 최대 날짜: {max(games_dates)}')

games의 최소 날짜: 2003-10-05T00:00:00.000000000
games의 최대 날짜: 2022-12-22T00:00:00.000000000


In [11]:
# Earliest and latest date present in the ranking (standings) data.
print(f'ranking의 최소 날짜: {min(ranking_dates)}')
print(f'ranking의 최대 날짜: {max(ranking_dates)}')

ranking의 최소 날짜: 2003-10-01T00:00:00.000000000
ranking의 최대 날짜: 2022-12-22T00:00:00.000000000


## Games, Preseason, Regular Season별 타임라인

In [12]:
# Split the standings by the first character of SEASON_ID:
# this notebook treats a leading '1' as preseason and a leading '2' as regular season.
preseason_cond = ranking['SEASON_ID'].str.startswith('1')
regular_cond = ranking['SEASON_ID'].str.startswith('2')

# Row subsets for each season type.
preseason_ranking = ranking[preseason_cond]
regular_ranking  = ranking[regular_cond]

In [13]:
# Distinct standings dates for each season type.
preseason_dates = set(preseason_ranking['STANDINGSDATE'].values)
regular_dates = set(regular_ranking['STANDINGSDATE'].values)

# Number of distinct preseason and regular-season dates, respectively.
len(preseason_dates), len(regular_dates)

(429, 6594)

### 연속된 구간으로 쪼개기

In [14]:
# Turn each date set into a chronologically ordered list
# (sorted() already returns a new list for any iterable).
games_dates_lst = sorted(games_dates)
preseason_dates_lst = sorted(preseason_dates)
regular_dates_lst = sorted(regular_dates)

In [15]:
def get_continuous_dates(dates: List[np.datetime64]) -> List[list]:
    """Collapse chronologically sorted dates into runs of consecutive days.

    Two neighbouring dates belong to the same run when they are at most one
    day apart; a gap of more than one day closes the current run and opens a
    new one.

    Args:
        dates: Dates in ascending order.

    Returns:
        A list of ``[start, end]`` pairs (both inclusive), one per run.
        A run made of a single date has ``start == end``. An empty input
        yields an empty list.
    """
    if len(dates) == 0:
        return []

    one_day = np.timedelta64(1, 'D')
    continuous_lst = []
    start = dates[0]
    # Walk over neighbouring pairs; a gap larger than one day ends the run.
    for prev, curr in zip(dates, dates[1:]):
        if curr - prev > one_day:
            continuous_lst.append([start, prev])
            start = curr
    # The last run always ends at the final date of the input (not at the
    # second-to-last one, which is where the loop's last pair begins).
    continuous_lst.append([start, dates[-1]])
    return continuous_lst

In [16]:
# Collapse each sorted date list into [start, end] runs of consecutive days.
games_continuous_dates = get_continuous_dates(games_dates_lst)
preseason_continuous_dates = get_continuous_dates(preseason_dates_lst)
regular_continuous_dates = get_continuous_dates(regular_dates_lst)

### 타임라인 생성

In [17]:
# One record per continuous run, tagged with the category it belongs to.
games_dates_data = [
    {'Category': 'Games', 'Start': start, 'End': end}
    for start, end in games_continuous_dates
]
preseason_dates_data = [
    {'Category': 'Preseason', 'Start': start, 'End': end}
    for start, end in preseason_continuous_dates
]
regular_dates_data = [
    {'Category': 'Regular', 'Start': start, 'End': end}
    for start, end in regular_continuous_dates
]

# Stack the three categories (Games, Preseason, Regular) into one frame for plotting.
dates_data = pd.DataFrame(
    games_dates_data + preseason_dates_data + regular_dates_data
)

In [18]:
# A span whose start equals its end would not show up on the chart,
# so give those spans a one-hour width.
same_day = dates_data['Start'] == dates_data['End']
dates_data.loc[same_day, 'End'] += pd.Timedelta(hours=1)

# Timeline chart: one row per category, coloured by category.
fig = px.timeline(dates_data, x_start="Start", x_end="End", y="Category", color="Category")

# Titles and figure size.
timeline_layout = dict(
    title="카테고리별 기간 타임라인",
    xaxis_title="날짜",
    yaxis_title="카테고리",
    width=900,
    height=400,
)
fig.update_layout(**timeline_layout)

fig.show()

### 시즌별 타임라인 생성

In [19]:
# For seasons only the overall span matters: take the earliest and latest
# standings date per SEASON_ID instead of splitting into continuous runs.
season_dates = (
    ranking
    .groupby('SEASON_ID')
    .agg(
        Start=('STANDINGSDATE', 'min'),
        End=('STANDINGSDATE', 'max'),
    )
    .reset_index()
)

In [20]:
# Timeline chart: one bar per SEASON_ID spanning its first to last standings date.
fig = px.timeline(season_dates, x_start="Start", x_end="End", y="SEASON_ID", color="SEASON_ID")

# Titles and figure size.
season_layout = dict(
    title="시즌별 기간 타임라인",
    xaxis_title="날짜",
    yaxis_title="시즌",
    width=900,
    height=400,
)
fig.update_layout(**season_layout)

fig.show()

# 팀별 Regular 시즌 성적 보기

In [21]:
# Standings are cumulative, so keep only the rows from the last date of each season.
last_date = regular_ranking.groupby('SEASON_ID').agg({'STANDINGSDATE': 'max'})

# Compare every row with the last date of its OWN season. Testing membership
# in the pooled set of all seasons' last dates would also keep rows of one
# season that merely fall on another season's final date.
season_last_date = regular_ranking.groupby('SEASON_ID')['STANDINGSDATE'].transform('max')
final_cond = regular_ranking['STANDINGSDATE'] == season_last_date

# Combined row filter (currently only the final-date condition).
cond = final_cond

# Copy so that adding a column below does not touch regular_ranking.
final_ranking = regular_ranking[cond].copy()

# Rank within each season by win percentage: best team gets 1, ties share the
# smallest rank of their group (method='min').
final_ranking['SEASON_RANK'] = final_ranking.groupby('SEASON_ID')['W_PCT'].rank(method='min', ascending=False)

final_ranking.sort_values(by=['STANDINGSDATE', 'SEASON_RANK'])

Unnamed: 0,TEAM_ID,LEAGUE_ID,SEASON_ID,STANDINGSDATE,CONFERENCE,TEAM,G,W,L,W_PCT,HOME_RECORD,ROAD_RECORD,RETURNTOPLAY,SEASON_RANK
92896,1610612742,0,22002,2003-10-04,West,Dallas,82,60,22,0.732,33-8,27-14,,1.0
92897,1610612759,0,22002,2003-10-04,West,San Antonio,82,60,22,0.732,33-8,27-14,,1.0
92898,1610612758,0,22002,2003-10-04,West,Sacramento,82,59,23,0.720,35-6,24-17,,3.0
92899,1610612750,0,22002,2003-10-04,West,Minnesota,82,51,31,0.622,33-8,18-23,,4.0
92900,1610612747,0,22002,2003-10-04,West,L.A. Lakers,82,50,32,0.610,31-10,19-22,,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4287,1610612764,0,22022,2022-12-22,East,Washington,33,12,21,0.364,8-7,4-14,,25.0
13,1610612759,0,22022,2022-12-22,West,San Antonio,31,10,21,0.323,5-12,5-9,,27.0
14,1610612745,0,22022,2022-12-22,West,Houston,31,9,22,0.290,6-9,3-13,,28.0
4288,1610612766,0,22022,2022-12-22,East,Charlotte,32,8,24,0.250,4-11,4-13,,29.0


In [22]:
# Team order for the charts: start with the first 30 rows of sorted_ranking,
# i.e. the newest standings date, lowest win percentage first.
sorted_teams = sorted_ranking['TEAM'].head(30).tolist()

# Team names that occur anywhere in the data but not in those rows.
all_teams = set(ranking['TEAM'].tolist())
not_in_teams = all_teams - set(sorted_teams)

# Append them alphabetically: iterating the set directly would give an order
# that can change from run to run, making the chart order non-reproducible.
sorted_teams.extend(sorted(not_in_teams))
len(sorted_teams)

34

In [23]:
# Make SEASON_ID and TEAM categorical so that grouping with observed=False
# emits a row for every (season, team) pair. Pairs without data then carry
# NaN, which keeps px.line from shifting the remaining points forward.
final_ranking = final_ranking.astype({'SEASON_ID': 'category', 'TEAM': 'category'})
team_ranking = (
    final_ranking
    .groupby(['SEASON_ID', 'TEAM'], observed=False)
    .agg({'W_PCT': 'max', 'SEASON_RANK': 'max'})
    .reset_index()
)

## 팀별, 시즌별 승률 그래프 그리기

In [24]:
# Win percentage per season, one animation frame per team,
# frames ordered according to sorted_teams.
win_pct_chart_args = dict(
    x="SEASON_ID",
    y="W_PCT",
    animation_frame="TEAM",
    title="팀별 시즌 승률 변화",
    category_orders={"TEAM": sorted_teams},
)
fig = px.line(team_ranking, **win_pct_chart_args)

# Fix the y-axis to the 0..1 range.
fig.update_yaxes(range=(0, 1))
# Leave gaps where a season has no value instead of bridging them.
fig.update_traces(connectgaps=False)
fig.show()

## 팀별, 시즌별 순위 그래프 그리기

In [25]:
# Season rank per season, one animation frame per team,
# frames ordered according to sorted_teams.
# NOTE(review): range_y is set here and the y-axis range is set again by
# update_yaxes below; the later call appears to decide the final range — confirm.
fig = px.line(
    team_ranking,
    x="SEASON_ID",
    y="SEASON_RANK",
    animation_frame="TEAM",
    title="팀별 시즌 순위 변화",
    category_orders={"TEAM": sorted_teams},
    range_y=[0, team_ranking["SEASON_RANK"].max()+1]
)

fig.update_yaxes(range=[30, 0])   # a smaller rank number is a better rank, so the axis is reversed
# Leave gaps where a season has no value instead of bridging them.
fig.update_traces(connectgaps=False)
fig.show()