# 기본 모듈

In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warning messages so they do not clutter the cell outputs.
# NOTE(review): 'ignore' with no category silences every warning for the rest
# of the session, including library deprecation warnings - consider narrowing it.
import warnings
warnings.filterwarnings('ignore')


# seaborn 에서 제공하는 그래프 테마
# sns.set_style('ticks')
# sns.set_style('darkgrid')
# sns.set_style('whitegrid')

# Default matplotlib settings applied to every figure drawn below
plt.rcParams.update({
    # Font setting - Windows
    'font.family': 'Malgun Gothic',
    # Figure size
    'figure.figsize': (12, 6),
    # Text size
    'font.size': 14,
    # Keep the minus sign from breaking when a custom font is set
    'axes.unicode_minus': False,
})

# 데이터 읽어오기

In [2]:
# URL of the page the game data is read from
site = 'https://www.basketball-reference.com/leagues/NBA_2018_games.html'

In [3]:
# read_html: when the given page contains table tags, one DataFrame is created per table tag.
# The result is a list of DataFrames.
tables = pd.read_html(site)
# NOTE(review): displaying the list prints every parsed frame; tables[0] is inspected in the next cell.
tables

[                  Date Start (ET)        Visitor/Neutral  PTS  \
 0    Tue, Oct 17, 2017      8:01p         Boston Celtics   99   
 1    Tue, Oct 17, 2017     10:30p        Houston Rockets  122   
 2    Wed, Oct 18, 2017      7:00p      Charlotte Hornets   90   
 3    Wed, Oct 18, 2017      7:00p          Brooklyn Nets  131   
 4    Wed, Oct 18, 2017      7:00p             Miami Heat  109   
 ..                 ...        ...                    ...  ...   
 99   Mon, Oct 30, 2017     10:30p  Golden State Warriors  141   
 100  Tue, Oct 31, 2017      7:00p       Sacramento Kings   83   
 101  Tue, Oct 31, 2017      7:30p           Phoenix Suns  122   
 102  Tue, Oct 31, 2017      8:00p  Oklahoma City Thunder  110   
 103  Tue, Oct 31, 2017     10:30p        Detroit Pistons   93   
 
               Home/Neutral  PTS.1 Unnamed: 6 Unnamed: 7  Attend.  Notes  
 0      Cleveland Cavaliers    102  Box Score        NaN    20562    NaN  
 1    Golden State Warriors    121  Box Score        NaN

In [6]:
# Show the first parsed table (the game schedule)
tables[0]

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 6,Unnamed: 7,Attend.,Notes
0,"Tue, Oct 17, 2017",8:01p,Boston Celtics,99,Cleveland Cavaliers,102,Box Score,,20562,
1,"Tue, Oct 17, 2017",10:30p,Houston Rockets,122,Golden State Warriors,121,Box Score,,19596,
2,"Wed, Oct 18, 2017",7:00p,Charlotte Hornets,90,Detroit Pistons,102,Box Score,,20491,
3,"Wed, Oct 18, 2017",7:00p,Brooklyn Nets,131,Indiana Pacers,140,Box Score,,15008,
4,"Wed, Oct 18, 2017",7:00p,Miami Heat,109,Orlando Magic,116,Box Score,,18846,
...,...,...,...,...,...,...,...,...,...,...
99,"Mon, Oct 30, 2017",10:30p,Golden State Warriors,141,Los Angeles Clippers,113,Box Score,,19068,
100,"Tue, Oct 31, 2017",7:00p,Sacramento Kings,83,Indiana Pacers,101,Box Score,,12245,
101,"Tue, Oct 31, 2017",7:30p,Phoenix Suns,122,Brooklyn Nets,114,Box Score,,12936,
102,"Tue, Oct 31, 2017",8:00p,Oklahoma City Thunder,110,Milwaukee Bucks,91,Box Score,,16713,


In [17]:
# Save the first parsed table to disk.
# If the data contains Korean text, passing "encoding = 'utf-8-sig'" is recommended.
# to_csv does not create missing folders, so make sure the target directory
# exists first; otherwise the save fails on a fresh checkout.
os.makedirs('data2', exist_ok = True)
tables[0].to_csv('data2/nba_2018.csv', index = False)
print('저장 완료 !')

저장 완료 !


# 데이터 파악하기

In [18]:
# Load the saved CSV back into a DataFrame for the analysis below
df1 = pd.read_csv('data2/nba_2018.csv')
df1

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 6,Unnamed: 7,Attend.,Notes
0,"Tue, Oct 17, 2017",8:01p,Boston Celtics,99,Cleveland Cavaliers,102,Box Score,,20562,
1,"Tue, Oct 17, 2017",10:30p,Houston Rockets,122,Golden State Warriors,121,Box Score,,19596,
2,"Wed, Oct 18, 2017",7:00p,Charlotte Hornets,90,Detroit Pistons,102,Box Score,,20491,
3,"Wed, Oct 18, 2017",7:00p,Brooklyn Nets,131,Indiana Pacers,140,Box Score,,15008,
4,"Wed, Oct 18, 2017",7:00p,Miami Heat,109,Orlando Magic,116,Box Score,,18846,
...,...,...,...,...,...,...,...,...,...,...
99,"Mon, Oct 30, 2017",10:30p,Golden State Warriors,141,Los Angeles Clippers,113,Box Score,,19068,
100,"Tue, Oct 31, 2017",7:00p,Sacramento Kings,83,Indiana Pacers,101,Box Score,,12245,
101,"Tue, Oct 31, 2017",7:30p,Phoenix Suns,122,Brooklyn Nets,114,Box Score,,12936,
102,"Tue, Oct 31, 2017",8:00p,Oklahoma City Thunder,110,Milwaukee Bucks,91,Box Score,,16713,


In [19]:
# Inspect the DataFrame summary: column names, non-null counts and dtypes
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Date             104 non-null    object 
 1   Start (ET)       104 non-null    object 
 2   Visitor/Neutral  104 non-null    object 
 3   PTS              104 non-null    int64  
 4   Home/Neutral     104 non-null    object 
 5   PTS.1            104 non-null    int64  
 6   Unnamed: 6       104 non-null    object 
 7   Unnamed: 7       2 non-null      object 
 8   Attend.          104 non-null    int64  
 9   Notes            0 non-null      float64
dtypes: float64(1), int64(3), object(6)
memory usage: 8.2+ KB


# 데이터 전처리

In [42]:
# Replace the scraped column headers with Korean names, in positional order.
# (This cell assigns new names; it does not merely inspect them.)
df1.columns = [
    '경기일자',      # Date
    '경기시간',      # Start (ET)
    '방문팀',        # Visitor/Neutral
    '방문팀점수',    # PTS
    '홈팀',          # Home/Neutral
    '홈팀점수',      # PTS.1
    'Box',           # Unnamed: 6
    '연장전여부',    # Unnamed: 7
    '관중수',        # Attend.
    'Notes',         # Notes
]
df1

Unnamed: 0,경기일자,경기시간,방문팀,방문팀점수,홈팀,홈팀점수,Box,연장전여부,관중수,Notes
0,2017-10-17,8:01p,Boston Celtics,99,Cleveland Cavaliers,102,Box Score,NOT,20562,
1,2017-10-17,10:30p,Houston Rockets,122,Golden State Warriors,121,Box Score,NOT,19596,
2,2017-10-18,7:00p,Charlotte Hornets,90,Detroit Pistons,102,Box Score,NOT,20491,
3,2017-10-18,7:00p,Brooklyn Nets,131,Indiana Pacers,140,Box Score,NOT,15008,
4,2017-10-18,7:00p,Miami Heat,109,Orlando Magic,116,Box Score,NOT,18846,
...,...,...,...,...,...,...,...,...,...,...
99,2017-10-30,10:30p,Golden State Warriors,141,Los Angeles Clippers,113,Box Score,NOT,19068,
100,2017-10-31,7:00p,Sacramento Kings,83,Indiana Pacers,101,Box Score,NOT,12245,
101,2017-10-31,7:30p,Phoenix Suns,122,Brooklyn Nets,114,Box Score,NOT,12936,
102,2017-10-31,8:00p,Oklahoma City Thunder,110,Milwaukee Bucks,91,Box Score,NOT,16713,


In [22]:
# Count the missing values in each column
df1.isna().sum()

경기일자       0
경기시간       0
방문팀        0
방문팀점수      0
홈팀         0
홈팀점수       0
Box        0
연장전여부    102
관중수        0
Note     104
dtype: int64

In [24]:
# Check which distinct values are stored in the overtime column.
# value_counts skips missing values by default, so only non-null entries are counted.
df1['연장전여부'].value_counts()

OT    2
Name: 연장전여부, dtype: int64

In [38]:
# A missing overtime value means the game did not go to overtime,
# so the missing entries are filled with 'NOT'.
# The filled Series is assigned back to the column instead of calling
# fillna(inplace = True) on df1['연장전여부']: the in-place call on a selected
# column is chained assignment, which is deprecated and leaves df1 unchanged
# once Copy-on-Write is active.
df1['연장전여부'] = df1['연장전여부'].fillna('NOT')
df1['연장전여부'].value_counts()

NOT    102
OT       2
Name: 연장전여부, dtype: int64

In [39]:
# Re-check the per-column missing counts after filling the overtime column
df1.isna().sum()

경기일자       0
경기시간       0
방문팀        0
방문팀점수      0
홈팀         0
홈팀점수       0
Box        0
연장전여부      0
관중수        0
Notes    104
dtype: int64

In [40]:
# Check the type of the values in the date column.
# NOTE(review): this cell sits above the to_datetime conversion cell, yet the
# output shown (datetime64 / Timestamp) comes from a run after that conversion.
# Run top to bottom, the column would still hold the strings read from the CSV.
print(df1['경기일자'])
type(df1['경기일자'][0])

0     2017-10-17
1     2017-10-17
2     2017-10-18
3     2017-10-18
4     2017-10-18
         ...    
99    2017-10-30
100   2017-10-31
101   2017-10-31
102   2017-10-31
103   2017-10-31
Name: 경기일자, Length: 104, dtype: datetime64[ns]


pandas._libs.tslibs.timestamps.Timestamp

In [35]:
# Convert the date strings into real datetime values.
# Python strftime/strptime format codes:
# https://docs.python.org/3/library/datetime.html#strftime-and-behavior
# The raw text looks like: Tue, Oct 17, 2017
date_format = '%a, %b %d, %Y'
df1['경기일자'] = pd.to_datetime(df1['경기일자'], format = date_format)
df1['경기일자']

0     2017-10-17
1     2017-10-17
2     2017-10-18
3     2017-10-18
4     2017-10-18
         ...    
99    2017-10-30
100   2017-10-31
101   2017-10-31
102   2017-10-31
103   2017-10-31
Name: 경기일자, Length: 104, dtype: datetime64[ns]

In [43]:
# Drop the Notes and Box columns; the result is a new frame and df1 is left untouched
df1_na = df1.drop(columns = ['Notes', 'Box'])
df1_na

Unnamed: 0,경기일자,경기시간,방문팀,방문팀점수,홈팀,홈팀점수,연장전여부,관중수
0,2017-10-17,8:01p,Boston Celtics,99,Cleveland Cavaliers,102,NOT,20562
1,2017-10-17,10:30p,Houston Rockets,122,Golden State Warriors,121,NOT,19596
2,2017-10-18,7:00p,Charlotte Hornets,90,Detroit Pistons,102,NOT,20491
3,2017-10-18,7:00p,Brooklyn Nets,131,Indiana Pacers,140,NOT,15008
4,2017-10-18,7:00p,Miami Heat,109,Orlando Magic,116,NOT,18846
...,...,...,...,...,...,...,...,...
99,2017-10-30,10:30p,Golden State Warriors,141,Los Angeles Clippers,113,NOT,19068
100,2017-10-31,7:00p,Sacramento Kings,83,Indiana Pacers,101,NOT,12245
101,2017-10-31,7:30p,Phoenix Suns,122,Brooklyn Nets,114,NOT,12936
102,2017-10-31,8:00p,Oklahoma City Thunder,110,Milwaukee Bucks,91,NOT,16713


# 필요한 컬럼만 추출해서 정리한다. (1)

In [45]:
# Keep only the date, the two teams and their scores
a1 = [
    '경기일자',
    '방문팀',
    '방문팀점수',
    '홈팀',
    '홈팀점수',
]
games_nae = df1_na.loc[:, a1]
games_nae

Unnamed: 0,경기일자,방문팀,방문팀점수,홈팀,홈팀점수
0,2017-10-17,Boston Celtics,99,Cleveland Cavaliers,102
1,2017-10-17,Houston Rockets,122,Golden State Warriors,121
2,2017-10-18,Charlotte Hornets,90,Detroit Pistons,102
3,2017-10-18,Brooklyn Nets,131,Indiana Pacers,140
4,2017-10-18,Miami Heat,109,Orlando Magic,116
...,...,...,...,...,...
99,2017-10-30,Golden State Warriors,141,Los Angeles Clippers,113
100,2017-10-31,Sacramento Kings,83,Indiana Pacers,101
101,2017-10-31,Phoenix Suns,122,Brooklyn Nets,114
102,2017-10-31,Oklahoma City Thunder,110,Milwaukee Bucks,91


In [48]:
# Move the game-date column into the index.
# append = True keeps the existing index as well, producing a two-level index.
games_naes = games_nae.set_index(['경기일자'], append = True)
games_naes

Unnamed: 0_level_0,Unnamed: 1_level_0,방문팀,방문팀점수,홈팀,홈팀점수
Unnamed: 0_level_1,경기일자,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2017-10-17,Boston Celtics,99,Cleveland Cavaliers,102
1,2017-10-17,Houston Rockets,122,Golden State Warriors,121
2,2017-10-18,Charlotte Hornets,90,Detroit Pistons,102
3,2017-10-18,Brooklyn Nets,131,Indiana Pacers,140
4,2017-10-18,Miami Heat,109,Orlando Magic,116
...,...,...,...,...,...
99,2017-10-30,Golden State Warriors,141,Los Angeles Clippers,113
100,2017-10-31,Sacramento Kings,83,Indiana Pacers,101
101,2017-10-31,Phoenix Suns,122,Brooklyn Nets,114
102,2017-10-31,Oklahoma City Thunder,110,Milwaukee Bucks,91


In [49]:
# Name the two index levels: game number and game date
games_naesr = games_naes.rename_axis(index = ['게임', '경기일자'])
games_naesr

Unnamed: 0_level_0,Unnamed: 1_level_0,방문팀,방문팀점수,홈팀,홈팀점수
게임,경기일자,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2017-10-17,Boston Celtics,99,Cleveland Cavaliers,102
1,2017-10-17,Houston Rockets,122,Golden State Warriors,121
2,2017-10-18,Charlotte Hornets,90,Detroit Pistons,102
3,2017-10-18,Brooklyn Nets,131,Indiana Pacers,140
4,2017-10-18,Miami Heat,109,Orlando Magic,116
...,...,...,...,...,...
99,2017-10-30,Golden State Warriors,141,Los Angeles Clippers,113
100,2017-10-31,Sacramento Kings,83,Indiana Pacers,101
101,2017-10-31,Phoenix Suns,122,Brooklyn Nets,114
102,2017-10-31,Oklahoma City Thunder,110,Milwaukee Bucks,91


## melt 함수 사용해보기

In [50]:
# Small toy frame used to demonstrate melt: two label columns and two numeric columns
df = pd.DataFrame({
    'A' : [f'a{i}' for i in range(1, 5)],
    'B' : [f'b{i}' for i in range(1, 5)],
    'C' : list(range(1, 5)),
    'D' : [100 * i for i in range(1, 5)],
})
df

Unnamed: 0,A,B,C,D
0,a1,b1,1,100
1,a2,b2,2,200
2,a3,b3,3,300
3,a4,b4,4,400


In [52]:
# Keep A as the identifier and stack columns B and C into variable/value pairs
pd.melt(df, id_vars = ['A'], value_vars = ['B', 'C'])

Unnamed: 0,A,variable,value
0,a1,B,b1
1,a2,B,b2
2,a3,B,b3
3,a4,B,b4
4,a1,C,1
5,a2,C,2
6,a3,C,3
7,a4,C,4


# 필요한 컬럼만 추출해서 정리한다. (2)

In [53]:
# Reset the index so the game number and game date become ordinary columns again
# NOTE(review): a1 was a list of column names earlier in the notebook; the name is reused here for a DataFrame.
a1 = games_naesr.reset_index()
a1

Unnamed: 0,게임,경기일자,방문팀,방문팀점수,홈팀,홈팀점수
0,0,2017-10-17,Boston Celtics,99,Cleveland Cavaliers,102
1,1,2017-10-17,Houston Rockets,122,Golden State Warriors,121
2,2,2017-10-18,Charlotte Hornets,90,Detroit Pistons,102
3,3,2017-10-18,Brooklyn Nets,131,Indiana Pacers,140
4,4,2017-10-18,Miami Heat,109,Orlando Magic,116
...,...,...,...,...,...,...
99,99,2017-10-30,Golden State Warriors,141,Los Angeles Clippers,113
100,100,2017-10-31,Sacramento Kings,83,Indiana Pacers,101
101,101,2017-10-31,Phoenix Suns,122,Brooklyn Nets,114
102,102,2017-10-31,Oklahoma City Thunder,110,Milwaukee Bucks,91


In [58]:
# Reshape to long form: each game yields one row for the visiting team and one for the home team
tidy = a1.melt(
    id_vars = ['게임', '경기일자'],
    value_vars = ['방문팀', '홈팀'],
    var_name = '팀구분',
    value_name = '팀',
)
# Show both rows that belong to game 0
tidy[tidy['게임'] == 0]

Unnamed: 0,게임,경기일자,팀구분,팀
0,0,2017-10-17,방문팀,Boston Celtics
104,0,2017-10-17,홈팀,Cleveland Cavaliers


In [59]:
# diff returns how much each value changed relative to the previous value.
# Take one team's rows, order them by game date, then measure the gap between consecutive games.
a1 = tidy[tidy['팀'] == 'Boston Celtics']
a2 = a1.sort_values(by = '경기일자')
a2['경기일자'].diff()

0        NaT
110   1 days
19    2 days
154   4 days
66    2 days
79    2 days
195   2 days
Name: 경기일자, dtype: timedelta64[ns]

In [67]:
# Order all rows by game date, then, within each team, compute how long after
# that team's previous game each game was played.
tidy = tidy.sort_values('경기일자')
a1 = tidy.groupby('팀')['경기일자'].diff()

# Express the gaps as a number of days
a2 = a1.dt.days

# Rows with a one-day gap are games played on consecutive days
tidy.loc[a2 == 1, ['경기일자', '팀']]

Unnamed: 0,경기일자,팀
12,2017-10-18,Houston Rockets
110,2017-10-18,Boston Celtics
25,2017-10-20,Los Angeles Lakers
30,2017-10-21,Golden State Warriors
26,2017-10-21,Philadelphia 76ers
32,2017-10-21,Detroit Pistons
139,2017-10-21,Utah Jazz
28,2017-10-21,Orlando Magic
29,2017-10-21,Dallas Mavericks
31,2017-10-21,Indiana Pacers


# 각 팀의 휴식일을 구한다

In [69]:
# Days since each team's previous game: per-team date difference, expressed in days
ab = tidy.groupby('팀')['경기일자'].diff().dt.days
ab

0      NaN
1      NaN
104    NaN
105    NaN
6      NaN
      ... 
100    2.0
206    2.0
204    2.0
205    2.0
207    3.0
Name: 경기일자, Length: 208, dtype: float64

In [70]:
# Check how many missing values the rest-day series contains
ab.isna().sum()

30

In [71]:
# A team's first game has no earlier game to compare against, so exactly one
# missing value per team is expected. Those entries are filled with 0.
ab = ab.fillna(0)
ab

0      0.0
1      0.0
104    0.0
105    0.0
6      0.0
      ... 
100    2.0
206    2.0
204    2.0
205    2.0
207    3.0
Name: 경기일자, Length: 208, dtype: float64

In [72]:
# Attach the rest-day values as a new column (aligned on the index) and restore the original row order
tidy = tidy.assign(휴식일 = ab).sort_index()
tidy

Unnamed: 0,게임,경기일자,팀구분,팀,휴식일
0,0,2017-10-17,방문팀,Boston Celtics,0.0
1,1,2017-10-17,방문팀,Houston Rockets,0.0
2,2,2017-10-18,방문팀,Charlotte Hornets,0.0
3,3,2017-10-18,방문팀,Brooklyn Nets,0.0
4,4,2017-10-18,방문팀,Miami Heat,0.0
...,...,...,...,...,...
203,99,2017-10-30,홈팀,Los Angeles Clippers,2.0
204,100,2017-10-31,홈팀,Indiana Pacers,2.0
205,101,2017-10-31,홈팀,Brooklyn Nets,2.0
206,102,2017-10-31,홈팀,Milwaukee Bucks,2.0
