# 10.1 농구데이터 정리

- 전처리 : 데이터 분석을 하기 전에 
  - 사전에 Cleansing되어야 할 데이터가 있거나 
  - 사전에 정비되어야 할 데이터가 있는 경우에 데이터를 가공하는 작업 
- 후처리 : 기존에 분석된 자료를 보고 정밀도가 떨어졌을 경우 
  - 데이터의 정밀도를 높이기 위해 사후에 다시 데이터를 가공하는 작업  
  

- 전처리, 후처리 작업은 사전에 정해진 것이 아니고   
  - 상황에 따라서 유동적으로 적용될 경우가 매우 많다.  
  - 대부분의 전처리작업은 예측가능하나 
  - 후처리 작업은 데이터의 Cleansing 패턴을 일정하게 만들어서 
  - 반복시키도록 유도하는 것이 필요하다.  
  
  
- 참조 자료 : https://tomaugspurger.github.io/modern-5-tidy 

In [1]:
import warnings     # 파이썬 기본적인 경고메세지 처리하기     

warnings.filterwarnings(action='ignore')      # 기본적인 경고메세지 무시하기 
# warnings.filterwarnings(action='default')    # 기본적인 경고메세지 기본값으로 표시하기   

In [2]:
import pandas as pd

##  예제 10-1 데이터 알아보기

In [3]:
tables = pd.read_html("http://www.basketball-reference.com/leagues/NBA_2018_games.html")
tables

[                  Date Start (ET)        Visitor/Neutral  PTS  \
 0    Tue, Oct 17, 2017      8:01p         Boston Celtics   99   
 1    Tue, Oct 17, 2017     10:30p        Houston Rockets  122   
 2    Wed, Oct 18, 2017      7:00p      Charlotte Hornets   90   
 3    Wed, Oct 18, 2017      7:00p          Brooklyn Nets  131   
 4    Wed, Oct 18, 2017      7:00p             Miami Heat  109   
 ..                 ...        ...                    ...  ...   
 99   Mon, Oct 30, 2017     10:30p  Golden State Warriors  141   
 100  Tue, Oct 31, 2017      7:00p       Sacramento Kings   83   
 101  Tue, Oct 31, 2017      7:30p           Phoenix Suns  122   
 102  Tue, Oct 31, 2017      8:00p  Oklahoma City Thunder  110   
 103  Tue, Oct 31, 2017     10:30p        Detroit Pistons   93   
 
               Home/Neutral  PTS.1 Unnamed: 6 Unnamed: 7  Attend.  Notes  
 0      Cleveland Cavaliers    102  Box Score        NaN    20562    NaN  
 1    Golden State Warriors    121  Box Score        NaN

In [4]:
# tables.to_csv('../data/10장/basketball.csv', encoding='UTF-8-SIG')
# tables.to_csv('basketball.csv', encoding='UTF-8-SIG')

In [5]:
type(tables)     # [] 

list

In [6]:
type(tables[0])

pandas.core.frame.DataFrame

In [7]:
games = tables[0]   # [0] : DataFrame 으로 되어 있으므로 to_CSV가 가능 
games

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 6,Unnamed: 7,Attend.,Notes
0,"Tue, Oct 17, 2017",8:01p,Boston Celtics,99,Cleveland Cavaliers,102,Box Score,,20562,
1,"Tue, Oct 17, 2017",10:30p,Houston Rockets,122,Golden State Warriors,121,Box Score,,19596,
2,"Wed, Oct 18, 2017",7:00p,Charlotte Hornets,90,Detroit Pistons,102,Box Score,,20491,
3,"Wed, Oct 18, 2017",7:00p,Brooklyn Nets,131,Indiana Pacers,140,Box Score,,15008,
4,"Wed, Oct 18, 2017",7:00p,Miami Heat,109,Orlando Magic,116,Box Score,,18846,
...,...,...,...,...,...,...,...,...,...,...
99,"Mon, Oct 30, 2017",10:30p,Golden State Warriors,141,Los Angeles Clippers,113,Box Score,,19068,
100,"Tue, Oct 31, 2017",7:00p,Sacramento Kings,83,Indiana Pacers,101,Box Score,,12245,
101,"Tue, Oct 31, 2017",7:30p,Phoenix Suns,122,Brooklyn Nets,114,Box Score,,12936,
102,"Tue, Oct 31, 2017",8:00p,Oklahoma City Thunder,110,Milwaukee Bucks,91,Box Score,,16713,


In [8]:
games.to_csv("../data/nav_2018.csv",encoding='utf-8-sig',index=False)

In [9]:
games_1 = pd.read_csv("../data/nav_2018.csv",encoding='utf-8') 
games_1.head(10)

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 6,Unnamed: 7,Attend.,Notes
0,"Tue, Oct 17, 2017",8:01p,Boston Celtics,99,Cleveland Cavaliers,102,Box Score,,20562,
1,"Tue, Oct 17, 2017",10:30p,Houston Rockets,122,Golden State Warriors,121,Box Score,,19596,
2,"Wed, Oct 18, 2017",7:00p,Charlotte Hornets,90,Detroit Pistons,102,Box Score,,20491,
3,"Wed, Oct 18, 2017",7:00p,Brooklyn Nets,131,Indiana Pacers,140,Box Score,,15008,
4,"Wed, Oct 18, 2017",7:00p,Miami Heat,109,Orlando Magic,116,Box Score,,18846,
5,"Wed, Oct 18, 2017",7:00p,Philadelphia 76ers,115,Washington Wizards,120,Box Score,,20356,
6,"Wed, Oct 18, 2017",7:30p,Milwaukee Bucks,108,Boston Celtics,100,Box Score,,18624,
7,"Wed, Oct 18, 2017",8:00p,New Orleans Pelicans,91,Memphis Grizzlies,103,Box Score,,17794,
8,"Wed, Oct 18, 2017",8:30p,Atlanta Hawks,117,Dallas Mavericks,111,Box Score,,19709,
9,"Wed, Oct 18, 2017",9:00p,Denver Nuggets,96,Utah Jazz,106,Box Score,,17588,


In [10]:
#games_1 = games_1.drop('Unnamed: 0', axis=1)  
#games_1.head()

In [11]:
import chardet

In [12]:
def find_encoding(fname):
    """Guess the character encoding of a file.

    The whole file is read as raw bytes and passed to ``chardet.detect``.

    Args:
        fname: Path of the file to inspect.

    Returns:
        The value stored under the ``'encoding'`` key of the detection result.
    """
    # A context manager closes the handle as soon as the bytes are read;
    # the previous open(...).read() left the file object open.
    with open(fname, 'rb') as r_file:
        raw_bytes = r_file.read()
    return chardet.detect(raw_bytes)['encoding']

In [13]:
my_encoding1 = find_encoding('../data/nav_2018.csv')

In [14]:
df_1 = pd.read_csv('../data/nav_2018.csv', encoding=my_encoding1)  # 영문은 기본적으로 문제가 없다. 
df_1.head(7)

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 6,Unnamed: 7,Attend.,Notes
0,"Tue, Oct 17, 2017",8:01p,Boston Celtics,99,Cleveland Cavaliers,102,Box Score,,20562,
1,"Tue, Oct 17, 2017",10:30p,Houston Rockets,122,Golden State Warriors,121,Box Score,,19596,
2,"Wed, Oct 18, 2017",7:00p,Charlotte Hornets,90,Detroit Pistons,102,Box Score,,20491,
3,"Wed, Oct 18, 2017",7:00p,Brooklyn Nets,131,Indiana Pacers,140,Box Score,,15008,
4,"Wed, Oct 18, 2017",7:00p,Miami Heat,109,Orlando Magic,116,Box Score,,18846,
5,"Wed, Oct 18, 2017",7:00p,Philadelphia 76ers,115,Washington Wizards,120,Box Score,,20356,
6,"Wed, Oct 18, 2017",7:30p,Milwaukee Bucks,108,Boston Celtics,100,Box Score,,18624,


In [15]:
my_encoding1    # 교재와 다르게 UTF-8-SIG로 되어 있는 것을 확인할 수가 있다. 

'UTF-8-SIG'

In [16]:
games.info()    # 자료의  Summary 정보를 조회한다.   
                              # R에서는 View(games)  OR summary(games)  이렇게 한다. 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Date             104 non-null    object 
 1   Start (ET)       104 non-null    object 
 2   Visitor/Neutral  104 non-null    object 
 3   PTS              104 non-null    int64  
 4   Home/Neutral     104 non-null    object 
 5   PTS.1            104 non-null    int64  
 6   Unnamed: 6       104 non-null    object 
 7   Unnamed: 7       2 non-null      object 
 8   Attend.          104 non-null    int64  
 9   Notes            0 non-null      float64
dtypes: float64(1), int64(3), object(6)
memory usage: 8.2+ KB


In [17]:
games.shape    # 104 X 10 Matrix 

(104, 10)

In [18]:
games.head()

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 6,Unnamed: 7,Attend.,Notes
0,"Tue, Oct 17, 2017",8:01p,Boston Celtics,99,Cleveland Cavaliers,102,Box Score,,20562,
1,"Tue, Oct 17, 2017",10:30p,Houston Rockets,122,Golden State Warriors,121,Box Score,,19596,
2,"Wed, Oct 18, 2017",7:00p,Charlotte Hornets,90,Detroit Pistons,102,Box Score,,20491,
3,"Wed, Oct 18, 2017",7:00p,Brooklyn Nets,131,Indiana Pacers,140,Box Score,,15008,
4,"Wed, Oct 18, 2017",7:00p,Miami Heat,109,Orlando Magic,116,Box Score,,18846,


In [19]:
games['Attend.'].head()   # 결과값을 Series로 조회 

0    20562
1    19596
2    20491
3    15008
4    18846
Name: Attend., dtype: int64

In [20]:
games[['Attend.']].head()   # 결과값을 무조건 dataframe으로 조회 

Unnamed: 0,Attend.
0,20562
1,19596
2,20491
3,15008
4,18846


## 예제 10-2 데이터 자료형 및 이름 맞추기

- 전처리 작업을 시작한다. 

In [21]:
games.columns    # 컬럼의 Head정보를 검색 

Index(['Date', 'Start (ET)', 'Visitor/Neutral', 'PTS', 'Home/Neutral', 'PTS.1',
       'Unnamed: 6', 'Unnamed: 7', 'Attend.', 'Notes'],
      dtype='object')

In [22]:
games.columns[6]  # index가 [0]부터 출발하므로 실제적으로 7번째 컬럼이 된다.  

'Unnamed: 6'

In [23]:
games.columns[7]

'Unnamed: 7'

In [24]:
#column_names = {'Date': '경기일자', 'Start (ET)': '시작시간',
#                'Visitor/Neutral': '방문팀', 
#                'PTS': '방문팀점수', 'Home/Neutral': '홈팀',
#                'PTS.1': '홈팀점수', '\xa0' : 'Box', 'Attend.':'관중수','':'n_ot'}

In [25]:
# Mapping from the scraped column labels (as listed by games.columns) to the
# Korean names used in the rest of the notebook.
column_names = {
    'Date': '경기일자',
    'Start (ET)': '시작시간',
    'Visitor/Neutral': '방문팀',
    'PTS': '방문팀점수',
    'Home/Neutral': '홈팀',
    'PTS.1': '홈팀점수',
    'Unnamed: 6': 'Box',
    'Attend.': '관중수',
    'Unnamed: 7': 'n_ot',
}

In [26]:
games.rename(columns=column_names).head()            # 컬럼  Head를 Rename 

Unnamed: 0,경기일자,시작시간,방문팀,방문팀점수,홈팀,홈팀점수,Box,n_ot,관중수,Notes
0,"Tue, Oct 17, 2017",8:01p,Boston Celtics,99,Cleveland Cavaliers,102,Box Score,,20562,
1,"Tue, Oct 17, 2017",10:30p,Houston Rockets,122,Golden State Warriors,121,Box Score,,19596,
2,"Wed, Oct 18, 2017",7:00p,Charlotte Hornets,90,Detroit Pistons,102,Box Score,,20491,
3,"Wed, Oct 18, 2017",7:00p,Brooklyn Nets,131,Indiana Pacers,140,Box Score,,15008,
4,"Wed, Oct 18, 2017",7:00p,Miami Heat,109,Orlando Magic,116,Box Score,,18846,


In [27]:
games = games.rename(columns=column_names)
games.head()

Unnamed: 0,경기일자,시작시간,방문팀,방문팀점수,홈팀,홈팀점수,Box,n_ot,관중수,Notes
0,"Tue, Oct 17, 2017",8:01p,Boston Celtics,99,Cleveland Cavaliers,102,Box Score,,20562,
1,"Tue, Oct 17, 2017",10:30p,Houston Rockets,122,Golden State Warriors,121,Box Score,,19596,
2,"Wed, Oct 18, 2017",7:00p,Charlotte Hornets,90,Detroit Pistons,102,Box Score,,20491,
3,"Wed, Oct 18, 2017",7:00p,Brooklyn Nets,131,Indiana Pacers,140,Box Score,,15008,
4,"Wed, Oct 18, 2017",7:00p,Miami Heat,109,Orlando Magic,116,Box Score,,18846,


In [28]:
games.columns   # 컬럼 이름이 잘 바뀌었는 지를 확인 

Index(['경기일자', '시작시간', '방문팀', '방문팀점수', '홈팀', '홈팀점수', 'Box', 'n_ot', '관중수',
       'Notes'],
      dtype='object')

In [29]:
games.head()    # 실제 데이터랑 잘 매핑되어 있는 지를 확인 

Unnamed: 0,경기일자,시작시간,방문팀,방문팀점수,홈팀,홈팀점수,Box,n_ot,관중수,Notes
0,"Tue, Oct 17, 2017",8:01p,Boston Celtics,99,Cleveland Cavaliers,102,Box Score,,20562,
1,"Tue, Oct 17, 2017",10:30p,Houston Rockets,122,Golden State Warriors,121,Box Score,,19596,
2,"Wed, Oct 18, 2017",7:00p,Charlotte Hornets,90,Detroit Pistons,102,Box Score,,20491,
3,"Wed, Oct 18, 2017",7:00p,Brooklyn Nets,131,Indiana Pacers,140,Box Score,,15008,
4,"Wed, Oct 18, 2017",7:00p,Miami Heat,109,Orlando Magic,116,Box Score,,18846,


In [30]:
games['경기일자'].dtype

dtype('O')

- datetime 기본 Format 
- 관련 문서 : https://docs.python.org/ko/3/library/datetime.html  
  - %a : 요일의 축약 이름 (예: Tue)
  - %b : 월의 축약 이름 (예: Oct)
  - %d : 0으로 채운 두 자리 일 (01~31)
  - %c : 로케일에 맞는 날짜와 시간 표현
  - %Y : 4자리 연도 (YYYY)

In [31]:
a_g = games.assign(경기일자=lambda x: pd.to_datetime(x['경기일자'], format='%a, %b %d, %Y'))

In [32]:
a_g.head()

Unnamed: 0,경기일자,시작시간,방문팀,방문팀점수,홈팀,홈팀점수,Box,n_ot,관중수,Notes
0,2017-10-17,8:01p,Boston Celtics,99,Cleveland Cavaliers,102,Box Score,,20562,
1,2017-10-17,10:30p,Houston Rockets,122,Golden State Warriors,121,Box Score,,19596,
2,2017-10-18,7:00p,Charlotte Hornets,90,Detroit Pistons,102,Box Score,,20491,
3,2017-10-18,7:00p,Brooklyn Nets,131,Indiana Pacers,140,Box Score,,15008,
4,2017-10-18,7:00p,Miami Heat,109,Orlando Magic,116,Box Score,,18846,


In [33]:
games_ = a_g.copy()
games_.head()

Unnamed: 0,경기일자,시작시간,방문팀,방문팀점수,홈팀,홈팀점수,Box,n_ot,관중수,Notes
0,2017-10-17,8:01p,Boston Celtics,99,Cleveland Cavaliers,102,Box Score,,20562,
1,2017-10-17,10:30p,Houston Rockets,122,Golden State Warriors,121,Box Score,,19596,
2,2017-10-18,7:00p,Charlotte Hornets,90,Detroit Pistons,102,Box Score,,20491,
3,2017-10-18,7:00p,Brooklyn Nets,131,Indiana Pacers,140,Box Score,,15008,
4,2017-10-18,7:00p,Miami Heat,109,Orlando Magic,116,Box Score,,18846,


In [34]:
games_.shape

(104, 10)

In [35]:
games_.경기일자.dtype

dtype('<M8[ns]')

In [36]:
print(games_.경기일자.dtype)

datetime64[ns]


## 예제 10-3 초기값 및 결측데이터 정리하기

In [37]:
games_.isnull().sum()    # missing values per column; only n_ot and Notes contain NaN

경기일자       0
시작시간       0
방문팀        0
방문팀점수      0
홈팀         0
홈팀점수       0
Box        0
n_ot     102
관중수        0
Notes    104
dtype: int64

In [38]:
games_[games_['Notes'].notnull()]     # 잠시 보류 

Unnamed: 0,경기일자,시작시간,방문팀,방문팀점수,홈팀,홈팀점수,Box,n_ot,관중수,Notes


In [39]:
# Replace NaN in n_ot with the marker 'NOT'.  The result is assigned back to
# the column instead of calling fillna(inplace=True) on the selected column:
# the in-place form operates on an intermediate object and is not guaranteed
# to update games_ (pandas chained-assignment / Copy-on-Write rules).
games_['n_ot'] = games_['n_ot'].fillna('NOT')
games_.head()

Unnamed: 0,경기일자,시작시간,방문팀,방문팀점수,홈팀,홈팀점수,Box,n_ot,관중수,Notes
0,2017-10-17,8:01p,Boston Celtics,99,Cleveland Cavaliers,102,Box Score,NOT,20562,
1,2017-10-17,10:30p,Houston Rockets,122,Golden State Warriors,121,Box Score,NOT,19596,
2,2017-10-18,7:00p,Charlotte Hornets,90,Detroit Pistons,102,Box Score,NOT,20491,
3,2017-10-18,7:00p,Brooklyn Nets,131,Indiana Pacers,140,Box Score,NOT,15008,
4,2017-10-18,7:00p,Miami Heat,109,Orlando Magic,116,Box Score,NOT,18846,


In [40]:
games_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   경기일자    104 non-null    datetime64[ns]
 1   시작시간    104 non-null    object        
 2   방문팀     104 non-null    object        
 3   방문팀점수   104 non-null    int64         
 4   홈팀      104 non-null    object        
 5   홈팀점수    104 non-null    int64         
 6   Box     104 non-null    object        
 7   n_ot    104 non-null    object        
 8   관중수     104 non-null    int64         
 9   Notes   0 non-null      float64       
dtypes: datetime64[ns](1), float64(1), int64(3), object(5)
memory usage: 8.2+ KB


In [41]:
games_na = games_.dropna(axis=1)    # Notes 컬럼이 제거 : NaN으로 되어 있으므로 
games_na.head()

Unnamed: 0,경기일자,시작시간,방문팀,방문팀점수,홈팀,홈팀점수,Box,n_ot,관중수
0,2017-10-17,8:01p,Boston Celtics,99,Cleveland Cavaliers,102,Box Score,NOT,20562
1,2017-10-17,10:30p,Houston Rockets,122,Golden State Warriors,121,Box Score,NOT,19596
2,2017-10-18,7:00p,Charlotte Hornets,90,Detroit Pistons,102,Box Score,NOT,20491
3,2017-10-18,7:00p,Brooklyn Nets,131,Indiana Pacers,140,Box Score,NOT,15008
4,2017-10-18,7:00p,Miami Heat,109,Orlando Magic,116,Box Score,NOT,18846


In [42]:
games_na.head()

Unnamed: 0,경기일자,시작시간,방문팀,방문팀점수,홈팀,홈팀점수,Box,n_ot,관중수
0,2017-10-17,8:01p,Boston Celtics,99,Cleveland Cavaliers,102,Box Score,NOT,20562
1,2017-10-17,10:30p,Houston Rockets,122,Golden State Warriors,121,Box Score,NOT,19596
2,2017-10-18,7:00p,Charlotte Hornets,90,Detroit Pistons,102,Box Score,NOT,20491
3,2017-10-18,7:00p,Brooklyn Nets,131,Indiana Pacers,140,Box Score,NOT,15008
4,2017-10-18,7:00p,Miami Heat,109,Orlando Magic,116,Box Score,NOT,18846


In [43]:
games_na.shape

(104, 9)

## 예제 10-4 필요 열만 추출해서 정리하기

In [44]:
games_nae = games_na[['경기일자', '방문팀', '방문팀점수', '홈팀', '홈팀점수']]
games_nae.head()

Unnamed: 0,경기일자,방문팀,방문팀점수,홈팀,홈팀점수
0,2017-10-17,Boston Celtics,99,Cleveland Cavaliers,102
1,2017-10-17,Houston Rockets,122,Golden State Warriors,121
2,2017-10-18,Charlotte Hornets,90,Detroit Pistons,102
3,2017-10-18,Brooklyn Nets,131,Indiana Pacers,140
4,2017-10-18,Miami Heat,109,Orlando Magic,116


In [45]:
games_nae.columns

Index(['경기일자', '방문팀', '방문팀점수', '홈팀', '홈팀점수'], dtype='object')

In [46]:
# append=True keeps the existing integer index and adds 경기일자 as a second
# index level (the result is a MultiIndex), rather than replacing the index.
games_naes = games_nae.set_index('경기일자',append=True)

In [47]:
games_naes.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,방문팀,방문팀점수,홈팀,홈팀점수
Unnamed: 0_level_1,경기일자,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2017-10-17,Boston Celtics,99,Cleveland Cavaliers,102
1,2017-10-17,Houston Rockets,122,Golden State Warriors,121
2,2017-10-18,Charlotte Hornets,90,Detroit Pistons,102
3,2017-10-18,Brooklyn Nets,131,Indiana Pacers,140
4,2017-10-18,Miami Heat,109,Orlando Magic,116


In [48]:
games_naesr = games_naes.rename_axis(["게임","경기일자"])   # '',경기일자 -> 게임,경기일자으로 변환 

In [49]:
games_naesr.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,방문팀,방문팀점수,홈팀,홈팀점수
게임,경기일자,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2017-10-17,Boston Celtics,99,Cleveland Cavaliers,102
1,2017-10-17,Houston Rockets,122,Golden State Warriors,121
2,2017-10-18,Charlotte Hornets,90,Detroit Pistons,102
3,2017-10-18,Brooklyn Nets,131,Indiana Pacers,140
4,2017-10-18,Miami Heat,109,Orlando Magic,116


In [50]:
games_naesr = games_naesr.sort_index()

In [51]:
games_naesr.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,방문팀,방문팀점수,홈팀,홈팀점수
게임,경기일자,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2017-10-17,Boston Celtics,99,Cleveland Cavaliers,102
1,2017-10-17,Houston Rockets,122,Golden State Warriors,121
2,2017-10-18,Charlotte Hornets,90,Detroit Pistons,102
3,2017-10-18,Brooklyn Nets,131,Indiana Pacers,140
4,2017-10-18,Miami Heat,109,Orlando Magic,116


In [52]:
type(games_naesr.index)

pandas.core.indexes.multi.MultiIndex

In [53]:
type(games_naesr.columns)

pandas.core.indexes.base.Index

## 예제 10-5 데이터 작업 체이닝 처리하기

In [54]:
# Run the whole cleaning pipeline as a single method chain.  It starts from the
# raw scraped table (tables[0]) so that each step below really does its work;
# starting from games_na, which is already renamed, date-converted and trimmed,
# turned the rename / dropna / to_datetime steps into no-ops.
games_all = (tables[0].rename(columns=column_names)
    .dropna(thresh=4)                      # keep rows with at least 4 non-null values
    [['경기일자', '방문팀', '방문팀점수', '홈팀', '홈팀점수']]
    .assign(경기일자=lambda x: pd.to_datetime(x['경기일자'], format='%a, %b %d, %Y'))
    .set_index('경기일자', append=True)      # add the date as a second index level
    .rename_axis(["게임", "경기일자"])
    .sort_index())

In [55]:
games_all.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,방문팀,방문팀점수,홈팀,홈팀점수
게임,경기일자,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2017-10-17,Boston Celtics,99,Cleveland Cavaliers,102
1,2017-10-17,Houston Rockets,122,Golden State Warriors,121
2,2017-10-18,Charlotte Hornets,90,Detroit Pistons,102
3,2017-10-18,Brooklyn Nets,131,Indiana Pacers,140
4,2017-10-18,Miami Heat,109,Orlando Magic,116


In [56]:
games_all.index.levels[1]

DatetimeIndex(['2017-10-17', '2017-10-18', '2017-10-19', '2017-10-20',
               '2017-10-21', '2017-10-22', '2017-10-23', '2017-10-24',
               '2017-10-25', '2017-10-26', '2017-10-27', '2017-10-28',
               '2017-10-29', '2017-10-30', '2017-10-31'],
              dtype='datetime64[ns]', name='경기일자', freq=None)

## 예제 10-6 데이터 조정해서  확정하기

In [57]:
# Reshape to long form: each game contributes one row for its 방문팀 value and
# one row for its 홈팀 value, keyed by 게임 and 경기일자.
tidy = games_all.reset_index().melt(
    id_vars=['게임', '경기일자'],
    value_vars=['방문팀', '홈팀'],
    var_name='팀구분',
    value_name='팀',
)

In [58]:
tidy.head()

Unnamed: 0,게임,경기일자,팀구분,팀
0,0,2017-10-17,방문팀,Boston Celtics
1,1,2017-10-17,방문팀,Houston Rockets
2,2,2017-10-18,방문팀,Charlotte Hornets
3,3,2017-10-18,방문팀,Brooklyn Nets
4,4,2017-10-18,방문팀,Miami Heat


In [59]:
tidy.tail()

Unnamed: 0,게임,경기일자,팀구분,팀
203,99,2017-10-30,홈팀,Los Angeles Clippers
204,100,2017-10-31,홈팀,Indiana Pacers
205,101,2017-10-31,홈팀,Brooklyn Nets
206,102,2017-10-31,홈팀,Milwaukee Bucks
207,103,2017-10-31,홈팀,Los Angeles Lakers


In [60]:
tidy.경기일자.dtype

dtype('<M8[ns]')

In [61]:
# Boolean mask: rows whose date is exactly one day after the previous row of
# the same team.
# NOTE(review): tidy is still in melt order here (all 방문팀 rows, then all 홈팀
# rows), not in date order, so diff() does not compare chronologically
# consecutive games; the 휴식일 computation further down sorts by 경기일자 first.
x = (tidy.groupby('팀')['경기일자'].diff().dt.days == 1 )

In [62]:
tidy['경기일자'][x]

12    2017-10-18
28    2017-10-21
30    2017-10-21
32    2017-10-21
33    2017-10-21
34    2017-10-21
41    2017-10-23
59    2017-10-25
61    2017-10-25
77    2017-10-28
78    2017-10-28
90    2017-10-29
91    2017-10-30
93    2017-10-30
96    2017-10-30
137   2017-10-21
175   2017-10-27
Name: 경기일자, dtype: datetime64[ns]

In [63]:
# Day gap to the previous row of the same team, minus one.
# NOTE(review): computed on tidy without sorting by 경기일자, so the gaps follow
# melt order rather than date order.
ab = tidy.groupby('팀')['경기일자'].diff().dt.days -1

In [64]:
ab.shape

(208,)

In [65]:
ab.isnull().sum()

30

In [66]:
# Rest days: for each team, days between consecutive games (in date order)
# minus one.  A team's first game has no previous game and therefore gets NaN.
tidy['휴식일'] = (
    tidy.sort_values('경기일자')
        .groupby('팀')['경기일자']
        .diff()
        .dt.days
    - 1
)

In [67]:
tidy.isnull().sum()

게임       0
경기일자     0
팀구분      0
팀        0
휴식일     30
dtype: int64

In [68]:
# Initialise the missing 휴식일 values to 0.  The fill is restricted to that
# column so a NaN appearing in any other column is not silently turned into 0.
tidy = tidy.fillna({'휴식일': 0})

In [69]:
tidy.shape

(208, 5)

In [70]:
tidy.head()

Unnamed: 0,게임,경기일자,팀구분,팀,휴식일
0,0,2017-10-17,방문팀,Boston Celtics,0.0
1,1,2017-10-17,방문팀,Houston Rockets,0.0
2,2,2017-10-18,방문팀,Charlotte Hornets,0.0
3,3,2017-10-18,방문팀,Brooklyn Nets,0.0
4,4,2017-10-18,방문팀,Miami Heat,0.0


In [71]:
by_game = pd.pivot_table(tidy, values='휴식일',
                          index=['게임', '경기일자'],
                          columns='팀구분')

In [72]:
by_game.head()

Unnamed: 0_level_0,팀구분,방문팀,홈팀
게임,경기일자,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2017-10-17,0.0,0.0
1,2017-10-17,0.0,0.0
2,2017-10-18,0.0,0.0
3,2017-10-18,0.0,0.0
4,2017-10-18,0.0,0.0


In [73]:
by_game = by_game.rename(columns={'방문팀': '방문팀휴식일',
                              '홈팀': '홈팀휴식일'})

In [74]:
by_game.head()

Unnamed: 0_level_0,팀구분,방문팀휴식일,홈팀휴식일
게임,경기일자,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2017-10-17,0.0,0.0
1,2017-10-17,0.0,0.0
2,2017-10-18,0.0,0.0
3,2017-10-18,0.0,0.0
4,2017-10-18,0.0,0.0


In [75]:
by_game.shape

(104, 2)

In [76]:
games_all.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,방문팀,방문팀점수,홈팀,홈팀점수
게임,경기일자,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2017-10-17,Boston Celtics,99,Cleveland Cavaliers,102
1,2017-10-17,Houston Rockets,122,Golden State Warriors,121
2,2017-10-18,Charlotte Hornets,90,Detroit Pistons,102
3,2017-10-18,Brooklyn Nets,131,Indiana Pacers,140
4,2017-10-18,Miami Heat,109,Orlando Magic,116


In [77]:
df = pd.concat([games_all, by_game], axis=1)

In [78]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,방문팀,방문팀점수,홈팀,홈팀점수,방문팀휴식일,홈팀휴식일
게임,경기일자,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2017-10-17,Boston Celtics,99,Cleveland Cavaliers,102,0.0,0.0
1,2017-10-17,Houston Rockets,122,Golden State Warriors,121,0.0,0.0
2,2017-10-18,Charlotte Hornets,90,Detroit Pistons,102,0.0,0.0
3,2017-10-18,Brooklyn Nets,131,Indiana Pacers,140,0.0,0.0
4,2017-10-18,Miami Heat,109,Orlando Magic,116,0.0,0.0


In [79]:
df.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,방문팀,방문팀점수,홈팀,홈팀점수,방문팀휴식일,홈팀휴식일
게임,경기일자,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
99,2017-10-30,Golden State Warriors,141,Los Angeles Clippers,113,0.0,1.0
100,2017-10-31,Sacramento Kings,83,Indiana Pacers,101,1.0,1.0
101,2017-10-31,Phoenix Suns,122,Brooklyn Nets,114,2.0,1.0
102,2017-10-31,Oklahoma City Thunder,110,Milwaukee Bucks,91,2.0,1.0
103,2017-10-31,Detroit Pistons,93,Los Angeles Lakers,113,1.0,2.0


In [80]:
df.shape

(104, 6)

In [81]:
df.isnull().sum()    # every column reports 0 missing values

방문팀       0
방문팀점수     0
홈팀        0
홈팀점수      0
방문팀휴식일    0
홈팀휴식일     0
dtype: int64

In [82]:
# Same pivot + rename as above, written as one chain: 휴식일 per game, with one
# column for the visiting team and one for the home team.
by_game_ = (
    tidy.pivot_table(values='휴식일', index=['게임', '경기일자'], columns='팀구분')
        .rename(columns={'방문팀': '방문팀휴식일', '홈팀': '홈팀휴식일'})
)


In [83]:
df_ = pd.concat([games_all, by_game_], axis=1)

In [84]:
df_.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,방문팀,방문팀점수,홈팀,홈팀점수,방문팀휴식일,홈팀휴식일
게임,경기일자,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2017-10-17,Boston Celtics,99,Cleveland Cavaliers,102,0.0,0.0
1,2017-10-17,Houston Rockets,122,Golden State Warriors,121,0.0,0.0
2,2017-10-18,Charlotte Hornets,90,Detroit Pistons,102,0.0,0.0
3,2017-10-18,Brooklyn Nets,131,Indiana Pacers,140,0.0,0.0
4,2017-10-18,Miami Heat,109,Orlando Magic,116,0.0,0.0


In [85]:
df_.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,방문팀,방문팀점수,홈팀,홈팀점수,방문팀휴식일,홈팀휴식일
게임,경기일자,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
99,2017-10-30,Golden State Warriors,141,Los Angeles Clippers,113,0.0,1.0
100,2017-10-31,Sacramento Kings,83,Indiana Pacers,101,1.0,1.0
101,2017-10-31,Phoenix Suns,122,Brooklyn Nets,114,2.0,1.0
102,2017-10-31,Oklahoma City Thunder,110,Milwaukee Bucks,91,2.0,1.0
103,2017-10-31,Detroit Pistons,93,Los Angeles Lakers,113,1.0,2.0


In [86]:
# NOTE(review): index=False omits the (게임, 경기일자) index levels, so the saved
# file holds only the six data columns and the game date is lost; write
# df_.reset_index() instead if the game number and date must be kept.
df_.to_csv("../data/nav_2018_경기분석.csv",encoding='utf-8-sig',index=False)

In [87]:
df_ = pd.read_csv("../data/nav_2018_경기분석.csv",encoding='utf-8') 
df_.head(10)

Unnamed: 0,방문팀,방문팀점수,홈팀,홈팀점수,방문팀휴식일,홈팀휴식일
0,Boston Celtics,99,Cleveland Cavaliers,102,0.0,0.0
1,Houston Rockets,122,Golden State Warriors,121,0.0,0.0
2,Charlotte Hornets,90,Detroit Pistons,102,0.0,0.0
3,Brooklyn Nets,131,Indiana Pacers,140,0.0,0.0
4,Miami Heat,109,Orlando Magic,116,0.0,0.0
5,Philadelphia 76ers,115,Washington Wizards,120,0.0,0.0
6,Milwaukee Bucks,108,Boston Celtics,100,0.0,0.0
7,New Orleans Pelicans,91,Memphis Grizzlies,103,0.0,0.0
8,Atlanta Hawks,117,Dallas Mavericks,111,0.0,0.0
9,Denver Nuggets,96,Utah Jazz,106,0.0,0.0


In [88]:
df_.tail(10)

Unnamed: 0,방문팀,방문팀점수,홈팀,홈팀점수,방문팀휴식일,홈팀휴식일
94,Philadelphia 76ers,115,Houston Rockets,107,1.0,1.0
95,Charlotte Hornets,104,Memphis Grizzlies,99,0.0,1.0
96,Orlando Magic,115,New Orleans Pelicans,99,0.0,1.0
97,Dallas Mavericks,89,Utah Jazz,104,1.0,1.0
98,Toronto Raptors,99,Portland Trail Blazers,85,2.0,1.0
99,Golden State Warriors,141,Los Angeles Clippers,113,0.0,1.0
100,Sacramento Kings,83,Indiana Pacers,101,1.0,1.0
101,Phoenix Suns,122,Brooklyn Nets,114,2.0,1.0
102,Oklahoma City Thunder,110,Milwaukee Bucks,91,2.0,1.0
103,Detroit Pistons,93,Los Angeles Lakers,113,1.0,2.0
