<a href="https://colab.research.google.com/github/gmb408/OSSP-KBO-prediction/blob/version-0.1/Data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# 데이터 분석 모듈
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from datetime import datetime

In [25]:
# 저장해둔 데이터 불러오기
result = pd.read_csv("/content/statiz_origin.csv", index_col=0)
result2 = pd.read_csv("/content/statiz_origin2.csv", index_col=0)

In [26]:
#result2에만 존재하는 WPA 열 삭제 
result2.drop(['WPA'],axis=1,inplace=True)

In [27]:
#result1과 result2를 결합 1982~2022년도 데이터  인덱스 값 초기화
result = result.append(result2)

In [28]:
# 이름 분리 해서, '이름' column 추가
result['이름'] = result['선수'].str.findall("^[ㄱ-힣]+").apply(lambda x: x[0])
# 기존 '선수' column 에 있던 이름 정보 제거
result['선수'] = result.apply(lambda x: x['선수'].replace(x['이름'], ''), axis=1)

In [29]:
# '시즌' column 생성
result['시즌'] = result['선수'].apply(lambda x: x[:2])

# 기존 '선수' column 에 있던 시즌 정보 제거
result['선수'] = result.apply(lambda x: x['선수'].replace(x['시즌'],''), axis=1)

# 년도를 4자리 숫자로 바꾸어주기 ex) 98 -&gt; 1998, 15 -&gt; 2015
result['시즌'] = result['시즌'].apply(lambda x: int("20"+ x) if int(x) < 30 else int("19"+x))

In [30]:
# 포지션
position = ['1B', '2B', '3B', 'SS', 'C', 'RF', 'LF', 'CF', 'DH'] 

# 포지션 여부 확인
result['포지션여부'] = result['선수'].apply(lambda x : (np.isin(x[-2:], position) or np.isin(x[-1:], position))) 

# 포지션 분리
result['포지션'] = result['선수'].apply(lambda x: x[-2:] if x[-2:] in position else x[-1:])

#포지션이 기록 되어 있지 않으면, nan값 넣기 
result.loc[result['포지션여부']==False, '포지션'] = np.nan

In [31]:
# 팀 & 나이 column 생성
result['팀'] = result.apply(lambda x: x['선수'].replace(x['포지션'],'') if type(x['포지션'])==str else x['선수'], axis=1)
result['나이'] = result.apply(lambda x: x['시즌'] - int(x['생일'][:4]) + 1, axis=1)

In [32]:
# 선수별 고유 아이디 부여
info = result[['이름', '생일']].apply(lambda x: tuple(x), axis=1)

player_id = {}
for i,j in enumerate(info.unique()):
    player_id[j] = i
    
result['ID'] = result.apply(lambda x: player_id[tuple([x['이름'], x['생일']])], axis=1)

In [33]:
# numeric data로 변환
float_columns = result.iloc[:,1:27].columns
result[float_columns] = result[float_columns].astype(float).fillna(0)

In [34]:
# 필요한 features 선택
cols = ['ID', '이름', '생일', '팀', '시즌', '포지션', '나이'] + list(float_columns)
result = result[cols]

In [35]:
# 타석 100 미만의 선수 제거 적은 표본으로 이상치및 결측치 제거
result.drop(result.loc[result['타석']<50].index, inplace=True)
result

Unnamed: 0,ID,이름,생일,팀,시즌,포지션,나이,G,타석,타수,...,병살,희타,희비,타율,출루,장타,OPS,wOBA,wRC+,WAR+
0,0,이종범,1970-08-15,해,1994,SS,25,124.0,561.0,499.0,...,2.0,1.0,4.0,0.393,0.452,0.581,1.033,0.462,198.3,11.77
1,1,테임즈,1986-11-10,N,2015,1B,30,142.0,595.0,472.0,...,7.0,0.0,7.0,0.381,0.498,0.790,1.288,0.530,222.3,10.71
2,2,심정수,1975-05-05,현,2003,RF,29,133.0,601.0,460.0,...,14.0,0.0,8.0,0.335,0.478,0.720,1.197,0.498,210.7,10.19
3,0,이종범,1970-08-15,해,1997,SS,28,125.0,577.0,484.0,...,9.0,0.0,3.0,0.324,0.428,0.581,1.009,0.431,173.2,9.70
4,0,이종범,1970-08-15,해,1996,SS,27,113.0,525.0,449.0,...,4.0,0.0,2.0,0.332,0.425,0.566,0.991,0.440,184.6,9.52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1475,2187,임종찬,2001-09-28,한,2021,RF,21,42.0,146.0,132.0,...,2.0,1.0,0.0,0.152,0.228,0.212,0.440,0.217,18.7,-1.06
1476,1838,박찬호,1995-06-05,K,2020,SS,26,141.0,530.0,479.0,...,14.0,12.0,3.0,0.221,0.274,0.273,0.548,0.254,40.2,-1.14
1477,1993,정보근,1999-08-31,롯,2020,C,22,85.0,152.0,133.0,...,7.0,6.0,1.0,0.150,0.219,0.165,0.385,0.187,-3.9,-1.19
1478,1801,나종덕,1998-03-16,롯,2019,C,22,104.0,209.0,185.0,...,5.0,7.0,2.0,0.124,0.188,0.195,0.383,0.180,-4.2,-1.41


In [36]:
# 인덱스 값 초기화
result = result.reset_index(drop=True)

In [37]:
#결과값 저장해두기
result.to_csv("/content/statiz_pre.csv")
result

Unnamed: 0,ID,이름,생일,팀,시즌,포지션,나이,G,타석,타수,...,병살,희타,희비,타율,출루,장타,OPS,wOBA,wRC+,WAR+
0,0,이종범,1970-08-15,해,1994,SS,25,124.0,561.0,499.0,...,2.0,1.0,4.0,0.393,0.452,0.581,1.033,0.462,198.3,11.77
1,1,테임즈,1986-11-10,N,2015,1B,30,142.0,595.0,472.0,...,7.0,0.0,7.0,0.381,0.498,0.790,1.288,0.530,222.3,10.71
2,2,심정수,1975-05-05,현,2003,RF,29,133.0,601.0,460.0,...,14.0,0.0,8.0,0.335,0.478,0.720,1.197,0.498,210.7,10.19
3,0,이종범,1970-08-15,해,1997,SS,28,125.0,577.0,484.0,...,9.0,0.0,3.0,0.324,0.428,0.581,1.009,0.431,173.2,9.70
4,0,이종범,1970-08-15,해,1996,SS,27,113.0,525.0,449.0,...,4.0,0.0,2.0,0.332,0.425,0.566,0.991,0.440,184.6,9.52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5054,2187,임종찬,2001-09-28,한,2021,RF,21,42.0,146.0,132.0,...,2.0,1.0,0.0,0.152,0.228,0.212,0.440,0.217,18.7,-1.06
5055,1838,박찬호,1995-06-05,K,2020,SS,26,141.0,530.0,479.0,...,14.0,12.0,3.0,0.221,0.274,0.273,0.548,0.254,40.2,-1.14
5056,1993,정보근,1999-08-31,롯,2020,C,22,85.0,152.0,133.0,...,7.0,6.0,1.0,0.150,0.219,0.165,0.385,0.187,-3.9,-1.19
5057,1801,나종덕,1998-03-16,롯,2019,C,22,104.0,209.0,185.0,...,5.0,7.0,2.0,0.124,0.188,0.195,0.383,0.180,-4.2,-1.41
