In [1]:
!pip install pandas==2.0.0
!pip install matplotlib
!pip install seaborn
!pip install --upgrade pip

Collecting pandas==2.0.0
  Using cached pandas-2.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting pytz>=2020.1 (from pandas==2.0.0)
  Using cached pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.1 (from pandas==2.0.0)
  Using cached tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting numpy>=1.21.0 (from pandas==2.0.0)
  Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached pandas-2.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
Using cached pytz-2024.1-py2.py3-none-any.whl (505 kB)
Using cached tzdata-2024.1-py2.py3-none-any.whl (345 kB)
Installing collected packages: pytz, tzdata, numpy, pandas
Successfully installed numpy-1.26.4 pandas-2.0.0 pytz-2024.1 tzdata-2024.1
Collecting matplotlib
  Downloading matplotlib-3.8.4-cp310-c

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 스크롤을 사용해 판다스의 모든 데이터를 볼 수 있도록 함
pd.set_option('display.max_columns', None)
# 한글 폰트 사용
plt.rc('font', family='NanumGothic')
# 판다스 출력 결과를 소수점 둘째 자리 까지로 제한
pd.options.display.float_format = '{:,.2f}'.format
%matplotlib inline

# http://www.statiz.co.kr/salary.php?opt=0&sopt=1997&cnv=&pos=&te=10

In [3]:
# 용어 변경
change_term = {
    # 타자 기록
    '2B': '2루타',
    '3B': '3루타',
    'AB': '타수',
    'AO': '뜬공',
    'AVG': '타율',
    'BB': '볼넷',
    'BB/K': '볼넷/삼진',
    'CS': '도루실패',
    'E': '실책',
    'G': '경기',
    'GDP': '병살타',
    'GO': '땅볼',
    'GO/AO': '땅볼/뜬공',
    'GPA': '(1.8x출루율+장타율)/4',
    'GW RBI': '결승타',
    'H': '안타',
    'HBP': '사구',
    'HR': '홈런',
    'IBB': '고의4구',
    'ISOP': '순수장타율',
    'MH': '멀티히트',
    'OBP': '출루율',
    'OPS': '출루율+장타율',
    'P/PA': '투구수/타석',
    'PA': '타석',
    'PH-BA': '대타타율',
    'R': '득점',
    'RBI': '타점',
    'RISP': '득점권타율',
    'SAC': '희생번트',
    'SB': '도루',
    'SF': '희생플라이',
    'SLG': '장타율',
    'SO': '삼진',
    'TB': '루타',
    'XBH': '장타',
    'XR': '추정득점',

    # 투수 기록
    '2B_pitcher': '2루타',
    '3B_pitcher': '3루타',
    'AO_pitcher': '뜬공',
    'AVG_pitcher': '피안타율',
    'BABIP': '인플레이타구타율',
    'BB_pitcher': '볼넷',
    'BB/9': '9이닝당 볼넷',
    'BK': '보크',
    'BSV': '블론세이브',
    'CG': '완투',
    'ER': '자책점',
    'ERA': '평균자책점',
    'G_pitcher': '경기',
    'GDP_pitcher': '병살타',
    'GF': '종료',
    'GO_pitcher': '땅볼',
    'GO/AO_pitcher': '땅볼/뜬공',
    'GS': '선발',
    'H_pitcher': '피안타',
    'HBP_pitcher': '사구',
    'HLD': '홀드',
    'HR_pitcher': '홈런',
    'IBB_pitcher': '고의4구',
    'IP': '이닝',
    'K/9': '9이닝당 삼진',
    'K/BB': '삼진/볼넷',
    'L': '패',
    'NP': '투구수',
    'OBP_pitcher': '피출루율',
    'OPS_pitcher': '피출루율+피장타율',
    'P/G': '투구수/경기',
    'P/IP': '투구수/이닝',
    'QS': '퀄리티스타트',
    'R_pitcher': '실점',
    'SAC_pitcher': '희생번트',
    'SF_pitcher': '희생플라이',
    'SHO': '완봉',
    'SLG_pitcher': '피장타율',
    'SO_pitcher': '삼진',
    'SV': '세이브',
    'SVO': '세이브기회',
    'TBF': '타자수',
    'TS': '터프세이브',
    'W': '승',
    'Wgr': '구원승',
    'Wgs': '선발승',
    'WHIP': '이닝당 출루허용률',
    'WP': '폭투',
    'WPCT': '승률',

    # 수비 기록
    'A': '어시스트',
    'CS_defense': '도루저지',
    'CS%': '도루저지율',
    'DP': '병살',
    'E_defense': '실책',
    'FPCT': '수비율',
    'G_defense': '경기',
    'GS_defense': '선발경기',
    'PB': '포일',
    'PKO': '견제사',
    'PO': '풋아웃',
    'POS': '포지션',

    # 주루 기록
    'CS_base': '도루실패',
    'G_base': '경기',
    'OOB': '주루사',
    'PKO_base': '견제사',
    'SB_base': '도루',
    'SB%': '도루성공률',
    'SBA': '도루시도'
}


In [28]:
df1 = pd.read_csv('./data/2002_2023_야구데이터_2.csv', index_col=0).drop(['순위'], axis=1)
df1.head()

Unnamed: 0,선수명,팀명,AVG,G,PA,AB,H,2B,3B,HR,RBI,년도,경기타입
0,최형우,삼성,0.4,4,6,5,2,2,0,0,0,2002,KBO 정규시즌
1,마해영,삼성,0.323,133,596,532,172,40,2,33,116,2002,KBO 정규시즌
2,이승엽,삼성,0.323,133,617,511,165,42,2,47,126,2002,KBO 정규시즌
3,김한수,삼성,0.311,127,541,486,151,23,1,17,76,2002,KBO 정규시즌
4,강동우,삼성,0.288,130,521,466,134,23,3,9,49,2002,KBO 정규시즌


In [29]:
df2 = pd.read_csv('./data/2002_2023_야구데이터_3.csv', index_col=0).drop(['순위'], axis=1)
df2.head()

Unnamed: 0,선수명,팀명,AVG,BB,IBB,HBP,SO,GDP,SLG,OBP,OPS,MH,RISP,PH-BA,년도,경기타입
0,최형우,삼성,0.4,0,0,0,0,0,0.8,0.4,1.2,0,0.0,0.0,2002,KBO 정규시즌
1,마해영,삼성,0.323,46,5,12,74,14,0.592,0.386,0.978,53,0.31,0.0,2002,KBO 정규시즌
2,이승엽,삼성,0.323,89,5,15,109,11,0.689,0.436,1.125,47,0.33,0.0,2002,KBO 정규시즌
3,김한수,삼성,0.311,35,2,14,61,15,0.467,0.371,0.838,44,0.29,0.0,2002,KBO 정규시즌
4,강동우,삼성,0.288,46,0,1,76,9,0.408,0.352,0.76,36,0.33,0.0,2002,KBO 정규시즌


In [37]:
merge_df = pd.merge(df1, df2, how='left', on=['선수명', '팀명', '년도', '경기타입', 'AVG'])
merge_df = merge_df[['년도', '경기타입', '선수명', '팀명', 'AVG', 'G', 'PA', 'AB', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'IBB', 'HBP', 'SO', 'GDP', 'SLG', 'OBP', 'OPS', 'MH', 'RISP', 'PH-BA']]
merge_df.head()

Unnamed: 0,년도,경기타입,선수명,팀명,AVG,G,PA,AB,H,2B,3B,HR,RBI,BB,IBB,HBP,SO,GDP,SLG,OBP,OPS,MH,RISP,PH-BA
0,2002,KBO 정규시즌,최형우,삼성,0.4,4,6,5,2,2,0,0,0,0.0,0.0,0.0,0.0,0.0,0.8,0.4,1.2,0.0,0.0,0.0
1,2002,KBO 정규시즌,마해영,삼성,0.323,133,596,532,172,40,2,33,116,46.0,5.0,12.0,74.0,14.0,0.592,0.386,0.978,53.0,0.31,0.0
2,2002,KBO 정규시즌,이승엽,삼성,0.323,133,617,511,165,42,2,47,126,89.0,5.0,15.0,109.0,11.0,0.689,0.436,1.125,47.0,0.33,0.0
3,2002,KBO 정규시즌,김한수,삼성,0.311,127,541,486,151,23,1,17,76,35.0,2.0,14.0,61.0,15.0,0.467,0.371,0.838,44.0,0.29,0.0
4,2002,KBO 정규시즌,강동우,삼성,0.288,130,521,466,134,23,3,9,49,46.0,0.0,1.0,76.0,9.0,0.408,0.352,0.76,36.0,0.33,0.0


In [12]:
merge_df.drop(['순위_y', 'AVG_y', '경기타입_y'])

KeyError: "['순위_y', 'AVG_y', '경기타입_y'] not found in axis"