In [1]:
import gc
import glob
import platform
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Default seaborn theme for every figure in this notebook.
# Reference: https://coldbrown.co.kr/2023/07/%ED%8C%8C%EC%9D%B4%EC%8D%AC-%EC%8B%A4%EC%A0%84%ED%8E%B8-08-seaborn-sns-set%EC%9D%84-%ED%86%B5%ED%95%B4-%EC%8A%A4%ED%83%80%EC%9D%BC-%EC%84%A4%EC%A0%95%ED%95%98%EA%B8%B0/
sns.set_theme()

# Matplotlib defaults. They are applied after the seaborn theme call because
# that call also sets font-related rcParams (see its `font` parameter).
# Pick a Korean-capable font by operating system instead of toggling comments
# by hand. Any other platform (e.g. Colab) keeps 'Malgun Gothic'; install a
# Korean font there and name it here if labels render as empty boxes.
korean_font_by_os = {'Windows': 'Malgun Gothic', 'Darwin': 'AppleGothic'}
plt.rcParams['font.family'] = korean_font_by_os.get(platform.system(), 'Malgun Gothic')
plt.rcParams['figure.figsize'] = 12, 6
plt.rcParams['font.size'] = 14
# Draw minus signs with the ASCII hyphen rather than the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False


# 복잡한 통계 처리를 위한 라이브러리
from scipy import stats

In [2]:
# NOTE(review): the three commented-out settings below are not referenced by
# any visible cell; presumably leftovers from another notebook. Confirm and remove.
# data_type = "train"
# month = "07"
# category = "잔액정보"

# Base data directory for a local run (relative to the notebook's working directory).
root_path = '../../data'

# Alternative base directory for Google Colab; uncomment to use it instead.
# root_path = '/content/drive/MyDrive/12조 파이널프로젝트/data'

# NOTE(review): unused raw-data folder setting, kept commented out.
#drive_folder = f'{root_path}/스탯티즈_Raw_data/'

In [3]:
# Load the preprocessed KBO FA pitcher table from the data directory.
# NOTE(review): the rendered output shows an 'Unnamed: 0' column, which suggests
# the CSV was saved together with its index. Consider index_col=0 if no later
# cell depends on that column.
fa_csv_path = f'{root_path}/KBO FA 투수 [2013-2024]_최종_전처리.csv'
df1 = pd.read_csv(fa_csv_path, encoding='utf-8')
df1

Unnamed: 0,FA년도,구단명,선수명,세부 포지션,투,구분,FA등급,플레이년도,나이,게임수,...,선발 이닝,구원 이닝,선발 WAR,구원 WAR,종합 WAR,연봉,비고1,잔류 여부,FA 계약 연수,FA 계약 총액
0,2011,롯데,강영식,RP,좌투,자격유지,A,2010,29.0,63.0,...,0.0,52.2,0.00,1.09,1.09,12000.0,고졸,잔류,0.0,0.0
1,2012,롯데,강영식,RP,좌투,자격유지,A,2011,30.0,64.0,...,0.0,41.0,0.00,1.26,1.26,30000.0,고졸,잔류,0.0,0.0
2,2013,롯데,강영식,RP,좌투,자격유지,A,2012,31.0,55.0,...,0.0,41.2,0.00,0.66,0.66,30000.0,고졸,잔류,0.0,0.0
3,2014,롯데,강영식,RP,좌투,자격유지,A,2013,32.0,55.0,...,0.0,39.2,0.00,0.49,0.49,30000.0,고졸,잔류,4.0,170000.0
4,2015,롯데,강영식,RP,좌투,FA 1년차,A,2014,33.0,52.0,...,0.0,47.0,0.00,1.18,1.18,30000.0,고졸,잔류,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,2019,NC,구창모,SP,좌투,다년계약,다년계약,2018,21.0,36.0,...,106.2,26.1,0.92,1.13,2.05,9000.0,다년계약,잔류,0.0,0.0
361,2020,NC,구창모,SP,좌투,다년계약,다년계약,2019,22.0,23.0,...,101.0,6.0,3.28,0.30,3.58,12500.0,다년계약,잔류,0.0,0.0
362,2021,NC,구창모,SP,좌투,다년계약,다년계약,2020,23.0,15.0,...,92.0,1.1,5.25,0.14,5.39,18000.0,다년계약,잔류,0.0,0.0
363,2023,NC,구창모,SP,좌투,다년계약,다년계약,2022,25.0,19.0,...,111.2,0.0,4.79,0.00,4.79,19000.0,다년계약,잔류,6.0,1250000.0


## '피장타율' 파생 컬럼
- 선수명, 플레이년도만 엑셀파일로 저장
- 수기로 피장타율 입력
- 다시 가져와서 선수명, 플레이년도로 merge

In [5]:
# Blank entry sheet for the hand-typed '피장타율' values: one row per unique
# (player name, season) pair, sorted so rows are easy to find while typing.
template_df = (
    df1[['선수명', '플레이년도']]
    .drop_duplicates()
    .sort_values(['선수명', '플레이년도'])
)
template_df['피장타율_입력'] = None  # empty column to be filled in by hand

# Build the output path from root_path so the sheet follows the data directory
# when it is switched (e.g. to the Colab location).
slg_template_path = Path(f'{root_path}/피장타율_입력템플릿.xlsx')

# Never overwrite an existing sheet: re-running this cell would otherwise wipe
# values that were already typed in. Delete the file to regenerate it.
if slg_template_path.exists():
    print(f'Kept existing file (not overwritten): {slg_template_path}')
else:
    template_df.to_excel(slg_template_path, index=False)

## '완투' / '완봉' 파생 컬럼

In [None]:
# Blank entry sheet for the hand-typed '완투' and '완봉' values: one row per
# unique (player name, season) pair, sorted so rows are easy to find.
template_df2 = (
    df1[['선수명', '플레이년도']]
    .drop_duplicates()
    .sort_values(['선수명', '플레이년도'])
)
template_df2['완투_입력'] = None  # empty column to be filled in by hand
template_df2['완봉_입력'] = None  # empty column to be filled in by hand

# Build the output path from root_path so the sheet follows the data directory
# when it is switched (e.g. to the Colab location).
cg_sho_template_path = Path(f'{root_path}/완투완봉_입력템플릿.xlsx')

# Never overwrite an existing sheet: re-running this cell would otherwise wipe
# values that were already typed in. Delete the file to regenerate it.
if cg_sho_template_path.exists():
    print(f'Kept existing file (not overwritten): {cg_sho_template_path}')
else:
    template_df2.to_excel(cg_sho_template_path, index=False)

## '사사구' 파생 컬럼
- 볼넷 + 사구

In [18]:
# Derived column '사사구': row-wise sum of the '볼넷허용' and '사구허용' columns.
# A missing value in either input stays NaN in the result.
df1['사사구'] = df1['볼넷허용'].add(df1['사구허용'])

In [22]:
# Display the derived '사사구' column to check the result.
df1['사사구']

0      25.0
1      24.0
2      19.0
3      21.0
4      26.0
       ... 
360    59.0
361    47.0
362    18.0
363    33.0
364    16.0
Name: 사사구, Length: 365, dtype: float64

## '원클럽맨' 파생 컬럼
- 같은 선수명이 이 데이터 전체에서 한 구단에만 등장하는 경우 (선수명 기준이므로 동명이인은 구분되지 않음)

In [25]:
# 선수명 기준으로 몇 개의 구단에서 뛰었는지 count
club_counts = df1.groupby('선수명')['구단명'].nunique()

# 원클럽맨 여부 판단: 구단 수가 1인 경우 True
df1['원클럽맨'] = df1['선수명'].map(lambda name: club_counts[name] == 1)

In [37]:
# Distribution of the one-club flag. The counts are rows of df1 (the same
# player appears in several rows), not distinct players.
df1['원클럽맨'].value_counts()

원클럽맨
True     189
False    176
Name: count, dtype: int64

## '투구수' 파생 컬럼
- 나중에 필요시, 이닝으로 나누면 '이닝당 투구수', 게임수로 나누면 '게임당 투구수'

In [44]:
# Blank entry sheet for the hand-typed '투구수' values: one row per unique
# (player name, season) pair, sorted so rows are easy to find while typing.
template_df3 = (
    df1[['선수명', '플레이년도']]
    .drop_duplicates()
    .sort_values(['선수명', '플레이년도'])
)
template_df3['투구수_입력'] = None  # empty column to be filled in by hand

# Build the output path from root_path so the sheet follows the data directory
# when it is switched (e.g. to the Colab location).
pitch_count_template_path = Path(f'{root_path}/투구수_입력템플릿.xlsx')

# Never overwrite an existing sheet: re-running this cell would otherwise wipe
# values that were already typed in. Delete the file to regenerate it.
if pitch_count_template_path.exists():
    print(f'Kept existing file (not overwritten): {pitch_count_template_path}')
else:
    template_df3.to_excel(pitch_count_template_path, index=False)