In [1]:
import json
import glob
import os.path
import pandas as pd
import numpy as np

def getDF(path):
    """Load one monthly schedule JSON file into a DataFrame.

    The file is expected to hold ``{"rows": [{"row": [{"Text": ...}, ...]}, ...]}``.
    Each ``row`` becomes one DataFrame row built from the ``Text`` value of its
    cells.

    Parameters
    ----------
    path : str
        Path of the JSON file. The year is taken from the file name, which is
        expected to end in ``_<year>_<MM>.json``.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when the file contains no rows. Otherwise a frame whose
        column 0 holds the year (as a string) and whose columns 1..9 hold the
        cell texts. Rows that have only 8 cells get NaN in column 1 and their
        cells in columns 2..9, so they line up with the 9-cell rows.
    """
    # Context manager so the file handle is closed even if parsing fails.
    with open(path, 'r', encoding='utf-8-sig') as fp:
        data = json.load(fp)['rows']

    if len(data) < 1:
        print(f'>>> json File : \"{path}\" is Data Not Found...')
        return None

    # Collect one Series per schedule row and build the frame once at the end
    # (DataFrame.append was removed in pandas 2.0 and was quadratic in a loop).
    rows = []
    for d in data:
        texts = pd.DataFrame(d['row'])['Text']
        if len(d['row']) == 8:
            # Cells occupy positions 0..7: add an empty slot 8 and shift every
            # value down by one so the missing leading cell becomes NaN.
            texts = texts.reindex(range(9)).shift(1)
        rows.append(texts)

    df = pd.DataFrame(rows).reset_index(drop=True)

    # Make room for a leading column: add a trailing placeholder column and
    # shift all columns one position to the right, then store the year there.
    df[9] = np.nan
    df = df.shift(1, axis=1)
    df[0] = path[:-8].split('_')[-1]  # year token from "..._<year>_<MM>.json"

    print(f'>>> json File : \"{path}\" is Load Complete!!!')
    return df

def getjsonFilePath(parDirPath, yearlist):
    """Build the candidate monthly JSON file paths for the given years.

    Example::

        getjsonFilePath('./teamRawData_json/data', ['2018', '2019'])
        # -> ['./teamRawData_json/data/0,9_2018_01.json', ...,
        #     './teamRawData_json/data/0,9_2019_12.json']

    Parameters
    ----------
    parDirPath : str
        Directory the file names are joined onto.
    yearlist : iterable
        Years to cover; duplicates are ignored.

    Returns
    -------
    list of str
        One path per (year, month) pair, named ``0,9_<year>_<MM>.json``.
        Years appear in the order they were first given, months run 01..12.
        The paths are only generated; no check is made that the files exist.
    """
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is the same on every run (set iteration order is not).
    unique_years = list(dict.fromkeys(yearlist))
    months = [f'{month:02d}' for month in range(1, 13)]
    return [
        os.path.join(parDirPath, f'0,9_{y}_{m}.json')
        for y in unique_years
        for m in months
    ]

def exportCSV(df, filename):
    """Write ``df`` to ``filename`` as CSV.

    The row index is not written and the file is encoded as ``utf-8-sig``
    (UTF-8 with a leading byte-order mark).
    """
    csv_options = {'encoding': 'utf-8-sig', 'index': False}
    df.to_csv(filename, **csv_options)

In [212]:

parDirPath = r'teamScheduleRawData_json'
yearlist = ['2018','2019','2020','2021']

# Candidate file paths for every (year, month), in sorted order.
pathList = getjsonFilePath(parDirPath, yearlist)
pathList.sort()

# Load each month and keep only the files that contained rows (getDF returns
# None for empty files). The frames are concatenated once at the end because
# DataFrame.append was removed in pandas 2.0 and is quadratic inside a loop.
frames = []
for path in pathList:
    tmp = getDF(path)
    if tmp is not None:
        frames.append(tmp)

# NOTE: pd.concat raises ValueError if no file contained any rows.
df_rawData = pd.concat(frames, ignore_index=True)

df_rawData.columns = ['연도','날짜','시간','경기','게임센터','하이라이트','TV','라디오','구장','비고']
# df_rawData # raw data: only the year column added and the columns shifted
# exportCSV(df_rawData, 'ScheduleList_rawdata_v1.csv')

## 2018 : 762
## 2019 : 781
## 2020 : 788
## 2021 : 856


>>> json File : "teamScheduleRawData_json\0,9_2018_01.json" is Data Not Found...
>>> json File : "teamScheduleRawData_json\0,9_2018_02.json" is Data Not Found...
>>> json File : "teamScheduleRawData_json\0,9_2018_03.json" is Load Complete!!!
>>> json File : "teamScheduleRawData_json\0,9_2018_04.json" is Load Complete!!!
>>> json File : "teamScheduleRawData_json\0,9_2018_05.json" is Load Complete!!!
>>> json File : "teamScheduleRawData_json\0,9_2018_06.json" is Load Complete!!!
>>> json File : "teamScheduleRawData_json\0,9_2018_07.json" is Load Complete!!!
>>> json File : "teamScheduleRawData_json\0,9_2018_08.json" is Load Complete!!!
>>> json File : "teamScheduleRawData_json\0,9_2018_09.json" is Load Complete!!!
>>> json File : "teamScheduleRawData_json\0,9_2018_10.json" is Load Complete!!!
>>> json File : "teamScheduleRawData_json\0,9_2018_11.json" is Data Not Found...
>>> json File : "teamScheduleRawData_json\0,9_2018_12.json" is Data Not Found...
>>> json File : "teamScheduleRawData

In [215]:
# Remove the bold markup from every cell of the frame.
df_rawData = (
    df_rawData
    .replace('<b>', '', regex=True)
    .replace('</b>', '', regex=True)
)

# Turn the match-up markup into "away,away_score,home_score,home":
# 'vs' and the <em> tags become separators, the <span> wrappers are dropped.
# The replacements are literal (regex=False) and applied in this order.
matchup = df_rawData['경기'].str.strip()
for markup, replacement in [
    ('vs', ','),
    ('<em>', ','),
    ('</em>', ','),
    ('<span class="lose">', ''),
    ('<span class="win">', ''),
    ('<span class="same">', ''),
    ('<span>', ''),
    ('</span>', ''),
]:
    matchup = matchup.str.replace(markup, replacement, regex=False)
df_rawData['경기'] = matchup

# Assign the results back instead of calling inplace=True on a selected
# column: that chained pattern does not update the frame under pandas
# Copy-on-Write.
for channel_col in ('TV', '라디오'):
    # Broadcasters are separated by <br />; use a comma instead.
    df_rawData[channel_col] = df_rawData[channel_col].replace('<br />', ',', regex=True)

# Rows without their own date inherit the date of the row above.
# TODO: verify the filled dates against the date embedded in the 게임센터 link.
df_rawData['날짜'] = df_rawData['날짜'].ffill()

# The game id is the value of the "gameId=" query parameter of the link.
df_rawData['gameId'] = df_rawData['게임센터'].apply(
    lambda link: str(link).split('gameId=')[-1].split('&')[0]
)

# Split the cleaned match-up into its four fields. Missing or short values
# yield NaN here instead of raising.
fields = df_rawData['경기'].str.split(',')
df_rawData['away'] = fields.str[0]
df_rawData['away_score'] = fields.str[1]
df_rawData['home'] = fields.str[3]
df_rawData['home_score'] = fields.str[2]
exportCSV(df_rawData, 'ScheduleList_rawdata_v3.csv')
df_rawData

Unnamed: 0,연도,날짜,시간,경기,게임센터,하이라이트,TV,라디오,구장,비고,gameId,away,away_score,home,home_score
0,2018,03.24(토),14:00,"LG,2,4,NC",<a href='/Schedule/GameCenter/Main.aspx?gameDa...,<a href='/Schedule/GameCenter/Main.aspx?gameDa...,"KN-T,SPO-T","KNN-R,GM-R",마산,-,20180324LGNC0,LG,2,NC,4
1,2018,03.24(토),14:00,"삼성,6,3,두산",<a href='/Schedule/GameCenter/Main.aspx?gameDa...,<a href='/Schedule/GameCenter/Main.aspx?gameDa...,S-T,"T-R,DM-R",잠실,-,20180324SSOB0,삼성,6,두산,3
2,2018,03.24(토),14:00,"롯데,5,6,SK",<a href='/Schedule/GameCenter/Main.aspx?gameDa...,<a href='/Schedule/GameCenter/Main.aspx?gameDa...,M-T,"KNN-R,PM-R",문학,-,20180324LTSK0,롯데,5,SK,6
3,2018,03.24(토),14:00,"KT,5,4,KIA",<a href='/Schedule/GameCenter/Main.aspx?gameDa...,<a href='/Schedule/GameCenter/Main.aspx?gameDa...,K-2T,KBC-R,광주,-,20180324KTHT0,KT,5,KIA,4
4,2018,03.24(토),14:00,"한화,3,6,넥센",<a href='/Schedule/GameCenter/Main.aspx?gameDa...,<a href='/Schedule/GameCenter/Main.aspx?gameDa...,"MS-T,SPO-2T",S-R,고척,-,20180324HHWO0,한화,3,넥센,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3182,2021,10.30(토),17:00,"LG,2,4,롯데",<a href='/Schedule/GameCenter/Main.aspx?gameDa...,<a href='/Schedule/GameCenter/Main.aspx?gameDa...,SPO-2T,KNN-R,사직,-,20211030LGLT0,LG,2,롯데,4
3183,2021,10.30(토),17:00,"삼성,11,5,NC",<a href='/Schedule/GameCenter/Main.aspx?gameDa...,<a href='/Schedule/GameCenter/Main.aspx?gameDa...,SS-T,T-R,창원,-,20211030SSNC0,삼성,11,NC,5
3184,2021,10.30(토),17:00,"키움,6,1,KIA",<a href='/Schedule/GameCenter/Main.aspx?gameDa...,<a href='/Schedule/GameCenter/Main.aspx?gameDa...,"KN-T,G-CMB",,광주,-,20211030WOHT0,키움,6,KIA,1
3185,2021,10.30(토),17:00,"두산,5,3,한화",<a href='/Schedule/GameCenter/Main.aspx?gameDa...,<a href='/Schedule/GameCenter/Main.aspx?gameDa...,"MS-T,D-CMB",,대전,-,20211030OBHH0,두산,5,한화,3


In [17]:

## 연월일 시간 합체. dtype datetime으로 변경
## 1) 날짜에서 괄호를 포함한 요일 지우고
# tmp['날짜'] = tmp['날짜'].apply(lambda x: x if x.find('(') == -1 else x[:x.find('(')]) ## 요일 삭제
## 2) 연도로 모두 통합
# tmp['연도'] = tmp['연도'] + '.' + tmp['날짜'] + ' ' + tmp['시간']
## 3) 연도 컬럼의 타입을 datetime으로 변경
# tmp['연도'] = pd.to_datetime(tmp['연도'])

# tmp['비고'].unique() # >> array(['-', '우천취소', '미세먼지취소', '그라운드사정', '강풍취소', '기타'], dtype=object)
# tmp[tmp['비고'] != '-'] ## 취소경기 304건 조회. 모두 review 미존재

## 필터
# con1 = tmp['비고'] == '-'
# con2 = ~tmp['경기'].str.contains('드림|나눔')
# gameidList = list(tmp.loc[con1 & con2]['gameId'])

(24, 1)

In [767]:
# Attach a description column ('뜻') to the frame returned by get_Gameinfo.
# The list is positional: one entry per row, so its length (24) has to match
# the number of rows. The trailing comments are English glosses of the strings.
tmp = get_Gameinfo(scoreboardPath).assign(**{'뜻': [
    '포스트 시즌(4), 시범경기(1), 정규시즌(0)',  # postseason (4), exhibition (1), regular season (0)
    '게임의 고유 ID. 연월일 + 원정팀 구단코드(2자리) + 홈팀 구단코드(2자리) + 0',  # unique game id: date + away code + home code + 0
    '경기날짜(연월일)',  # game date (year-month-day)
    '연도',  # year
    '홈팀 이름',  # home team name
    '홈팀 ID(2자리)',  # home team id (2 characters)
    '원정팀 이름',  # away team name
    '원정팀 ID(2자리)',  # away team id (2 characters)
    '경기장',  # stadium
    '관객수',  # attendance
    '홈팀 전적(승)',  # home team record (wins)
    '홈팀 전적(패)',  # home team record (losses)
    '홈팀 전적(무)',  # home team record (draws)
    '원정팀 전적(승)',  # away team record (wins)
    '원정팀 전적(패)',  # away team record (losses)
    '원정팀 전적(무)',  # away team record (draws)
    '현재 경기의 원정팀 최종 점수',  # away team's final score in this game
    '현재 경기의 홈팀 최종 점수',  # home team's final score in this game
    '경기 시작 시간',  # game start time
    '경기 종료 시간',  # game end time
    '경기 진행 시간',  # game duration
    '홈팀 전체 이름',  # home team full name
    '원정팀 전체 이름',  # away team full name
    '최대 이닝수',  # maximum number of innings
]})
tmp

Unnamed: 0,0,뜻
SR_ID,0,"포스트 시즌(4), 시범경기(1), 정규시즌(0)"
G_ID,20210404HTOB0,게임의 고유 ID. 연월일 + 원정팀 구단코드(2자리) + 홈팀 구단코드(2자리) + 0
G_DT,2021-04-04,경기날짜(연월일)
SEASON_ID,2021,연도
HOME_NM,두산,홈팀 이름
HOME_ID,OB,홈팀 ID(2자리)
AWAY_NM,KIA,원정팀 이름
AWAY_ID,HT,원정팀 ID(2자리)
S_NM,잠실,경기장
CROWD_CN,2410,관객수


In [22]:
# Reload the v3 export; the year column is cast to str so it can be joined
# with the date text.
df = pd.read_csv("ScheduleList_rawdata_v3.csv").astype({'연도': 'str'})

# '연도' becomes "<year>.<MM.DD>": join year and date, then drop the last
# three characters (the parenthesised weekday, e.g. "(토)").
df['연도'] = (df['연도'] + '.' + df['날짜']).map(lambda stamp: stamp[:-3])

# '날짜' keeps only the weekday character found between the parentheses.
df['날짜'] = df['날짜'].map(lambda stamp: stamp[6:-1])

exportCSV(df, 'ScheduleList_rawdata_v4.csv')
df

Unnamed: 0,연도,날짜,시간,경기,게임센터,하이라이트,TV,라디오,구장,비고,gameId,away,away_score,home,home_score
0,2018.03.24,토,14:00,"LG,2,4,NC",<a href='/Schedule/GameCenter/Main.aspx?gameDa...,<a href='/Schedule/GameCenter/Main.aspx?gameDa...,"KN-T,SPO-T","KNN-R,GM-R",마산,-,20180324LGNC0,LG,2.0,NC,4.0
1,2018.03.24,토,14:00,"삼성,6,3,두산",<a href='/Schedule/GameCenter/Main.aspx?gameDa...,<a href='/Schedule/GameCenter/Main.aspx?gameDa...,S-T,"T-R,DM-R",잠실,-,20180324SSOB0,삼성,6.0,두산,3.0
2,2018.03.24,토,14:00,"롯데,5,6,SK",<a href='/Schedule/GameCenter/Main.aspx?gameDa...,<a href='/Schedule/GameCenter/Main.aspx?gameDa...,M-T,"KNN-R,PM-R",문학,-,20180324LTSK0,롯데,5.0,SK,6.0
3,2018.03.24,토,14:00,"KT,5,4,KIA",<a href='/Schedule/GameCenter/Main.aspx?gameDa...,<a href='/Schedule/GameCenter/Main.aspx?gameDa...,K-2T,KBC-R,광주,-,20180324KTHT0,KT,5.0,KIA,4.0
4,2018.03.24,토,14:00,"한화,3,6,넥센",<a href='/Schedule/GameCenter/Main.aspx?gameDa...,<a href='/Schedule/GameCenter/Main.aspx?gameDa...,"MS-T,SPO-2T",S-R,고척,-,20180324HHWO0,한화,3.0,넥센,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3182,2021.10.30,토,17:00,"LG,2,4,롯데",<a href='/Schedule/GameCenter/Main.aspx?gameDa...,<a href='/Schedule/GameCenter/Main.aspx?gameDa...,SPO-2T,KNN-R,사직,-,20211030LGLT0,LG,2.0,롯데,4.0
3183,2021.10.30,토,17:00,"삼성,11,5,NC",<a href='/Schedule/GameCenter/Main.aspx?gameDa...,<a href='/Schedule/GameCenter/Main.aspx?gameDa...,SS-T,T-R,창원,-,20211030SSNC0,삼성,11.0,NC,5.0
3184,2021.10.30,토,17:00,"키움,6,1,KIA",<a href='/Schedule/GameCenter/Main.aspx?gameDa...,<a href='/Schedule/GameCenter/Main.aspx?gameDa...,"KN-T,G-CMB",,광주,-,20211030WOHT0,키움,6.0,KIA,1.0
3185,2021.10.30,토,17:00,"두산,5,3,한화",<a href='/Schedule/GameCenter/Main.aspx?gameDa...,<a href='/Schedule/GameCenter/Main.aspx?gameDa...,"MS-T,D-CMB",,대전,-,20211030OBHH0,두산,5.0,한화,3.0


In [30]:
# Keep rows whose remark column is '-' (the other values seen in this data
# are cancellation reasons).
con1 = df['비고'] == '-'
# Exclude games whose home side is '드림' or '나눔' (presumably the All-Star
# teams; confirm). The previous `(home != '드림') | (home != '나눔')` was
# true for every row, because no value can equal both names at once.
con2 = ~df['home'].isin(['드림', '나눔'])
df.loc[con1 & con2]

Unnamed: 0,연도,날짜,시간,경기,게임센터,하이라이트,TV,라디오,구장,비고,gameId,away,away_score,home,home_score
0,2018.03.24,토,14:00,"LG,2,4,NC",<a href='/Schedule/GameCenter/Main.aspx?gameDa...,<a href='/Schedule/GameCenter/Main.aspx?gameDa...,"KN-T,SPO-T","KNN-R,GM-R",마산,-,20180324LGNC0,LG,2.0,NC,4.0
1,2018.03.24,토,14:00,"삼성,6,3,두산",<a href='/Schedule/GameCenter/Main.aspx?gameDa...,<a href='/Schedule/GameCenter/Main.aspx?gameDa...,S-T,"T-R,DM-R",잠실,-,20180324SSOB0,삼성,6.0,두산,3.0
2,2018.03.24,토,14:00,"롯데,5,6,SK",<a href='/Schedule/GameCenter/Main.aspx?gameDa...,<a href='/Schedule/GameCenter/Main.aspx?gameDa...,M-T,"KNN-R,PM-R",문학,-,20180324LTSK0,롯데,5.0,SK,6.0
3,2018.03.24,토,14:00,"KT,5,4,KIA",<a href='/Schedule/GameCenter/Main.aspx?gameDa...,<a href='/Schedule/GameCenter/Main.aspx?gameDa...,K-2T,KBC-R,광주,-,20180324KTHT0,KT,5.0,KIA,4.0
4,2018.03.24,토,14:00,"한화,3,6,넥센",<a href='/Schedule/GameCenter/Main.aspx?gameDa...,<a href='/Schedule/GameCenter/Main.aspx?gameDa...,"MS-T,SPO-2T",S-R,고척,-,20180324HHWO0,한화,3.0,넥센,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3182,2021.10.30,토,17:00,"LG,2,4,롯데",<a href='/Schedule/GameCenter/Main.aspx?gameDa...,<a href='/Schedule/GameCenter/Main.aspx?gameDa...,SPO-2T,KNN-R,사직,-,20211030LGLT0,LG,2.0,롯데,4.0
3183,2021.10.30,토,17:00,"삼성,11,5,NC",<a href='/Schedule/GameCenter/Main.aspx?gameDa...,<a href='/Schedule/GameCenter/Main.aspx?gameDa...,SS-T,T-R,창원,-,20211030SSNC0,삼성,11.0,NC,5.0
3184,2021.10.30,토,17:00,"키움,6,1,KIA",<a href='/Schedule/GameCenter/Main.aspx?gameDa...,<a href='/Schedule/GameCenter/Main.aspx?gameDa...,"KN-T,G-CMB",,광주,-,20211030WOHT0,키움,6.0,KIA,1.0
3185,2021.10.30,토,17:00,"두산,5,3,한화",<a href='/Schedule/GameCenter/Main.aspx?gameDa...,<a href='/Schedule/GameCenter/Main.aspx?gameDa...,"MS-T,D-CMB",,대전,-,20211030OBHH0,두산,5.0,한화,3.0


In [25]:
# Distinct values of the home-team column, in order of first appearance.
df.loc[:, 'home'].unique()

array(['NC', '두산', 'SK', 'KIA', '넥센', 'LG', '롯데', '삼성', '한화', 'KT', '드림',
       '키움', '나눔', 'SSG'], dtype=object)