### 엑셀 파일 전체 전처리

In [None]:
# 필요한 라이브러리 로드
import os
import pandas as pd
import xlwings as xw
import datetime

In [None]:
# Locate the raw report inside './adobe_raw/'.
# The first entry (in sorted order) whose name starts with
# 'New_Weekly_Scoreboard(Weekly)_Ken' is used. os.listdir returns entries in
# arbitrary order, so sorting makes the choice deterministic when several
# files match.
for filename in sorted(os.listdir('./adobe_raw/')):
    if filename.startswith('New_Weekly_Scoreboard(Weekly)_Ken'):
        scoreboard_weekly_file = './adobe_raw/' + filename
        break
else:
    # Fail here with a clear message instead of leaving
    # `scoreboard_weekly_file` undefined (or stale from an earlier run).
    raise FileNotFoundError(
        "No file starting with 'New_Weekly_Scoreboard(Weekly)_Ken' found in './adobe_raw/'"
    )

In [None]:
# Open the file through Excel with xlwings and load the first sheet.
# Per the original note, the input is a DRM-protected CSV, which is why the
# Excel application itself is driven via xlwings.
# try...finally makes sure Excel is shut down even when reading fails.
# NOTE(review): app.quit() closes the whole Excel instance hosting this
# workbook — presumably no other workbooks are open in it; confirm.
workbook = None
try:
    workbook = xw.Book(scoreboard_weekly_file)
    # used_range.value yields the cell values of the sheet's used range.
    sheet1 = workbook.sheets[0].used_range.value
    df_scoreboard_weekly = pd.DataFrame(sheet1)
finally:
    # Compare against None explicitly so cleanup never depends on how a
    # Book object evaluates in a boolean context.
    if workbook is not None:
        workbook.app.quit()

In [None]:
# Clean the raw sheet dump.
# The first 9 rows are skipped; after that, rows are dropped when their first
# cell is the separator line, is missing, or is one of the labels listed in
# `useless_rows`. The surviving rows are renumbered from 0.
separator_line = '##############################################'
useless_rows = ['Model Category_2depth', 'Segments', 'All Page Track (p6)']

report_body = df_scoreboard_weekly.iloc[9:]
first_cell = report_body[0]
rows_to_keep = (
    (first_cell != separator_line)
    & first_cell.notna()
    & ~first_cell.isin(useless_rows)
)
df_cleaned = report_body[rows_to_keep].reset_index(drop=True).copy()

### DataFrame 분리

In [None]:
# Work out where each table sits inside df_cleaned.
# A row whose first cell starts with '#' is treated as a table title. Each
# table starts on the row after its title and ends on the row before the next
# title (or on the last row of the frame for the final table).
is_title_row = df_cleaned[0].str.startswith('#', na=False)
title_index = df_cleaned.loc[is_title_row].index
begin_index = title_index + 1
next_title_or_end = title_index[1:].append(pd.Index([len(df_cleaned)]))
end_index = next_title_or_end - 1

In [None]:
# Keep every table in one dictionary, keyed by its title.
# The key is the title cell with every '# ' occurrence removed; the value is
# an independent copy of the rows between begin and end (inclusive).
dataframes = {
    df_cleaned[0].iloc[title_pos].replace('# ', ''): df_cleaned.iloc[first_pos:last_pos + 1].copy()
    for title_pos, first_pos, last_pos in zip(title_index, begin_index, end_index)
}

print("생성된 데이터프레임 목록:", list(dataframes))

#### 각 데이터프레임 정제 및 처리

In [None]:
# Process the 'weekly_visits_by_model_category' table.
df = dataframes['weekly_visits_by_model_category']
# Keep only data rows: drop rows whose second cell is the literal 'Week'
# (presumably the table's own header row) or is missing. notna() covers
# None, NaN and NaT alike.
is_data_row = df[1].notna() & ~df[1].isin(['Week'])
# Copy the filtered rows so the assignments below write to an independent
# frame rather than to a slice of the parent (avoids SettingWithCopyWarning).
df = df[is_data_row].copy()
df.columns = ['Model Category_2depth', 'WeekNumber', 'Visits']
df['WeekNumber'] = pd.to_datetime(df['WeekNumber'])
# Values that cannot be parsed as numbers become 0.
df['Visits'] = pd.to_numeric(df['Visits'], errors='coerce').fillna(0).astype(int)
dataframes['weekly_visits_by_model_category'] = df.copy()

In [None]:
# Process the 'weekly_visits_by_page_type' table.
df = dataframes['weekly_visits_by_page_type']
# Keep only data rows: drop rows whose second cell is the literal 'Week'
# (presumably the table's own header row) or is missing. notna() covers
# None, NaN and NaT alike.
is_data_row = df[1].notna() & ~df[1].isin(['Week'])
# Copy the filtered rows so the assignments below write to an independent
# frame rather than to a slice of the parent (avoids SettingWithCopyWarning).
df = df[is_data_row].copy()
df.columns = ['Page Type', 'WeekNumber', 'Visits']
df['WeekNumber'] = pd.to_datetime(df['WeekNumber'])
# Values that cannot be parsed as numbers become 0.
df['Visits'] = pd.to_numeric(df['Visits'], errors='coerce').fillna(0).astype(int)
dataframes['weekly_visits_by_page_type'] = df.copy()

In [None]:
# Process the 'weekly_visits_by_all_page_track' table.
df = dataframes['weekly_visits_by_all_page_track']
# Keep only data rows: drop rows whose second cell is the literal 'Week'
# (presumably the table's own header row) or is missing. notna() covers
# None, NaN and NaT alike.
is_data_row = df[1].notna() & ~df[1].isin(['Week'])
# Copy the filtered rows so the assignments below write to an independent
# frame rather than to a slice of the parent (avoids SettingWithCopyWarning).
df = df[is_data_row].copy()
df.columns = ['All Page Track (p6)', 'WeekNumber', 'Visits']
df['WeekNumber'] = pd.to_datetime(df['WeekNumber'])
# Values that cannot be parsed as numbers become 0.
df['Visits'] = pd.to_numeric(df['Visits'], errors='coerce').fillna(0).astype(int)
dataframes['weekly_visits_by_all_page_track'] = df.copy()

In [None]:
# Process the 'model_cateogry_1_and_2depth' table (the key is spelled the
# way it is looked up in `dataframes`; do not "correct" it here).
df = dataframes['model_cateogry_1_and_2depth']
# Keep only data rows: drop rows whose second cell is the literal
# 'Model Category_2depth' (presumably the table's own header row) or is
# missing. notna() covers None, NaN and NaT alike.
is_data_row = df[1].notna() & ~df[1].isin(['Model Category_2depth'])
# Copy the filtered rows so the assignments below write to an independent
# frame rather than to a slice of the parent (avoids SettingWithCopyWarning).
df = df[is_data_row].copy()
df.columns = ['Model Category_1depth', 'Model Category_2depth', 'Visits']
# Values that cannot be parsed as numbers become 0.
df['Visits'] = pd.to_numeric(df['Visits'], errors='coerce').fillna(0).astype(int)
dataframes['model_cateogry_1_and_2depth'] = df.copy()

#### 데이터 병합 및 최종 가공

In [None]:
# Build df_week (week lookup table): one row per distinct date found in the
# 'WeekNumber' column of 'weekly_visits_by_model_category', together with its
# ISO week label and the date six days later.
week_starts = dataframes['weekly_visits_by_model_category']['WeekNumber'].drop_duplicates()
df_week = week_starts.to_frame(name='FirstDate')
iso_parts = df_week['FirstDate'].dt.isocalendar()
df_week['ISOYear'] = iso_parts['year'].astype('str')
df_week['ISOWeek'] = iso_parts['week'].astype('str')
# The label is 'ISOYear-ISOWeek'; the week number is not zero-padded.
df_week['WeekNumber'] = df_week['ISOYear'] + '-' + df_week['ISOWeek']
df_week['LastDate'] = df_week['FirstDate'] + datetime.timedelta(days=6)
dataframes['week'] = df_week[['WeekNumber', 'FirstDate', 'LastDate']].copy()

In [None]:
# Helper that converts the date stored in 'WeekNumber' into an ISO week label.
def convert_date_to_week(df):
    """Return a copy of *df* whose 'WeekNumber' column holds ISO week labels.

    Args:
        df: DataFrame with a datetime-like 'WeekNumber' column.

    Returns:
        A new DataFrame with the same rows. The 'WeekNumber' column is moved
        to the last position and contains strings of the form
        'ISOYear-ISOWeek' (week number not zero-padded), e.g. '2024-1'.
        The input frame is left unmodified.

    Raises:
        KeyError: If *df* has no 'WeekNumber' column.
        AttributeError: If 'WeekNumber' is not datetime-like.
    """
    iso_parts = df['WeekNumber'].dt.isocalendar()
    week_labels = iso_parts['year'].astype('str') + '-' + iso_parts['week'].astype('str')
    # drop() returns a new frame, so the caller's frame is never renamed or
    # extended with helper columns.
    result = df.drop(columns=['WeekNumber'])
    result['WeekNumber'] = week_labels
    return result

In [None]:
# Apply the date-to-week conversion to the three weekly tables, replacing
# each entry in `dataframes` with the converted frame.
for weekly_table in (
    'weekly_visits_by_model_category',
    'weekly_visits_by_page_type',
    'weekly_visits_by_all_page_track',
):
    dataframes[weekly_table] = convert_date_to_week(dataframes[weekly_table])

In [None]:
# Build 'weekly_visits_by_model_category_all': attach the 1-depth category to
# every weekly row with a left join on 'Model Category_2depth'.
# Only the two category columns are taken from the lookup table, so its own
# 'Visits' column never enters the join and the weekly 'Visits' keeps its name.
# NOTE(review): if a 2-depth category appears more than once in the lookup
# table, the left join repeats the matching weekly rows — confirm the lookup
# is unique per 2-depth category.
category_lookup = dataframes['model_cateogry_1_and_2depth'][
    ['Model Category_1depth', 'Model Category_2depth']
]
df_merged = dataframes['weekly_visits_by_model_category'].merge(
    category_lookup,
    how='left',
    on='Model Category_2depth',
)
dataframes['weekly_visits_by_model_category_all'] = df_merged[
    ['WeekNumber', 'Model Category_1depth', 'Model Category_2depth', 'Visits']
].copy()

#### CSV로 저장

In [None]:
# Write the final tables to './csv_raw' as CSV files.
# Create the output folder first: to_csv does not create missing directories.
os.makedirs('./csv_raw', exist_ok=True)
for name, df in dataframes.items():
    # Only these tables are exported; every other entry is skipped.
    if name in ['weekly_visits_by_page_type', 'weekly_visits_by_all_page_track', 'week', 'weekly_visits_by_model_category_all']:
        output_path = os.path.join('./csv_raw', f'{name}.csv')
        # 'utf-8-sig' prepends a BOM to the file; the row index is not written.
        df.to_csv(output_path, index=False, encoding='utf-8-sig')
        print(f"'{output_path}' 저장 완료")