## 라이브러리 로드

In [None]:
%pip install boto3

In [None]:
%pip install polars

In [None]:
%pip install python-dotenv

In [None]:
import os
import sys

import boto3
import polars as pl
from dotenv import load_dotenv

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

%matplotlib inline

import seaborn as sns

sns.set_style("whitegrid")

load_dotenv()

In [None]:
# 한글 흐림현상 방지
%config InlineBackend.figure_format = 'retina'

font_path = "/System/Library/Fonts/Supplemental/AppleGothic.ttf"
font = fm.FontProperties(fname=font_path).get_name()
plt.rc("font", family=font)


matplotlib.rcParams["axes.unicode_minus"] = False

## 데이터 로드

In [None]:
# Bucket name, key prefix and table name are taken from environment variables
# (load_dotenv() is called earlier in the notebook); each is None when unset.
BUCKET_NAME = os.environ.get("BUCKET_NAME")
PREP_PATH = os.environ.get("PREP_PATH")
TABLE_NAME = os.environ.get("TABLE_NAME")

# S3 client for the ap-northeast-2 region, using credentials read from the environment.
s3 = boto3.client(
    "s3",
    region_name="ap-northeast-2",
    aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
)

## 데이터 전처리

In [None]:
import posixpath

# For each month from July to December 2023: download the table's CSV from S3,
# tag the rows with YEAR/MONTH, rename the "_duplicated_N" columns and write the
# result under prep/.

# download_file and write_csv do not create missing parent directories.
os.makedirs("data", exist_ok=True)
os.makedirs("prep", exist_ok=True)

for month in ["07", "08", "09", "10", "11", "12"]:
    raw_path = os.path.join("data", f"{TABLE_NAME}_2023{month}.csv")

    s3.download_file(
        BUCKET_NAME,
        # S3 object keys always use "/" as the separator, whatever the local OS uses.
        posixpath.join(PREP_PATH, "2023", month, f"{TABLE_NAME}.csv"),
        raw_path,
    )

    df = pl.read_csv(raw_path)

    # Record which monthly file each row came from.
    df = df.with_columns(pl.lit(2023).alias("YEAR"), pl.lit(int(month)).alias("MONTH"))

    # The "_duplicated_N" suffixes presumably come from repeated header names in the
    # source file (TODO confirm); give those columns their own names.
    df = df.rename(
        {
            "재결제1회완납건수_duplicated_0": "재결제2회완납건수",
            "재결제1회미납건수_duplicated_0": "재결제2회미납건수",
            "재결제1회완납건수_duplicated_1": "재결제3회완납건수",
        }
    )

    df.write_csv(os.path.join("prep", f"{TABLE_NAME}_2023{month}.csv"))

In [None]:
# Read the legacy export as plain text and split every row on each comma, without
# any CSV quoting rules, so rows with extra commas end up with extra fields.
# The encoding is fixed to UTF-8 so decoding does not depend on the machine's locale.
with open("data/CMS_CUST_LEGACY_TABLE.csv", "r", encoding="utf-8") as f:
    lines = [line.strip().split(",") for line in f]

lines

In [None]:
# Rows that split into more than 17 fields, i.e. the ones that need repairing.
preplines = [fields for fields in lines if len(fields) > 17]
preplines

In [None]:
# Print the fields between the first six and the last ten columns of each over-long row.
# A plain loop is used because the prints are side effects; a list comprehension
# would also echo a list of None values as the cell output.
for row in preplines:
    print(row[6:-10])

In [None]:
def prep_line(line):
    """Merge the fields of an over-long row back into 17 columns.

    A row with more than 17 fields is assumed to have had one text column broken
    apart by unquoted commas. The first field containing "/" is used as the anchor:

    * if at most one field lies between the anchor and the last ten fields, the
      extra commas are taken to be in the company-name column, so fields
      ``line[2:-14]`` are re-joined into one;
    * otherwise the extra commas are taken to be in the service-purpose column,
      so the fields between the anchor and the last ten are re-joined into one.

    The re-joined fragments are printed for inspection. Rows with 17 or fewer
    fields, or without any "/" field, are returned unchanged.

    Args:
        line: list of string fields for one row.

    Returns:
        The repaired list of fields, or ``line`` itself when nothing was changed.
    """
    if len(line) <= 17:
        return line

    # Index of the first field that contains a slash, if any.
    anchor = next((idx for idx, field in enumerate(line) if "/" in field), None)
    if anchor is None:
        return line

    after_anchor = line[anchor + 1 : -10]
    if len(after_anchor) < 2:
        # Commas inside the company name: collapse fields 2..-14 into one.
        name_parts = line[2:-14]
        print(name_parts)
        return line[:2] + [",".join(name_parts)] + line[-14:]

    # Commas inside the service purpose: collapse everything after the anchor.
    print(after_anchor)
    return line[: anchor + 1] + [",".join(after_anchor)] + line[-10:]

In [None]:
# Run every raw row through prep_line; rows that need no repair pass through unchanged.
new_lines = list(map(prep_line, lines))

In [None]:
# Print any rows that still have more than 17 fields after the repair.
# A plain loop is used because the prints are side effects; a list comprehension
# would also echo a list of None values as the cell output.
for row in new_lines:
    if len(row) > 17:
        print(row)

In [None]:
# Display the raw rows again; prep_line builds new lists, so `lines` itself is unmodified.
lines

In [None]:
import csv


def list_to_csv(data, filename, encoding="utf-8"):
    """Write rows to a CSV file using the standard ``csv`` writer.

    Fields containing commas or quotes are quoted by the writer, so the output
    can be parsed by a regular CSV reader.

    Args:
        data: iterable of rows, each row an iterable of field values.
        filename: path of the file to create or overwrite.
        encoding: text encoding of the output file. Defaults to UTF-8 so the
            result does not depend on the machine's locale.
    """
    # newline="" lets the csv module control line endings itself.
    with open(filename, "w", newline="", encoding=encoding) as file:
        csv.writer(file).writerows(data)

In [None]:
# Save the repaired rows as the v2 file.
list_to_csv(data=new_lines, filename="prep/CMS_CUST_LEGACY_TABLE_v2.csv")

In [None]:
# Load the v2 file from prep/ with polars.
df = pl.read_csv(os.path.join("prep", "CMS_CUST_LEGACY_TABLE_v2.csv"))
# Disabled: re-export of the frame under the un-versioned file name.
# df.write_csv(os.path.join("prep", "CMS_CUST_LEGACY_TABLE.csv"))

In [None]:
# Convert the polars frame to a pandas DataFrame and display it.
df = df.to_pandas()
df

In [None]:
# DataFrame.replace returns a new frame rather than modifying df in place, so the
# result must be assigned back; otherwise the literal "null" strings are still
# present when df is exported afterwards.
df = df.replace("null", None)
df

In [None]:
# Export the pandas frame as the v3 file, leaving out the row index.
df.to_csv(os.path.join("prep", "CMS_CUST_LEGACY_TABLE_v3.csv"), index=False)

## 데이터 분석

In [None]:
# Stack the six prepared monthly files (July to December 2023) into one frame.
df = pl.concat(
    [
        pl.read_csv(csv_path)
        for csv_path in (
            os.path.join("prep", f"{TABLE_NAME}_2023{mm}.csv")
            for mm in ("07", "08", "09", "10", "11", "12")
        )
    ]
)

In [None]:
# Preview the concatenated monthly frame.
df

In [None]:
# Convert the two "Y"/"N" flag columns to booleans in a single pass:
# map Y -> 1 and N -> 0, then cast to int and finally to bool.
df = df.with_columns(
    [
        pl.col(flag_col)
        .replace({"Y": 1, "N": 0})
        .cast(int)
        .cast(bool)
        .alias(flag_col)
        for flag_col in ("세금계산서사용여부", "재결제자동처리여부")
    ]
)

In [None]:
# Preview the frame after the Y/N flag columns were converted to booleans.
df

In [None]:
# Remove the YEAR and MONTH helper columns in one call.
df = df.drop(["YEAR", "MONTH"])