In [52]:
import pandas as pd



def select_non(series):
    """Return the first non-null value of ``series``, or ``None`` if it has none.

    Used as a group-by aggregator: when a group holds several rows for the
    same key, the first row that actually carries a value wins.
    """
    non_null = series.dropna()
    if non_null.empty:
        return None
    return non_null.iloc[0]

def preprocess(file_path, item_names_to_extract, value_column="당기 1분기"):
    """Load a financial-statement Excel sheet and pivot selected items into columns.

    Parameters
    ----------
    file_path : str or path-like
        Excel file passed to ``pd.read_excel``. It must contain the columns
        '회사명', '종목코드', '항목명' and ``value_column``.
    item_names_to_extract : list of str
        Item names (after normalisation, i.e. Hangul syllables only) to keep.
    value_column : str, default "당기 1분기"
        Column holding the amount to extract for each item.

    Returns
    -------
    pandas.DataFrame
        One row per ('회사명', '종목코드') pair and one column per extracted item
        name that occurs in the data. When an item appears several times for
        a company, the first non-null value is used.

    Raises
    ------
    KeyError
        If any of the required columns is missing from the sheet.
    """
    data = pd.read_excel(file_path)

    # Fail early with a message that names the missing columns instead of a
    # bare KeyError from somewhere inside groupby/pivot.
    required_columns = ["회사명", "종목코드", "항목명", value_column]
    missing_columns = [col for col in required_columns if col not in data.columns]
    if missing_columns:
        raise KeyError(f"{file_path}: missing required column(s): {missing_columns}")

    # Keep only Hangul syllables in '항목명'. The negated character class also
    # removes spaces, so no separate space-stripping pass is needed.
    data["항목명"] = data["항목명"].str.replace(r"[^가-힣]", "", regex=True)

    # Strip the square brackets around the ticker code. regex=False makes the
    # literal match explicit: "[" is a regex metacharacter and the default of
    # `regex` differs between pandas versions.
    data["종목코드"] = (
        data["종목코드"]
        .str.replace("[", "", regex=False)
        .str.replace("]", "", regex=False)
    )

    # Keep the requested items and collapse duplicates per company/item to the
    # first non-null value.
    extracted_data = data[data["항목명"].isin(item_names_to_extract)]
    grouped_data = (
        extracted_data.groupby(["회사명", "종목코드", "항목명"])
        .agg({value_column: select_non})
        .reset_index()
    )

    # Long -> wide: one column per item name.
    final_data = grouped_data.pivot(
        index=["회사명", "종목코드"], columns="항목명", values=value_column
    ).reset_index()

    return final_data



# Example usage:
# BS_df = preprocess('path_to_financial_data.xlsx', ['유동자산', '비유동자산', ...])

# Item names (normalised '항목명' values) to extract from each statement file.
# Balance sheet items.
BS_item = [
    "유동자산",
    "비유동자산",
    "자산총계",
    "자본총계",
    "부채및자본총계",
    "유동부채",
    "비유동부채",
    "부채총계",
]
# Cash-flow statement items.
CF_item = [
    "영업활동현금흐름",
]
# Income statement items.
IC_item = [
    "매출액",
    "매출총이익",
    "순이익",
]


def fillnan(df):
    """Back-fill missing balance-sheet totals from their components, in place.

    Only NaN cells are filled; existing values are never overwritten. The
    steps run in order because later ones use totals filled by earlier ones:

    1. 자산총계      <- 유동자산 + 비유동자산
    2. 부채총계      <- 유동부채 + 비유동부채
    3. 부채및자본총계 <- 부채총계 + 자본총계
    4. 자본총계      <- 부채및자본총계 - 부채총계

    A later cell defines ``fillnan`` again; step 4 is included here so both
    definitions give the same result regardless of which cell ran last.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with all eight balance-sheet columns named above. It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenience.
    """
    df["자산총계"] = df["자산총계"].fillna(df["유동자산"] + df["비유동자산"])
    df["부채총계"] = df["부채총계"].fillna(df["유동부채"] + df["비유동부채"])
    df["부채및자본총계"] = df["부채및자본총계"].fillna(df["부채총계"] + df["자본총계"])
    df["자본총계"] = df["자본총계"].fillna(df["부채및자본총계"] - df["부채총계"])
    return df


# Build the balance-sheet table: one row per (회사명, 종목코드), one column per item in BS_item.
# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
BS_df = preprocess(
    "C:/Users/dochy/Desktop/고려대/딥러닝/project/fsdata/2023_1Q_BS_filter.xlsx", BS_item
)



In [53]:
def fillnan(df):
    """Fill missing balance-sheet totals from their components, in place.

    Existing values are kept; only NaN cells are filled. Order matters:
    each rule may rely on a total filled by a previous rule.

    Returns the same ``df`` object that was passed in.
    """
    # (total, first component, second component): total <- first + second
    sum_rules = (
        ("자산총계", "유동자산", "비유동자산"),
        ("부채총계", "유동부채", "비유동부채"),
        ("부채및자본총계", "부채총계", "자본총계"),
    )
    for total, first, second in sum_rules:
        component_sum = df[first] + df[second]
        df[total] = df[total].fillna(component_sum)

    # Equity is recovered as the difference once the two totals are available.
    implied_equity = df["부채및자본총계"] - df["부채총계"]
    df["자본총계"] = df["자본총계"].fillna(implied_equity)
    return df

# Back-fill missing balance-sheet totals. fillnan assigns into the frame it is
# given, so BS_df itself is updated; the returned frame is the cell output.
fillnan(BS_df)

항목명,회사명,종목코드,부채및자본총계,부채총계,비유동부채,비유동자산,유동부채,유동자산,자본총계,자산총계
0,3S,060310,6.595678e+10,2.006824e+10,2.424885e+09,4.119151e+10,1.764336e+10,2.476527e+10,4.588854e+10,6.595678e+10
1,AJ네트웍스,095570,1.390079e+12,1.001349e+12,2.680423e+11,1.186349e+12,7.333065e+11,2.037299e+11,3.887301e+11,1.390079e+12
2,AK홀딩스,006840,9.322353e+11,3.811836e+11,2.081565e+07,8.567521e+11,3.811627e+11,7.548319e+10,5.510517e+11,9.322353e+11
3,APS,054620,2.753778e+11,9.357518e+10,2.270707e+10,2.245058e+11,7.086811e+10,5.087194e+10,1.818026e+11,2.753778e+11
4,AP시스템,265520,5.436351e+11,2.671041e+11,5.505503e+10,1.513681e+11,2.120491e+11,3.922670e+11,2.765309e+11,5.436351e+11
...,...,...,...,...,...,...,...,...,...,...
2266,흥국,010240,9.808896e+10,1.862550e+10,3.345051e+09,4.065163e+10,1.528045e+10,5.743732e+10,7.946346e+10,9.808896e+10
2267,흥국에프엔비,189980,1.886286e+11,1.018184e+11,7.069427e+10,1.470849e+11,3.112410e+10,4.154375e+10,8.681026e+10,1.886286e+11
2268,흥아해운,003280,3.014978e+11,1.641065e+11,1.154552e+11,2.196050e+11,4.865131e+10,8.189277e+10,1.373912e+11,3.014978e+11
2269,희림,037440,1.925270e+11,1.268609e+11,1.874723e+10,4.889646e+10,1.081137e+11,1.436306e+11,6.566610e+10,1.925270e+11


In [54]:
# Build the cash-flow table: one row per (회사명, 종목코드), one column per item in CF_item.
# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
CF_df = preprocess(
    "C:/Users/dochy/Desktop/고려대/딥러닝/project/fsdata/2023_1Q_CF_filter.xlsx", CF_item
)

In [28]:
# Display the cash-flow table.
# NOTE(review): this cell's execution count (28) is out of sequence with the
# neighbouring cells (54, 55), so the output below may predate the current
# CF_df — re-run to confirm.
CF_df

항목명,회사명,종목코드,영업활동현금흐름
0,3S,060310,5.641026e+09
1,AJ네트웍스,095570,3.549927e+09
2,AK홀딩스,006840,1.502585e+11
3,APS,054620,-4.631668e+09
4,AP시스템,265520,-4.762685e+10
...,...,...,...
1846,흥국,010240,3.321402e+09
1847,흥국에프엔비,189980,-1.543191e+09
1848,흥아해운,003280,6.529725e+09
1849,희림,037440,-5.021215e+09


In [55]:
# Build the income-statement table: one row per (회사명, 종목코드), one column per item in IC_item.
# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
IC_df = preprocess(
    "C:/Users/dochy/Desktop/고려대/딥러닝/project/fsdata/2023_1Q_IC_filter.xlsx", IC_item
)

In [56]:
# Display the income-statement table.
IC_df

항목명,회사명,종목코드,매출액,매출총이익,순이익
0,3S,060310,1.331206e+10,1.920551e+09,1.081571e+09
1,AJ네트웍스,095570,,,4.755168e+09
2,AK홀딩스,006840,1.102303e+12,2.968355e+11,6.843604e+10
3,APS,054620,4.998406e+09,2.104061e+09,-3.391231e+09
4,AP시스템,265520,9.397471e+10,2.659035e+10,1.218080e+10
...,...,...,...,...,...
1856,흥국,010240,3.930823e+10,6.996745e+09,4.222078e+09
1857,흥국에프엔비,189980,2.585867e+10,8.839015e+09,2.040203e+09
1858,흥아해운,003280,4.329224e+10,1.289786e+10,7.493210e+09
1859,희림,037440,5.120846e+10,5.834408e+09,1.607548e+09


In [57]:
# Combine the three statement tables on company name and ticker code. Outer
# joins keep companies that are absent from one or more of the tables.
df_23_1Q = (
    BS_df
    .merge(CF_df, on=['회사명', '종목코드'], how='outer')
    .merge(IC_df, on=['회사명', '종목코드'], how='outer')
)

In [58]:
# Load the 2023 1Q label sheet (price-change data per ticker).
# openpyxl reports "Workbook contains no default style" for this file; the
# load still completes.
# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
label_23_1Q = pd.read_excel('C:/Users/dochy/Desktop/고려대/딥러닝/project/label/23_1Q.XLSX')

  warn("Workbook contains no default style, apply openpyxl's default")


In [59]:
import pandas as pd
# Label-sheet columns removed before the labels are merged with the features
# (passed to create_class_and_merge below).
drop_columns = ['시작일 기준가','종료일 종가','대비','거래량','거래대금']

def _classify_return(rate):
    """Map a percentage return to a class id; a missing return stays missing."""
    # NaN fails every comparison below and would otherwise fall through to
    # class 3, so it is handled explicitly.
    if pd.isna(rate):
        return float("nan")
    if rate <= -10:
        return 0
    if rate < 0:
        return 1
    if rate < 10:
        return 2
    return 3


def create_class_and_merge(data_df, label_df, drop_columns, how="outer"):
    """Derive a return class from '등락률' and merge the labels onto ``data_df``.

    Classes (``x`` is the value in '등락률'):

    * 0: x <= -10
    * 1: -10 < x < 0
    * 2: 0 <= x < 10
    * 3: x >= 10
    * NaN: '등락률' is missing

    Parameters
    ----------
    data_df : pandas.DataFrame
        Feature table; must contain '종목코드'.
    label_df : pandas.DataFrame
        Label table; must contain '종목코드', '등락률' and every column listed in
        ``drop_columns``. It is not modified.
    drop_columns : list of str
        Columns of ``label_df`` to drop before merging.
    how : str, default "outer"
        Join type forwarded to ``pd.merge``. The default keeps rows that exist
        on only one side, so the result can contain rows without features or
        without a label.

    Returns
    -------
    pandas.DataFrame
        ``data_df`` merged with the remaining label columns plus 'class'.
    """
    # Build the class column on a new frame so the caller's label_df is left
    # untouched ('class' is a Python keyword, hence the dict unpacking).
    labelled = label_df.assign(**{"class": label_df["등락률"].apply(_classify_return)})

    # Drop the columns that are not needed downstream.
    labelled = labelled.drop(columns=drop_columns)

    # Merge features and labels on the ticker code.
    merged_df = pd.merge(data_df, labelled, on=["종목코드"], how=how)

    return merged_df


# Attach the return-class labels to the 2023 1Q feature table and display the result.
# NOTE(review): the return value is not assigned, so the merged frame is only
# shown as cell output — assign it to a name if later cells need it.
create_class_and_merge(df_23_1Q, label_23_1Q, drop_columns)

Unnamed: 0,회사명,종목코드,부채및자본총계,부채총계,비유동부채,비유동자산,유동부채,유동자산,자본총계,자산총계,영업활동현금흐름,매출액,매출총이익,순이익,종목명,등락률,class
0,3S,060310,6.595678e+10,2.006824e+10,2.424885e+09,4.119151e+10,1.764336e+10,2.476527e+10,4.588854e+10,6.595678e+10,5.641026e+09,1.331206e+10,1.920551e+09,1.081571e+09,3S,23.63,3.0
1,AJ네트웍스,095570,1.390079e+12,1.001349e+12,2.680423e+11,1.186349e+12,7.333065e+11,2.037299e+11,3.887301e+11,1.390079e+12,3.549927e+09,,,4.755168e+09,AJ네트웍스,-4.88,1.0
2,AK홀딩스,006840,9.322353e+11,3.811836e+11,2.081565e+07,8.567521e+11,3.811627e+11,7.548319e+10,5.510517e+11,9.322353e+11,1.502585e+11,1.102303e+12,2.968355e+11,6.843604e+10,AK홀딩스,8.61,2.0
3,APS,054620,2.753778e+11,9.357518e+10,2.270707e+10,2.245058e+11,7.086811e+10,5.087194e+10,1.818026e+11,2.753778e+11,-4.631668e+09,4.998406e+09,2.104061e+09,-3.391231e+09,APS,-31.71,0.0
4,AP시스템,265520,5.436351e+11,2.671041e+11,5.505503e+10,1.513681e+11,2.120491e+11,3.922670e+11,2.765309e+11,5.436351e+11,-4.762685e+10,9.397471e+10,2.659035e+10,1.218080e+10,AP시스템,-4.26,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2696,,008775,,,,,,,,,,,,,호텔신라우,-11.22,0.0
2697,,212310,,,,,,,,,,,,,휴벡셀,-10.19,0.0
2698,,000540,,,,,,,,,,,,,흥국화재,-0.79,1.0
2699,,000547,,,,,,,,,,,,,흥국화재2우B,-57.85,0.0
