In [1]:
import os
import pandas as pd
import numpy as np

from tqdm import tqdm
from datetime import datetime

tqdm.pandas()

import warnings
warnings.filterwarnings("ignore")

In [2]:
def get_cct_dictionary(df, attr_nm='속성1', save_json=True, save_path=os.getcwd()) :
    """속성 해더를 가지고 딕셔너리를 만듬"""

    # 속성명이 있는 행만 추출
    df_headers = df[df['속성 그룹 코드'].isin(['01_속성명'])]
    attr1_col_no = df_headers.columns.get_loc(attr_nm)
    
    df_headers = df_headers[['C|C|T'] + df_headers.columns[attr1_col_no:].to_list()]
    
    # 속성명과 순번에 대한 딕셔너리 생성
    header_list = []
    for i in tqdm(range(len(df_headers))):
        df_header = df_headers.iloc[i].dropna()
        header_list.append(df_header.to_dict())
        # header_list.append(df_headers.iloc[i].to_dict())

    # 딕셔너리 생성
    idx_list = [i for i in range(len(header_list))]
    cct_list = [header_list[i]['C|C|T'] for i in range(len(header_list))]
    dict_idx = dict(zip(cct_list, idx_list))
    
    dic = {'index' : dict_idx, 'header_list' : header_list}
    # json 파일로 저장
    
    if save_json :
        import json            
        today = datetime.today().strftime("%y%m%d")
        with open(os.path.join(save_path, f'cct_dict_{today}.json'), 'w') as f:
            json.dump(dic, f)

    return dic

In [3]:
class MeltingData() :
    def __init__(self,data_path) :
        self.data_path = data_path

    def step1(self) :
        """load data : 데이터 불러오기"""
        try :
            self.df = pd.read_csv(self.data_path, encoding='cp949')
        except :
            self.df = pd.read_csv(self.data_path, encoding='utf8')
        return self
    
    def step2(self) :
        """filtering data : 데이터 필터링"""
        self.df_filtered = self.df[self.df['속성 그룹 코드'].isin(['03_DATA'])]
        return self
    
    def step3(self, col_list=None, key=None) :
        """extract the common part in dataframe : 데이터프레임의 공통 속성 부분 추출 : 기본값은 'SR No', '공정', 'C|C|T'"""
        if col_list is None :
            self.df_common = self.df_filtered[self.df_filtered['속성 그룹 코드'].isin(['03_DATA'])]
            self.df_common = self.df_common[['SR No', '공정', 'C|C|T']]
            self.df_common.drop_duplicates(subset=['SR No'], keep='first', inplace=True)
        else :
            self.df_common = self.df_filtered[self.df_filtered['속성 그룹 코드'].isin(['03_DATA'])]
            self.df_common = self.df_common[col_list]
            self.df_common.drop_duplicates(subset=[key], keep='first', inplace=True)
        
        return self
    
    def step4(self, drop_null=True) :
        """make attribute dataframe data : 속성값 데이터프레임 생성"""
        self.attr_1_col_no = self.df_filtered.columns.get_loc('속성1')
        self.df_attrs = self.df_filtered[self.df_filtered['속성 그룹 코드'].isin(['03_DATA'])]
        self.df_attrs = self.df_attrs[['SR No'] + self.df_attrs.columns[self.attr_1_col_no:].to_list()]
        
        # melt
        self.df_attrs = pd.melt(self.df_attrs, id_vars=['SR No'], value_vars=self.df_attrs.iloc[:,1:].columns.to_list(), var_name='속성순번', value_name='속성값', col_level=None, ignore_index=True)
        if drop_null :
            self.df_attrs = self.df_attrs.dropna()
        
        return self

    def step5(self) :
        """merge common dataframe and attribute dataframe : 공통 데이터프레임과 개별속성 데이터프레임 병합"""
        self.df_indiv = pd.merge(self.df_attrs, self.df_common, on='SR No', how='left')
        return self
    
    def step6(self, att_header) :
        """change_attribute_name : 속성명 변경"""
        def change_attribute_name(dict_idx, value_name, cct, header_list) :
            idx = dict_idx[cct]
            dict_attribute_nm = header_list[idx]
            try :
                new_nm = dict_attribute_nm[value_name]
            except :
                new_nm = "Dumb"
            return new_nm
        
        self.dict_idx = att_header['index']
        self.header_list = att_header['header_list']
        self.df_indiv['속성명'] = self.df_indiv.progress_apply(lambda x : change_attribute_name(self.dict_idx, x['속성순번'], x['C|C|T'], self.header_list), axis=1)
        return self
    
    def step7(self) :
        """drop dumb value : Dumb 값 제거"""
        self.df_indiv = self.df_indiv[self.df_indiv['속성명'] != 'Dumb']
        return self
    
    def execute(self) :
        self.step1()
        self.step2()
        self.step3()
        self.step4()
        self.step5()
        self.step6()
        self.step7()
        return self.df_indiv
    
    def help(self) :
        print('step1() : load data : 데이터 불러오기') 
        print('step2() : filtering data : 데이터 필터링')
        print('step3() : extract the common part in dataframe : 데이터프레임의 공통 속성 부분 추출')
        print('step4() : make attribute dataframe data : 속성값 데이터프레임 생성')
        print('step5() : merge common dataframe and attribute dataframe : 공통 데이터프레임과 개별속성 데이터프레임 병합')
        print('step6() : change_attribute_name : 속성명 변경')
        print('step7() : drop dumb value : Dumb 값 제거')
        print('execute() : run all steps')

    def show_attributes(self):
        # 인스턴스 속성
        instance_attributes = self.__dict__
        print("Instance attributes:")
        for attr, value in instance_attributes.items():
            print(f"{attr}")

In [4]:
## step 0 : define the path of the data
data_path = "C:\\Users\\ASUS\\Documents\\00_GS_DT\\Data_Insight\\Support Process Management\\test_file"
data_path = os.path.join(data_path, '2101.csv')
cct_path = "C:\\Users\\ASUS\Documents\\00_GS_DT\\Data_Insight\\Support Process Management\\test_file\\Attr_header.json"
melting_data = MeltingData(data_path)
melting_data.help()

step1() : load data : 데이터 불러오기
step2() : filtering data : 데이터 필터링
step3() : extract the common part in dataframe : 데이터프레임의 공통 속성 부분 추출
step4() : make attribute dataframe data : 속성값 데이터프레임 생성
step5() : merge common dataframe and attribute dataframe : 공통 데이터프레임과 개별속성 데이터프레임 병합
step6() : change_attribute_name : 속성명 변경
step7() : drop dumb value : Dumb 값 제거
execute() : run all steps


In [5]:
df = pd.read_csv(data_path, encoding='utf8')
dic_cct = get_cct_dictionary(df, attr_nm='속성1', save_json=True, save_path=os.getcwd())

100%|██████████| 142/142 [00:00<00:00, 1140.16it/s]


In [6]:
import json
json_path = "C:\\Users\\ASUS\Documents\\00_GS_DT\\Data_Insight\\Support Process Management\\test_file\\Attr_header.json"
with open(json_path, 'r') as f:
    dic_cct = json.load(f)

In [7]:
melting_data.step1()
melting_data.step2()
melting_data.step3()
melting_data.step4(drop_null=False)
melting_data.step5()
melting_data.step6(dic_cct)
melting_data.step7()
melting_data.show_attributes()

100%|██████████| 20809651/20809651 [01:37<00:00, 212636.49it/s]


Instance attributes:
data_path
df
df_filtered
df_common
attr_1_col_no
df_attrs
df_indiv
dict_idx
header_list


In [8]:
df_result = melting_data.df_indiv

In [10]:
df_result = df_result[df_result['속성명'] != 'Dumb']
df_result['SR_No_ATTR'] = df_result['SR No'] + '|' + df_result['속성명']
df_result

Unnamed: 0,SR No,속성순번,속성값,공정,C|C|T,속성명,SR_No_ATTR
0,DOF522002061,속성1,3810mm /2740mm,522,FIXED EQUIPMENT|VESSEL|COLUMN,HEADER SIZE ID,DOF522002061|HEADER SIZE ID
1,DOF522004113,속성1,1680mm / 3048mm,522,FIXED EQUIPMENT|VESSEL|COLUMN,HEADER SIZE ID,DOF522004113|HEADER SIZE ID
2,DOF522005601,속성1,1220|mm,522,FIXED EQUIPMENT|VESSEL|COLUMN,HEADER SIZE ID,DOF522005601|HEADER SIZE ID
3,DOF522000246,속성1,1900|mm,522,FIXED EQUIPMENT|VESSEL|HORIZONTAL,HEADER SIZE ID,DOF522000246|HEADER SIZE ID
4,DOF522000344,속성1,2300|mm,522,FIXED EQUIPMENT|VESSEL|HORIZONTAL,HEADER SIZE ID,DOF522000344|HEADER SIZE ID
...,...,...,...,...,...,...,...
6298735,DOF522004708,속성985,CASTABLE (INCT - 110s),522,FIXED EQUIPMENT|FIRED HEATER|CABIN/BOX TYPE,MECHANICAL DESIGN CONDITIONS HEADER BOXES LINI...,DOF522004708|MECHANICAL DESIGN CONDITIONS HEAD...
6305133,DOF522001061,속성986,50|mm,522,FIXED EQUIPMENT|FIRED HEATER|CABIN/BOX TYPE,MECHANICAL DESIGN CONDITIONS HEADER BOXES LOCA...,DOF522001061|MECHANICAL DESIGN CONDITIONS HEAD...
6305134,DOF522003260,속성986,50|mm,522,FIXED EQUIPMENT|FIRED HEATER|CABIN/BOX TYPE,MECHANICAL DESIGN CONDITIONS HEADER BOXES LOCA...,DOF522003260|MECHANICAL DESIGN CONDITIONS HEAD...
6305135,DOF522004381,속성986,50|mm,522,FIXED EQUIPMENT|FIRED HEATER|CABIN/BOX TYPE,MECHANICAL DESIGN CONDITIONS HEADER BOXES LOCA...,DOF522004381|MECHANICAL DESIGN CONDITIONS HEAD...


In [11]:
df_result.to_csv('2101_개별속성.csv', index=False, encoding='utf8')

In [17]:
df_service = melting_data.df_filtered
df_servie = df_service[df_service['속성 그룹 코드'].isin(['03_DATA'])]
df_service = df_service[['SR No', '출처', '파일목록', '공정', 'Tag No 수정', '카테고리', '클래스', '타입', 'C|C|T']]
df_service.drop_duplicates(subset=['SR No'], keep='first', inplace=True)

In [18]:
df_service.to_csv('2101_공통속성.csv', index=False, encoding='utf8')