#### 전선 데이터 처리

In [31]:
import pandas as pd
from datetime import datetime

from freeman.aiddd.data_manager import read_data, write_data, common_columns

#### 데이터 불러오기

In [32]:
_start_time = datetime.now()

# 4차 전처리: 공사비 + 전주 데이터 
_df_cons = read_data('1c-merge-cons-pole', process_seq='4th')

# 4차 전처리: 전주 데이터(확인용)
_df_pole = read_data('1c-preprocessed-pole', process_seq='4th')

# 3번째 제공받은 전선 데이터
_df_line = read_data('LINE_DATA', data_type='provide', process_seq='3rd')

print(
    f'Total Elapsed Time for Data Load: {datetime.now() - _start_time}\n'
    f'Provided Data Shape: {_df_line.shape}\n'
    f'Preprocessed Data(cons) Shape: {_df_cons.shape}\n'
    f'Preprocessed Data(pole) Shape: {_df_pole.shape}'
)

Total Elapsed Time for Data Load: 0:00:21.825749
Provided Data Shape: (40019, 77)
Preprocessed Data(cons) Shape: (14728, 35)
Preprocessed Data(pole) Shape: (26920, 20)


In [33]:
df_cons, df_pole, df_line = _df_cons.copy(), _df_pole.copy(), _df_line.copy()

#### 데이터 확인 및 학습 컬럼 추출

In [34]:
df_line.columns

Index(['공사번호', '공사변경순번', '전산화번호', 'GISID', '공사구분코드', '전원측전산화번호', '전압구분코드',
       '설치순서', '공사성격코드', '회선명', '결선방식코드', '상배열구분코드', '지지물간거리', '전선종류코드1',
       '전선규격코드1', '전선조수1', '전선구분코드1', '전선조정코드1', '전선공사특성코드2', '전선종류코드2',
       '전선규격코드2', '전선조수2', '전선구분코드2', '전선조정코드2', '전선펴기여부', '중성선공사특성코드',
       '중성선종류코드', '중성선규격코드', '중성선구분코드', '중성선조정코드', '중성선공용구분코드', '직선접속공사특성코드',
       '직선접속수량', '슬리브접속수량', '시공방법코드', '상부가설여부', '수목접촉여부', '건물방호관코드', '건물방호관수량',
       '고객제공여부', '직선접속중성선공사성격코드', '직선접속중성선수량', '전압선중성선사용여부', '중성선전압선사용여부',
       '전선펴기긴선여부', '기설전선여부', '전선공사특성코드3', '전선종류코드3', '전선규격코드3', '전선구분코드3',
       '전선조정코드3', '기설설비유실여부', '사업소코드', '변전소코드', '회선코드', '기설설비상태코드', '위험표지판수량',
       '종단접속공사코드', '종단접속수량', '조가형태공사코드', '조가형태코드', '파형관규격코드', '디지털장력계여부',
       '분산형전원표시판수', '케이블헤더수량', 'NDIS사업소코드', '최초등록일시', '최초등록자사번', '최종변경일시',
       '최종변경자사번', '사업소명', '접수종류코드', '접수종류명', '공사형태코드', '공사형태명', '계약전력',
       'X좌표-Y좌표'],
      dtype='object')

In [35]:
[col for col in df_line.columns if '전선' in col]

['전선종류코드1',
 '전선규격코드1',
 '전선조수1',
 '전선구분코드1',
 '전선조정코드1',
 '전선공사특성코드2',
 '전선종류코드2',
 '전선규격코드2',
 '전선조수2',
 '전선구분코드2',
 '전선조정코드2',
 '전선펴기여부',
 '전선펴기긴선여부',
 '기설전선여부',
 '전선공사특성코드3',
 '전선종류코드3',
 '전선규격코드3',
 '전선구분코드3',
 '전선조정코드3']

In [36]:
df_line['중성선규격코드'].value_counts()

중성선규격코드
32.0     18885
0.0       6973
22.0      6606
58.0      4397
95.0      1299
38.0       813
35.0       467
60.0       415
100.0      126
150.0        9
Name: count, dtype: int64

In [37]:
# 학습에 필요한 컬럼 추출
training_columns = [
    '공사번호', '전원측전산화번호', '전산화번호', '결선방식코드',
    '전선종류코드1', '전선규격코드1', '전선조수1', '지지물간거리',
    '중성선종류코드', '중성선규격코드', 'X좌표-Y좌표'
]

rename_columns = {
    '공사번호': common_columns['공사번호'],
    '전원측전산화번호': common_columns['전원측전산화번호'],
    '전산화번호': common_columns['전산화번호'],
    '결선방식코드': common_columns['결선방식코드'],
    '전선종류코드1': common_columns['전선종류코드1'],
    '전선규격코드1': common_columns['전선규격코드1'],
    '전선조수1': common_columns['전선조수1'],
    '지지물간거리': common_columns['지지물간거리'],
    '중성선종류코드': common_columns['중성선종류코드'],
    '중성선규격코드': common_columns['중성선규격코드'],
    'X좌표-Y좌표': common_columns['X좌표-Y좌표'],
}

df_line = df_line[training_columns]
df_line.rename(columns=rename_columns, inplace=True)
df_line.shape

(40019, 11)

#### 데이터 전처리

##### 결측치 처리

In [38]:
# 결측값 확인
df_line.isna().sum()

cons_id             0
from_comp_id        0
comp_id             0
wiring_scheme       0
line_type_cd        0
line_spec_cd        0
line_phase_cd       0
span                0
neutral_type_cd    29
neutral_spec_cd    29
coordinate          0
dtype: int64

In [39]:
df_line.neutral_spec_cd.value_counts()

neutral_spec_cd
32.0     18885
0.0       6973
22.0      6606
58.0      4397
95.0      1299
38.0       813
35.0       467
60.0       415
100.0      126
150.0        9
Name: count, dtype: int64

In [40]:
# 0.0값이 존재하기 때문에 NaN값을 999.0으로 치환
df_line.neutral_spec_cd.fillna(999.0, inplace=True)

In [41]:
df_line.neutral_type_cd.value_counts()

neutral_type_cd
AL    24565
WO     7592
ZZ     6973
AO      470
OW      377
OC       12
C1        1
Name: count, dtype: int64

In [42]:
# NaN값을 'NaN'으로 치환
df_line.neutral_type_cd.fillna('NaN', inplace=True)

In [43]:
df_line.neutral_type_cd.value_counts()

neutral_type_cd
AL     24565
WO      7592
ZZ      6973
AO       470
OW       377
NaN       29
OC        12
C1         1
Name: count, dtype: int64

In [44]:
# 결선방식 확인
df_line.wiring_scheme.value_counts()

wiring_scheme
13    29843
43    10175
41        1
Name: count, dtype: int64

In [45]:
# 41 -> 43으로 치환
df_line.wiring_scheme = df_line.wiring_scheme.replace(41, 43)

##### 전선 전체길이 컬럼 추가

In [46]:
# 전체 전선 길이 = 선로길이(긍장, span) * 조수(전선의 갯 수, phase)
df_line['line_length'] = df_line.span * df_line.line_phase_cd

##### One-Hot Encoding

In [47]:
df_line.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40019 entries, 0 to 40018
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   cons_id          40019 non-null  object 
 1   from_comp_id     40019 non-null  object 
 2   comp_id          40019 non-null  object 
 3   wiring_scheme    40019 non-null  int64  
 4   line_type_cd     40019 non-null  object 
 5   line_spec_cd     40019 non-null  float64
 6   line_phase_cd    40019 non-null  int64  
 7   span             40019 non-null  int64  
 8   neutral_type_cd  40019 non-null  object 
 9   neutral_spec_cd  40019 non-null  float64
 10  coordinate       40019 non-null  object 
 11  line_length      40019 non-null  int64  
dtypes: float64(2), int64(4), object(6)
memory usage: 3.7+ MB


In [48]:
df_line = pd.get_dummies(
    df_line,
    columns=[
        'wiring_scheme', 'line_type_cd', 'line_spec_cd', 'line_phase_cd',
        'neutral_type_cd', 'neutral_spec_cd'
    ],
    prefix=[
        'wiring_scheme', 'line_type', 'line_spec', 'line_phase',
        'neutral_type', 'neutral_spec'
    ]
)

In [49]:
df_line.shape

(40019, 51)

In [50]:
# bool형을 int형으로 변환
df_line = df_line.applymap(lambda x: int(x) if isinstance(x, bool) else x)

##### 학습대상 레코드 추출

In [51]:
df_line = df_line[df_line.cons_id.isin(df_cons.cons_id)]
df_line.shape

(29704, 51)

In [52]:
write_data('1d-preprocessed-line', df_line, process_seq='4th')

#### 공사비 기준 전선 그룹 레코드 만들기

In [53]:
# 유일값의 공사번호들
unique_cons_ids = df_line.cons_id.unique()
# 해당 공사번호의 레코드 값
cons_id_row_values = []
# 합산할 컬럼들
columns_for_summation = ['span'] + df_line.columns.tolist()[5:]

In [54]:
# 공사비의 공사번호 갯 수와 전처리된 전선에 있는 고유 공사번호 갯 수 비교
df_cons.shape[0], len(unique_cons_ids)

# 공사비에 있는 공사번호 모두에 전주가 포함되어 있음을 알 수 있음.

(14728, 14728)

In [55]:
_start_time = datetime.now()

for cons_id in unique_cons_ids:
    df_temp = df_line[df_line.cons_id == cons_id]
    new_row = df_temp[columns_for_summation].sum().values.tolist()
    cons_id_row_values.append([cons_id] + new_row)
    
# 그룹 데이터프레임 만들기
df_group = pd.DataFrame(
    cons_id_row_values, columns=['cons_id'] + columns_for_summation
)
df_group.shape

print(f'Total Elapsed Time for Data Load: {datetime.now() - _start_time}')

Total Elapsed Time for Data Load: 0:00:31.681026


In [56]:
# 잘 더해 졌는지 임의 공사번호를 불러와 확인
_CONS_ID = '477420193349'
_df1 = df_line[columns_for_summation][df_line.cons_id==_CONS_ID]
_df2 = df_group[columns_for_summation][df_group.cons_id==_CONS_ID]
_df3 = pd.concat([_df1, _df2], ignore_index=True)
_indexs = _df3.index.tolist()
_indexs[-1] = 'group'
_df3.index = _indexs
_df3

Unnamed: 0,span,line_length,wiring_scheme_13,wiring_scheme_43,line_type_AO,line_type_C1,line_type_C2,line_type_C4,line_type_D2,line_type_D4,...,neutral_spec_22.0,neutral_spec_32.0,neutral_spec_35.0,neutral_spec_38.0,neutral_spec_58.0,neutral_spec_60.0,neutral_spec_95.0,neutral_spec_100.0,neutral_spec_150.0,neutral_spec_999.0
0,28,28,1,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,39,39,1,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,61,61,1,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,35,35,1,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
group,163,163,4,0,4,0,0,0,0,0,...,0,3,0,0,1,0,0,0,0,0


In [57]:
df_pole[df_pole.cons_id==_CONS_ID]
# 이상하다!!
# 전주가 8개 설치된 공사인데, 전선이 4개다.(나머지 4개는 지선주?)

Unnamed: 0,cons_id,comp_id,x,y,pole_shape_G,pole_shape_O,pole_shape_V,pole_type_1,pole_type_B,pole_type_C,pole_type_E,pole_type_H,pole_type_M,pole_spec_6.0,pole_spec_8.0,pole_spec_10.0,pole_spec_11.0,pole_spec_12.0,pole_spec_14.0,pole_spec_16.0
6,477420193349,7103S622,128.345893,37.146079,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0
7,477420193349,7103S723,128.346364,37.145936,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0
8,477420193349,7103S724,128.346539,37.146082,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1
9,477420193349,7103S725,128.346487,37.146335,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1
10,477420193349,7103S821,128.348403,37.146904,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0
11,477420193349,7103S921,128.348422,37.146354,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0
12,477420193349,7103Y021,128.348137,37.146136,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0
13,477420193349,7103Y121,128.348501,37.146327,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0


In [58]:
df_cons.shape, df_group.shape

((14728, 35), (14728, 48))

In [59]:
df_cons_group = pd.merge(
    df_cons, df_group,
    left_on='cons_id', right_on='cons_id', how='left'
)
df_cons_group.shape

(14728, 82)

In [60]:
write_data('1d-merge-cons-pole-line', df_cons_group, process_seq='4th')