#### 전주 관련 전처리

In [53]:
import pandas as pd

import matplotlib.pyplot as plt
from freeman.plt_setting import plt_settings
from freeman.aiddd.data_manager import read_data, write_data

# Apply the project's matplotlib settings so that Korean text is supported in charts.
plt_settings()

In [54]:
# Switch selecting which dataset variant this notebook reads and writes:
# a value of 1 uses the un-suffixed file names, any other value the "-2" ones.
# NOTE(review): presumably the minimum pole/line count used when the upstream
# dataset was built -- confirm against the notebook that produces those files.
MIN_POLE_LINE_COUNTS = 2

#### 데이터 불러오기

In [55]:
# Training dataset that already has the equipment counts added.
if MIN_POLE_LINE_COUNTS == 1:
    filename = 'bb-device-counts'
else:
    filename = 'bb-device-counts-2'
_df_cons = read_data(filename)

# Third provided dataset, to which meaningful coordinates were added.
_df_pole = read_data('POLE_DATA', data_type='provide')

In [56]:
# Work on copies so the frames loaded above are never modified.
df_cons = _df_cons.copy()
df_pole = _df_pole.copy()

In [57]:
# Display (rows, columns) of the construction-cost frame and the pole frame.
df_cons.shape, df_pole.shape

((5710, 18), (38533, 63))

#### 데이터 전처리

##### 공사비 레코드에 전주 좌표 추가

In [58]:
# Keep only the pole columns needed to attach coordinates to the cost records.
columns_for_training = [
    '공사번호',
    '전산화번호',
    'GISID',
    '전주형태코드',
    '전주종류코드',
    '전주규격코드',
    'X좌표-Y좌표',
]
df_pole = df_pole.loc[:, columns_for_training].copy()
df_pole.shape

(38533, 7)

In [59]:
# Rename the Korean column names to short English ones for easier handling.
rename_columns = {
    '공사번호': 'cons_no',
    '전산화번호': 'comp_no',
    'GISID': 'gisid',
    '전주형태코드': 'pole_shape_cd',
    '전주종류코드': 'pole_type_cd',
    '전주규격코드': 'pole_spec_cd',
    'X좌표-Y좌표': 'position',
}
df_pole = df_pole.rename(columns=rename_columns)

In [60]:
# Keep only poles whose construction number exists in the cost dataset.
# Filtering first avoids splitting coordinates for rows that are discarded
# anyway, and the explicit copy makes the column assignments below operate on
# an independent frame rather than on a slice of the unfiltered one.
df_pole = df_pole[df_pole.cons_no.isin(df_cons.cons_no)].copy()

# `position` is a comma-separated string whose first two fields are the x and
# y coordinates. Only those two fields are read, so no temporary columns have
# to be created and dropped, and the number of trailing fields does not matter.
coords = df_pole.position.str.split(',', expand=True)
df_pole['x'] = coords[0]
df_pole['y'] = coords[1]
df_pole.shape

(17555, 9)

In [61]:
# Inspect the columns: dtypes and non-null counts.
df_pole.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17555 entries, 220 to 38521
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   cons_no        17555 non-null  object 
 1   comp_no        17555 non-null  object 
 2   gisid          17555 non-null  int64  
 3   pole_shape_cd  17555 non-null  object 
 4   pole_type_cd   17469 non-null  object 
 5   pole_spec_cd   17469 non-null  float64
 6   position       17555 non-null  object 
 7   x              17555 non-null  object 
 8   y              17555 non-null  object 
dtypes: float64(1), int64(1), object(7)
memory usage: 1.3+ MB


In [62]:
# Count how many poles carry each shape code.
df_pole['pole_shape_cd'].value_counts()

pole_shape_cd
O    16995
G      474
V       86
Name: count, dtype: int64

In [63]:
# One-hot encode `pole_shape_cd`: the code column is replaced by one
# `pole_shape_<code>` indicator column per distinct code.
df_pole = pd.get_dummies(
    df_pole,
    columns=['pole_shape_cd'],
    prefix={'pole_shape_cd': 'pole_shape'},
)
df_pole.shape

(17555, 11)

In [64]:
# Check the frame after encoding the shape code (new pole_shape_* columns).
df_pole.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17555 entries, 220 to 38521
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   cons_no       17555 non-null  object 
 1   comp_no       17555 non-null  object 
 2   gisid         17555 non-null  int64  
 3   pole_type_cd  17469 non-null  object 
 4   pole_spec_cd  17469 non-null  float64
 5   position      17555 non-null  object 
 6   x             17555 non-null  object 
 7   y             17555 non-null  object 
 8   pole_shape_G  17555 non-null  bool   
 9   pole_shape_O  17555 non-null  bool   
 10  pole_shape_V  17555 non-null  bool   
dtypes: bool(3), float64(1), int64(1), object(6)
memory usage: 1.3+ MB


In [65]:
# Count how many poles carry each type code (missing values are not counted).
df_pole['pole_type_cd'].value_counts()

pole_type_cd
C    16185
H     1117
B      150
1       14
M        3
Name: count, dtype: int64

In [66]:
# One-hot encode `pole_type_cd` into `pole_type_<code>` indicator columns.
# Rows with a missing type code end up with every pole_type_* indicator unset,
# because get_dummies ignores NaN unless dummy_na is requested.
df_pole = pd.get_dummies(
    df_pole,
    columns=['pole_type_cd'],
    prefix={'pole_type_cd': 'pole_type'},
)
df_pole.shape

(17555, 15)

In [67]:
# Count how many poles carry each spec code (missing values are not counted).
df_pole['pole_spec_cd'].value_counts()

pole_spec_cd
10.0    12238
12.0     3685
16.0      810
14.0      583
6.0       150
11.0        2
8.0         1
Name: count, dtype: int64

In [68]:
# One-hot encode `pole_spec_cd` into `pole_spec_<code>` indicator columns.
# The codes are stored as floats, so the generated names keep the ".0" suffix
# (e.g. `pole_spec_10.0`).
df_pole = pd.get_dummies(
    df_pole,
    columns=['pole_spec_cd'],
    prefix={'pole_spec_cd': 'pole_spec'},
)
df_pole.shape

(17555, 21)

In [69]:
# Check the frame after all three code columns have been one-hot encoded.
df_pole.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17555 entries, 220 to 38521
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   cons_no         17555 non-null  object
 1   comp_no         17555 non-null  object
 2   gisid           17555 non-null  int64 
 3   position        17555 non-null  object
 4   x               17555 non-null  object
 5   y               17555 non-null  object
 6   pole_shape_G    17555 non-null  bool  
 7   pole_shape_O    17555 non-null  bool  
 8   pole_shape_V    17555 non-null  bool  
 9   pole_type_1     17555 non-null  bool  
 10  pole_type_B     17555 non-null  bool  
 11  pole_type_C     17555 non-null  bool  
 12  pole_type_H     17555 non-null  bool  
 13  pole_type_M     17555 non-null  bool  
 14  pole_spec_6.0   17555 non-null  bool  
 15  pole_spec_8.0   17555 non-null  bool  
 16  pole_spec_10.0  17555 non-null  bool  
 17  pole_spec_11.0  17555 non-null  bool  
 18  pole_spec

In [70]:
# Convert the one-hot indicator columns from bool to 0/1 integers and the
# coordinate columns from text to float, in a single vectorised `astype` call.
# This replaces `DataFrame.applymap`, which is deprecated since pandas 2.1 and
# invokes a Python function once per cell of the whole frame.
bool_columns = df_pole.select_dtypes(include='bool').columns
dtype_map = {column: 'int64' for column in bool_columns}
dtype_map.update({'x': float, 'y': float})
df_pole = df_pole.astype(dtype_map)

In [71]:
# Verify the dtype conversions (indicator columns as integers, x/y as floats).
df_pole.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17555 entries, 220 to 38521
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   cons_no         17555 non-null  object 
 1   comp_no         17555 non-null  object 
 2   gisid           17555 non-null  int64  
 3   position        17555 non-null  object 
 4   x               17555 non-null  float64
 5   y               17555 non-null  float64
 6   pole_shape_G    17555 non-null  int64  
 7   pole_shape_O    17555 non-null  int64  
 8   pole_shape_V    17555 non-null  int64  
 9   pole_type_1     17555 non-null  int64  
 10  pole_type_B     17555 non-null  int64  
 11  pole_type_C     17555 non-null  int64  
 12  pole_type_H     17555 non-null  int64  
 13  pole_type_M     17555 non-null  int64  
 14  pole_spec_6.0   17555 non-null  int64  
 15  pole_spec_8.0   17555 non-null  int64  
 16  pole_spec_10.0  17555 non-null  int64  
 17  pole_spec_11.0  17555 non-null  in

In [72]:
# Persist the per-pole frame; the file name depends on the dataset variant.
if MIN_POLE_LINE_COUNTS == 1:
    filename = 'bc-pole'
else:
    filename = 'bc-pole-2'
write_data(filename, df_pole)

#### 공사번호를 기준으로 한 로우로 만들기

In [73]:
# Collapse the per-pole rows into one row per construction number by summing
# every `pole_*` indicator column, i.e. counting poles of each shape/type/spec.
sum_columns = [col for col in df_pole.columns if col.startswith('pole_')]

# A single groupby replaces re-filtering the whole frame once per construction
# number. `sort=False` keeps the groups in order of first appearance, which is
# the order the previous loop over `cons_no.unique()` produced.
df_pole_group = (
    df_pole
    .groupby('cons_no', sort=False)[sum_columns]
    .sum()
    .reset_index()
)

In [74]:
# Persist the per-construction pole counts; name depends on the dataset variant.
if MIN_POLE_LINE_COUNTS == 1:
    filename = 'bc-pole-group'
else:
    filename = 'bc-pole-group-2'
write_data(filename, df_pole_group)

In [75]:
# Attach the per-construction pole counts to the cost records.
# Left join: every cost record is kept, and records without any pole row get
# NaN in the pole_* columns.
df_cons_pole_merge = df_cons.merge(df_pole_group, how='left', on='cons_no')

In [76]:
# Persist the merged cost + pole frame; name depends on the dataset variant.
if MIN_POLE_LINE_COUNTS == 1:
    filename = 'bc-cons-pole-merge'
else:
    filename = 'bc-cons-pole-merge-2'
write_data(filename, df_cons_pole_merge)