#### 전주관련 전처리

In [1]:
import pandas as pd

import matplotlib.pyplot as plt
from freeman.plt_setting import plt_settings
from freeman.aiddd.data_manager import read_data, write_data

# Enable Korean text support in charts (via the project's plt_settings helper)
plt_settings()

#### 데이터 불러오기

In [2]:
# Training dataset with facility counts added
df_data = read_data('3rd pp counts-base-on-cons-1st')
# Third-delivery pole data with meaningful coordinates added
df_pole = read_data('3rd provide pole')

In [3]:
# Display the (rows, columns) shape of the cost dataset and the pole dataset
df_data.shape, df_pole.shape

((14729, 18), (38533, 63))

#### 데이터 전처리

##### 공사비 레코드에 전주 좌표 추가

In [4]:
# Keep only the pole columns needed to attach coordinates to the cost
# records: identifiers, the three pole attribute codes, and the raw
# coordinate string.
columns_for_training = [
    '공사번호',
    '전산화번호',
    'GISID',
    '전주형태코드',
    '전주종류코드',
    '전주규격코드',
    'X좌표-Y좌표',
]
# `.copy()` detaches the selection from the originally loaded frame.
df_pole = df_pole.loc[:, columns_for_training].copy()
df_pole.shape

(38533, 7)

In [5]:
# Rename the Korean column headers to short English identifiers that are
# easier to use in code.
rename_columns = {
    '공사번호': 'cons_no',
    '전산화번호': 'comp_no',
    'GISID': 'gisid',
    '전주형태코드': 'pole_shape_cd',
    '전주종류코드': 'pole_type_cd',
    '전주규격코드': 'pole_spec_cd',
    'X좌표-Y좌표': 'position',
}
df_pole = df_pole.rename(columns=rename_columns)

In [6]:
# Derive coordinate columns from `position`, a comma-separated string.
# Only the first two fields are kept (as `x` and `y`, still strings here).
# Selecting them by position avoids the throwaway `temp1`/`temp2` columns
# and no longer requires every string to split into exactly four fields.
df_pole[['x', 'y']] = (
    df_pole['position'].str.split(',', expand=True).iloc[:, :2]
)

# Keep only poles whose construction number exists in the cost dataset.
# `.copy()` makes the filtered result an independent frame rather than a
# selection of the unfiltered data.
df_pole = df_pole[df_pole['cons_no'].isin(df_data['cons_no'])].copy()
df_pole.shape

(26921, 9)

In [7]:
# Inspect the columns (non-null counts and dtypes)
df_pole.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26921 entries, 220 to 38532
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   cons_no        26921 non-null  object 
 1   comp_no        26921 non-null  object 
 2   gisid          26921 non-null  int64  
 3   pole_shape_cd  26921 non-null  object 
 4   pole_type_cd   26826 non-null  object 
 5   pole_spec_cd   26826 non-null  float64
 6   position       26921 non-null  object 
 7   x              26921 non-null  object 
 8   y              26921 non-null  object 
dtypes: float64(1), int64(1), object(7)
memory usage: 2.1+ MB


In [8]:
# Frequency of each pole shape code before one-hot encoding
df_pole['pole_shape_cd'].value_counts()

pole_shape_cd
O    26260
G      566
V       95
Name: count, dtype: int64

In [9]:
# One-hot encode `pole_shape_cd`: the source column is replaced by one
# indicator column per code, named `pole_shape_<code>`.
df_pole = pd.get_dummies(
    df_pole,
    columns=['pole_shape_cd'],
    prefix={'pole_shape_cd': 'pole_shape'},
)
df_pole.shape

(26921, 11)

In [10]:
# Inspect columns and dtypes after encoding `pole_shape_cd`
df_pole.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26921 entries, 220 to 38532
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   cons_no       26921 non-null  object 
 1   comp_no       26921 non-null  object 
 2   gisid         26921 non-null  int64  
 3   pole_type_cd  26826 non-null  object 
 4   pole_spec_cd  26826 non-null  float64
 5   position      26921 non-null  object 
 6   x             26921 non-null  object 
 7   y             26921 non-null  object 
 8   pole_shape_G  26921 non-null  bool   
 9   pole_shape_O  26921 non-null  bool   
 10  pole_shape_V  26921 non-null  bool   
dtypes: bool(3), float64(1), int64(1), object(6)
memory usage: 1.9+ MB


In [11]:
# Frequency of each pole type code before one-hot encoding
# (value_counts excludes missing values by default)
df_pole['pole_type_cd'].value_counts()

pole_type_cd
C    24968
H     1435
B      402
1       16
M        4
E        1
Name: count, dtype: int64

In [12]:
# One-hot encode `pole_type_cd` into `pole_type_<code>` indicator columns.
# NOTE: rows whose code is missing get no indicator set, because
# `get_dummies` defaults to `dummy_na=False`.
df_pole = pd.get_dummies(
    df_pole,
    columns=['pole_type_cd'],
    prefix={'pole_type_cd': 'pole_type'},
)
df_pole.shape

(26921, 16)

In [13]:
# Frequency of each pole spec code before one-hot encoding
# (value_counts excludes missing values by default)
df_pole['pole_spec_cd'].value_counts()

pole_spec_cd
10.0    19023
12.0     5591
16.0     1067
14.0      739
6.0       402
11.0        3
8.0         1
Name: count, dtype: int64

In [14]:
# One-hot encode `pole_spec_cd` into `pole_spec_<code>` indicator columns.
# The codes are stored as floats, so the generated names carry a decimal
# suffix (e.g. `pole_spec_10.0`). Rows with a missing code get no
# indicator set (`dummy_na=False` is the `get_dummies` default).
df_pole = pd.get_dummies(
    df_pole,
    columns=['pole_spec_cd'],
    prefix={'pole_spec_cd': 'pole_spec'},
)
df_pole.shape

(26921, 22)

In [15]:
# Inspect columns and dtypes after all three one-hot encodings
df_pole.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26921 entries, 220 to 38532
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   cons_no         26921 non-null  object
 1   comp_no         26921 non-null  object
 2   gisid           26921 non-null  int64 
 3   position        26921 non-null  object
 4   x               26921 non-null  object
 5   y               26921 non-null  object
 6   pole_shape_G    26921 non-null  bool  
 7   pole_shape_O    26921 non-null  bool  
 8   pole_shape_V    26921 non-null  bool  
 9   pole_type_1     26921 non-null  bool  
 10  pole_type_B     26921 non-null  bool  
 11  pole_type_C     26921 non-null  bool  
 12  pole_type_E     26921 non-null  bool  
 13  pole_type_H     26921 non-null  bool  
 14  pole_type_M     26921 non-null  bool  
 15  pole_spec_6.0   26921 non-null  bool  
 16  pole_spec_8.0   26921 non-null  bool  
 17  pole_spec_10.0  26921 non-null  bool  
 18  pole_spec

In [20]:
# Convert the one-hot indicator columns from bool to int (0/1).
# Only bool-dtype columns are cast, in one vectorized `astype` call; this
# replaces `DataFrame.applymap`, which is deprecated since pandas 2.1 and
# ran a Python lambda on every cell of every column.
# NOTE: Python bools stored inside object-dtype columns are left untouched.
df_pole = df_pole.astype(
    dict.fromkeys(df_pole.select_dtypes(include='bool').columns, 'int64')
)

# The coordinates were parsed from text, so cast them from str to float.
df_pole = df_pole.astype({'x': float, 'y': float})

In [21]:
# Confirm the dtypes after the bool -> int and str -> float conversions
df_pole.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26921 entries, 220 to 38532
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   cons_no         26921 non-null  object 
 1   comp_no         26921 non-null  object 
 2   gisid           26921 non-null  int64  
 3   position        26921 non-null  object 
 4   x               26921 non-null  float64
 5   y               26921 non-null  float64
 6   pole_shape_G    26921 non-null  int64  
 7   pole_shape_O    26921 non-null  int64  
 8   pole_shape_V    26921 non-null  int64  
 9   pole_type_1     26921 non-null  int64  
 10  pole_type_B     26921 non-null  int64  
 11  pole_type_C     26921 non-null  int64  
 12  pole_type_E     26921 non-null  int64  
 13  pole_type_H     26921 non-null  int64  
 14  pole_type_M     26921 non-null  int64  
 15  pole_spec_6.0   26921 non-null  int64  
 16  pole_spec_8.0   26921 non-null  int64  
 17  pole_spec_10.0  26921 non-null  in

In [22]:
# Hand the preprocessed pole frame to the project's write_data helper under
# this dataset name (presumably persists it; confirm in freeman.aiddd.data_manager)
write_data('3rd pp pole-base-on-cons-1st', df_pole)