#### 전선관련 전처리

In [1]:
import pandas as pd
from freeman.aiddd.data_manager import read_data, write_data

In [2]:
# Training data with facility counts added
df_data = read_data('3rd pp counts-base-on-cons-1st')
# 3rd-provision line data with meaningful coordinates added
df_line = read_data('3rd provide line')

# Display both shapes as a quick sanity check on the loaded frames.
df_data.shape, df_line.shape

((14729, 18), (40019, 77))

#### 컬럼확인

In [12]:
# Keep only the line-data columns that are needed for training.
columns_for_training = [
    '공사번호',
    '전산화번호',
    'GISID',
    '전원측전산화번호',
    '지지물간거리',
    '전선종류코드1',
    '전선규격코드1',
    '전선구분코드1',
    'X좌표-Y좌표',
]
# Work on an independent copy so later edits to df never touch df_line.
df = df_line.loc[:, columns_for_training].copy()
df.shape

(40019, 9)

In [13]:
# Rename the Korean column headers to short English identifiers that are
# easier to work with in code.
rename_columns = {
    '공사번호': 'cons_no',
    'GISID': 'gisid',
    '전산화번호': 'comp_no',
    '전원측전산화번호': 'from_comp_no',
    '지지물간거리': 'distance',
    '전선종류코드1': 'line_type_cd',
    '전선규격코드1': 'line_spec_cd',
    '전선구분코드1': 'line_class_cd',
    'X좌표-Y좌표': 'position',
}
df = df.rename(columns=rename_columns)

#### 데이터전처리

##### 좌표값

In [15]:
def _parse_position_pairs(raw):
    """Split a comma-separated coordinate string into a list of 2-element float lists.

    Tokens are paired in order (1st with 2nd, 3rd with 4th, ...); presumably
    each pair is [x, y], going by the source column name 'X좌표-Y좌표'.
    A trailing token without a partner is dropped.

    Raises:
        ValueError: if a paired token cannot be converted to float.
    """
    tokens = raw.split(',')
    # zip stops at the shorter stride, so an unpaired last token is ignored.
    return [
        [float(first), float(second)]
        for first, second in zip(tokens[0::2], tokens[1::2])
    ]


# Iterate the 'position' column directly: df.iterrows() would build a full
# Series for every row only to read this single field.
all_positions = [_parse_position_pairs(raw) for raw in df['position']]
# Number of coordinate pairs parsed for each row.
positions_length = [len(pairs) for pairs in all_positions]

In [17]:
# Attach the per-row pair count and the parsed coordinate lists as new columns.
df = df.assign(positions_len=positions_length, positions=all_positions)

In [20]:
# Inspect dtypes and non-null counts of the frame at this point.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40019 entries, 0 to 40018
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   cons_no        40019 non-null  object 
 1   comp_no        40019 non-null  object 
 2   gisid          40019 non-null  int64  
 3   from_comp_no   40019 non-null  object 
 4   distance       40019 non-null  int64  
 5   line_type_cd   40019 non-null  object 
 6   line_spec_cd   40019 non-null  float64
 7   line_class_cd  38826 non-null  object 
 8   position       40019 non-null  object 
 9   positions_len  40019 non-null  int64  
 10  positions      40019 non-null  object 
dtypes: float64(1), int64(3), object(7)
memory usage: 3.4+ MB


In [21]:
# Frequency of each line type code, to see the categories before encoding.
df['line_type_cd'].value_counts()

line_type_cd
AO    22090
OW    10024
C2     3464
C4     3095
OC      934
D2      389
EW       16
D4        5
C1        2
Name: count, dtype: int64

In [23]:
# One-hot encode the three categorical line attributes; each original code
# column is replaced by one indicator column per observed value.
# No indicator is created for missing values (dummy_na is left at its
# default), so rows with a missing code get all-False indicators for it.
df = pd.get_dummies(
    data=df,
    columns=['line_type_cd', 'line_spec_cd', 'line_class_cd'],
    prefix={
        'line_type_cd': 'line_type',
        'line_spec_cd': 'line_spec',
        'line_class_cd': 'line_class',
    },
)

In [24]:
# List the column names to check the generated indicator columns.
df.columns

Index(['cons_no', 'comp_no', 'gisid', 'from_comp_no', 'distance', 'position',
       'positions_len', 'positions', 'line_type_AO', 'line_type_C1',
       'line_type_C2', 'line_type_C4', 'line_type_D2', 'line_type_D4',
       'line_type_EW', 'line_type_OC', 'line_type_OW', 'line_spec_3.2',
       'line_spec_22.0', 'line_spec_25.0', 'line_spec_32.0', 'line_spec_35.0',
       'line_spec_38.0', 'line_spec_58.0', 'line_spec_60.0', 'line_spec_70.0',
       'line_spec_100.0', 'line_spec_120.0', 'line_spec_150.0',
       'line_class_NJ', 'line_class_NP', 'line_class_NT', 'line_class_NW'],
      dtype='object')

In [25]:
# Inspect dtypes again; the indicator columns are bool at this point.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40019 entries, 0 to 40018
Data columns (total 33 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   cons_no          40019 non-null  object
 1   comp_no          40019 non-null  object
 2   gisid            40019 non-null  int64 
 3   from_comp_no     40019 non-null  object
 4   distance         40019 non-null  int64 
 5   position         40019 non-null  object
 6   positions_len    40019 non-null  int64 
 7   positions        40019 non-null  object
 8   line_type_AO     40019 non-null  bool  
 9   line_type_C1     40019 non-null  bool  
 10  line_type_C2     40019 non-null  bool  
 11  line_type_C4     40019 non-null  bool  
 12  line_type_D2     40019 non-null  bool  
 13  line_type_D4     40019 non-null  bool  
 14  line_type_EW     40019 non-null  bool  
 15  line_type_OC     40019 non-null  bool  
 16  line_type_OW     40019 non-null  bool  
 17  line_spec_3.2    40019 non-null

In [26]:
# Convert bool columns to integer 0/1.
# Only bool-dtype columns are cast, in one vectorized astype, instead of
# running a Python lambda over every cell with DataFrame.applymap (deprecated
# since pandas 2.1). All other columns are left untouched.
bool_columns = df.select_dtypes(include='bool').columns
df = df.astype({column: 'int64' for column in bool_columns})

In [28]:
# Confirm the row/column count is unchanged by the dtype conversion.
df.shape

(40019, 33)

In [29]:
# Keep only the line rows whose construction number (cons_no) also appears
# in the training data.
df = df.loc[df['cons_no'].isin(df_data['cons_no'])]
df.shape

(29705, 33)

In [30]:
# Save the filtered line frame under this dataset name via the project's
# write_data helper (storage location/format is not visible from here).
write_data('3rd pp line-base-on-cons-1st', df)