#### 전선관련 전처리

In [39]:
import ast
import pandas as pd
from freeman.aiddd.data_manager import read_data, write_data

In [40]:
# Dataset-variant switch: when this is 1 the cells below use the base file
# names, otherwise they use the names with a '-2' suffix.
# NOTE(review): presumably the minimum pole/line count per construction --
# confirm against the upstream preprocessing notebook.
MIN_POLE_LINE_COUNTS = 2

In [41]:
# Preprocessed data (per the original note: includes line information).
if MIN_POLE_LINE_COUNTS == 1:
    filename = 'bc-cons-pole-merge'
else:
    filename = 'bc-cons-pole-merge-2'
_df_data = read_data(filename)

# Preprocessed pole information.
if MIN_POLE_LINE_COUNTS == 1:
    filename = 'bc-pole'
else:
    filename = 'bc-pole-2'
_df_pole = read_data(filename)

# Third batch of line information, as provided.
_df_line = read_data('LINE_DATA', 'provide')

# Display the shapes of the three loaded frames.
(_df_data.shape, _df_pole.shape, _df_line.shape)

((5710, 33), (17555, 21), (40019, 77))

In [42]:
# Work on copies so the frames loaded above stay untouched.
df_data = _df_data.copy()
df_pole = _df_pole.copy()
df_line = _df_line.copy()

#### 컬럼확인

In [43]:
# Columns of the raw line data that are needed for training.
columns_for_training = [
    '공사번호',
    '전산화번호',
    'GISID',
    '전원측전산화번호',
    '지지물간거리',
    '전선종류코드1',
    '전선규격코드1',
    '전선구분코드1',
    'X좌표-Y좌표',
]
# Take an independent copy restricted to those columns.
df = df_line.loc[:, columns_for_training].copy()
df.shape

(40019, 9)

In [44]:
# Rename the Korean column headers to English names that are easier to use.
rename_columns = {
    '공사번호': 'cons_no',
    '전산화번호': 'comp_no',
    'GISID': 'gisid',
    '전원측전산화번호': 'from_comp_no',
    '지지물간거리': 'distance',
    '전선종류코드1': 'line_type_cd',
    '전선규격코드1': 'line_spec_cd',
    '전선구분코드1': 'line_class_cd',
    'X좌표-Y좌표': 'position',
}
df = df.rename(columns=rename_columns)

#### 데이터전처리

##### 좌표값

In [45]:
# all_positions = []
# positions_length = []
# for _, row in df.iterrows():
#     current_positions_soc = row['position'].split(',')
#     p_len = len(current_positions_soc) // 2
#     positions_length.append(p_len)
#     current_positions = [
#         [float(current_positions_soc[i]), float(current_positions_soc[i+1])]
#         for i in range(0, p_len*2, 2)
#     ]
#     all_positions.append(current_positions)

In [46]:
# df['positions_len'] = positions_length
# df['positions'] = all_positions

In [47]:
# Print dtypes and non-null counts of the renamed columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40019 entries, 0 to 40018
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   cons_no        40019 non-null  object 
 1   comp_no        40019 non-null  object 
 2   gisid          40019 non-null  int64  
 3   from_comp_no   40019 non-null  object 
 4   distance       40019 non-null  int64  
 5   line_type_cd   40019 non-null  object 
 6   line_spec_cd   40019 non-null  float64
 7   line_class_cd  38826 non-null  object 
 8   position       40019 non-null  object 
dtypes: float64(1), int64(2), object(6)
memory usage: 2.7+ MB


In [48]:
# One-hot encode the three line code columns; each source column maps to the
# prefix used for its generated indicator columns.
dummy_prefix_by_column = {
    'line_type_cd': 'line_type',
    'line_spec_cd': 'line_spec',
    'line_class_cd': 'line_class',
}
df = pd.get_dummies(
    df,
    columns=list(dummy_prefix_by_column.keys()),
    prefix=list(dummy_prefix_by_column.values()),
)

In [49]:
# List the column names after one-hot encoding.
df.columns

Index(['cons_no', 'comp_no', 'gisid', 'from_comp_no', 'distance', 'position',
       'line_type_AO', 'line_type_C1', 'line_type_C2', 'line_type_C4',
       'line_type_D2', 'line_type_D4', 'line_type_EW', 'line_type_OC',
       'line_type_OW', 'line_spec_3.2', 'line_spec_22.0', 'line_spec_25.0',
       'line_spec_32.0', 'line_spec_35.0', 'line_spec_38.0', 'line_spec_58.0',
       'line_spec_60.0', 'line_spec_70.0', 'line_spec_100.0',
       'line_spec_120.0', 'line_spec_150.0', 'line_class_NJ', 'line_class_NP',
       'line_class_NT', 'line_class_NW'],
      dtype='object')

In [50]:
# Print dtypes and non-null counts after one-hot encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40019 entries, 0 to 40018
Data columns (total 31 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   cons_no          40019 non-null  object
 1   comp_no          40019 non-null  object
 2   gisid            40019 non-null  int64 
 3   from_comp_no     40019 non-null  object
 4   distance         40019 non-null  int64 
 5   position         40019 non-null  object
 6   line_type_AO     40019 non-null  bool  
 7   line_type_C1     40019 non-null  bool  
 8   line_type_C2     40019 non-null  bool  
 9   line_type_C4     40019 non-null  bool  
 10  line_type_D2     40019 non-null  bool  
 11  line_type_D4     40019 non-null  bool  
 12  line_type_EW     40019 non-null  bool  
 13  line_type_OC     40019 non-null  bool  
 14  line_type_OW     40019 non-null  bool  
 15  line_spec_3.2    40019 non-null  bool  
 16  line_spec_22.0   40019 non-null  bool  
 17  line_spec_25.0   40019 non-null

In [51]:
# Convert the boolean indicator columns to integers.
# This is done per column by dtype instead of with DataFrame.applymap, which
# is deprecated in recent pandas and calls a Python function on every cell.
# Only bool-dtype columns are touched; all other columns keep their dtype.
bool_columns = df.select_dtypes(include='bool').columns
df = df.astype({col: 'int64' for col in bool_columns})

In [52]:
# Show the frame size before filtering by construction number.
df.shape

(40019, 31)

In [53]:
# Keep only the lines whose construction number also appears in df_data.
in_target_cons = df['cons_no'].isin(df_data['cons_no'])
df = df[in_target_cons]
df.shape

(18566, 31)

In [54]:
# Persist the filtered, encoded line data under the variant-specific name.
if MIN_POLE_LINE_COUNTS == 1:
    filename = 'bd-line'
else:
    filename = 'bd-line-2'
write_data(filename, df)

#### 공사번호를 기준으로 로우 만들기

In [55]:
# Indicator columns to be summed per construction (every 'line_' column).
sum_columns = [name for name in df.columns if name.startswith('line_')]
# Distinct construction numbers, in order of first appearance.
unique_cons_no_list = df['cons_no'].unique()
# Accumulator for the per-construction rows.
cons_no_rows = []

In [56]:
# Sum every 'line_' indicator column per construction number.
# A single grouped aggregation replaces re-filtering the whole frame once per
# construction number. sort=False keeps the groups in first-appearance order,
# which is the order df.cons_no.unique() yields, so the row order matches the
# previous loop. reset_index() turns the group key back into the leading
# 'cons_no' column.
df_line_group = (
    df.groupby('cons_no', sort=False)[sum_columns]
    .sum()
    .reset_index()
)
# Keep the plain row list populated as well, in case a later cell reads it.
cons_no_rows = df_line_group.values.tolist()

In [57]:
# Persist the per-construction line totals under the variant-specific name.
if MIN_POLE_LINE_COUNTS == 1:
    filename = 'bd-line-group'
else:
    filename = 'bd-line-group-2'
write_data(filename, df_line_group)

In [58]:
# Attach the per-construction line totals to the construction/pole data.
# A left join keeps every row of df_data.
df_cons_pole_line_merge = df_data.merge(
    df_line_group, how='left', on='cons_no'
)

In [59]:
# Persist the merged construction/pole/line data under the variant name.
if MIN_POLE_LINE_COUNTS == 1:
    filename = 'bd-cons-pole-line-merge'
else:
    filename = 'bd-cons-pole-line-merge-2'
write_data(filename, df_cons_pole_line_merge)

In [60]:
# Build a lookup {comp_no: [x, y]} from the pole table.
# The mapping is zipped directly from the columns rather than transposing the
# frame, which would create one column per pole before converting to a dict.
# If a comp_no occurs more than once, the last occurrence wins.
pole_dict = dict(
    zip(df_pole['comp_no'], df_pole[['x', 'y']].values.tolist())
)

In [61]:
# For every construction number, order its line segments into a single path
# from the pole that only appears on the source side to the end of the chain.
# One row per construction: [cons_no, start_comp_no, start_xy,
#                            distance, comp_no, xy, distance, comp_no, xy, ...]
cons_no_pole_path = []
# One row per construction: [cons_no, total distance along the path].
cons_no_distance_sum = []
# Construction numbers that were skipped (see the two except branches).
exception_cons_no_list = []

for cons_no in unique_cons_no_list:
    df_temp = df[df.cons_no == cons_no]
    
    try:
        comp_no_values = df_temp.comp_no.tolist()
        from_comp_no_values = df_temp.from_comp_no.tolist()
        distance_values = df_temp.distance.tolist()
        # First pole that is a segment target but never a source. The value
        # itself is not used below; an empty result raises IndexError and
        # makes this construction get skipped.
        only_comp_no_value = [
            item for item in comp_no_values if item not in from_comp_no_values
        ][0]
        # First pole that is a segment source but never a target: the start
        # of the path.
        only_from_comp_no_value = [
            item for item in from_comp_no_values if item not in comp_no_values
        ][0]
    except IndexError:
        # Original note: the construction has only one pole, so no pole number
        # exists on one side only. In code terms: one of the two lists above
        # was empty.
        exception_cons_no_list.append(cons_no)
        continue
    
    # Coordinates fall back to [0, 0] when the pole is missing from pole_dict.
    pole_path = [only_from_comp_no_value, pole_dict.get(only_from_comp_no_value, [0,0])]
    # The walk looks up each pole's position in the source-side list, so it is
    # seeded with the source-only pole found above.
    next_value = only_from_comp_no_value
    loop_size = len(from_comp_no_values)
    distance_sum = 0
    try:
        # Follow exactly one segment per row of this construction.
        for _ in range(loop_size):
            # list.index returns the first match and raises ValueError when
            # the current pole is not the source of any segment.
            next_index = from_comp_no_values.index(next_value)
            next_distance = distance_values[next_index]
            distance_sum += next_distance
            next_value = comp_no_values[next_index]
            pole_path.append(next_distance)
            pole_path.append(next_value)
            pole_path.append(pole_dict.get(next_value, [0,0]))
        # Only recorded when every segment was traversed without error.
        cons_no_pole_path.append([cons_no] + pole_path)
        cons_no_distance_sum.append([cons_no] + [distance_sum])
    except ValueError:
        # The poles of this construction are not all connected in one line
        # (the chain is broken), so the construction is skipped.
        exception_cons_no_list.append(cons_no)
        continue

In [62]:
# Build the column names for the pole-path table:
# 'cons_no', then for each pole its comp_no and xy, followed by the distance
# to the next pole (stored under the next pole's number).
# The last pole has no following distance, so it contributes only comp_no and
# xy; its number is derived from MAX_LINE_COUNTS rather than hard-coded.
column_names = ['cons_no']
MAX_LINE_COUNTS = 12
for index in range(1, MAX_LINE_COUNTS):
    column_names.extend([
        f'pole{index}_comp_no',
        f'pole{index}_xy',
        f'pole{index + 1}_distance',
    ])
column_names.extend([
    f'pole{MAX_LINE_COUNTS}_comp_no',
    f'pole{MAX_LINE_COUNTS}_xy',
])

In [63]:
# Total path distance per construction.
df_total_distance = pd.DataFrame(
    data=cons_no_distance_sum,
    columns=['cons_no', 'distance_sum'],
)
# Ordered pole path per construction, one pole/distance slot per column.
df_pole_path = pd.DataFrame(data=cons_no_pole_path, columns=column_names)

In [64]:
# Preview the first rows of the pole-path table.
df_pole_path.head()

Unnamed: 0,cons_no,pole1_comp_no,pole1_xy,pole2_distance,pole2_comp_no,pole2_xy,pole3_distance,pole3_comp_no,pole3_xy,pole4_distance,...,pole9_xy,pole10_distance,pole10_comp_no,pole10_xy,pole11_distance,pole11_comp_no,pole11_xy,pole12_distance,pole12_comp_no,pole12_xy
0,477420193243,7385D851,"[0, 0]",50,7385D742,"[128.393826881391, 36.8320877291799]",50,7385D731,"[128.393672553828, 36.8316526370489]",43.0,...,,,,,,,,,,
1,477420193827,7696C351,"[0, 0]",57,7696C262,"[128.451141424479, 37.0311195266744]",53,7696C162,"[128.450624308738, 37.0313530823716]",37.0,...,,,,,,,,,,
2,477420203272,6795A841,"[0, 0]",42,6795A842,"[128.252547411503, 37.0160534603552]",29,6795A851,"[128.252650514568, 37.0163067389827]",,...,,,,,,,,,,
3,477420203601,6994A441,"[0, 0]",57,6994A361,"[0, 0]",63,6994A371,"[0, 0]",57.0,...,,,,,,,,,,
4,477420203712,6998H531,"[0, 0]",31,6998H541,"[0, 0]",30,6998H551,"[128.312198570629, 37.0660126357728]",38.0,...,,,,,,,,,,


In [65]:
# Join the pole paths onto the merged data. A right join keeps exactly the
# constructions present in df_pole_path.
df_cons_pole_line_path_merge = df_cons_pole_line_merge.merge(
    df_pole_path, how='right', on='cons_no'
)

In [66]:
# Add the total path distance of each construction.
df_cons_pole_line_path_merge = df_cons_pole_line_path_merge.merge(
    df_total_distance, how='left', on='cons_no'
)

In [67]:
# Columns named 'pole<digit>...', i.e. the per-pole path columns; names such
# as 'pole_cnts' are excluded because their fifth character is not a digit.
# The one-character slice col[4:5] is empty (and therefore not a digit) for a
# column named exactly 'pole', where col[4] would raise IndexError.
filtered_pole_num_columns = [
    col for col in df_cons_pole_line_path_merge.columns
    if col.startswith('pole') and col[4:5].isdigit()
]

In [68]:
# Copy of the count columns together with all per-pole path columns.
selected_columns = ['line_cnts', 'pole_cnts', *filtered_pole_num_columns]
df_temp = df_cons_pole_line_path_merge[selected_columns].copy()

In [69]:
# Count missing values per column, largest first.
df_cons_pole_line_path_merge.isna().sum().sort_values(ascending=False)

pole12_xy          4416
pole12_comp_no     4416
pole12_distance    4416
pole11_xy          4411
pole11_comp_no     4411
                   ... 
pole_spec_11.0        0
pole_spec_10.0        0
pole_spec_8.0         0
pole_spec_6.0         0
distance_sum          0
Length: 94, dtype: int64

In [70]:
# Replace every missing value with 0.0.
df_cons_pole_line_path_merge = df_cons_pole_line_path_merge.fillna(0.0)

In [71]:
def change_position(data):
    """Return the ``(x, y)`` pair stored in a pole position value.

    Args:
        data: Either the textual form of a position (for example
            ``'[128.39, 36.83]'`` or ``'0.0'``) or an already parsed value
            (a two-element sequence, or the ``0.0`` placeholder).

    Returns:
        ``(x, y)`` taken from the first two elements of the position, or
        ``(0, 0)`` when the value is the ``[0, 0]`` / ``0.0`` placeholder.

    Raises:
        ValueError, SyntaxError: If a string cannot be parsed as a literal.
        TypeError: If the value is a non-placeholder scalar.
    """
    # Only strings need parsing; ast.literal_eval rejects lists and floats,
    # which is what the in-memory frame holds before it is written to disk.
    position = ast.literal_eval(data) if isinstance(data, str) else data
    if position in ([0, 0], 0.0):
        return 0, 0
    return position[0], position[1]

In [72]:
# Split each 'pole<N>_xy' column, which holds [x, y], into separate
# 'pole<N>_x_pos' and 'pole<N>_y_pos' columns.
# (An earlier attempt that assigned both columns at once from
# change_position is disabled.)
MAX_POLE_COUNTS = 12


def _pick_coordinate(xy, axis):
    """Return xy[axis], or 0 when xy is the [0, 0] / 0.0 placeholder."""
    if xy in [[0, 0], 0.0]:
        return 0
    return xy[axis]


for idx in range(1, MAX_POLE_COUNTS+1):
    source_column = f'pole{idx}_xy'
    target_x_column = f'pole{idx}_x_pos'
    target_y_column = f'pole{idx}_y_pos'
    xy_values = df_cons_pole_line_path_merge[source_column]
    df_cons_pole_line_path_merge[target_x_column] = xy_values.apply(
        lambda xy: _pick_coordinate(xy, 0)
    )
    df_cons_pole_line_path_merge[target_y_column] = xy_values.apply(
        lambda xy: _pick_coordinate(xy, 1)
    )

In [73]:
# Preview the xy columns of the first two poles next to their split x/y columns.
df_cons_pole_line_path_merge[['pole1_xy', 'pole1_x_pos', 'pole1_y_pos', 'pole2_xy', 'pole2_x_pos', 'pole2_y_pos']].head()

Unnamed: 0,pole1_xy,pole1_x_pos,pole1_y_pos,pole2_xy,pole2_x_pos,pole2_y_pos
0,"[0, 0]",0.0,0.0,"[128.393826881391, 36.8320877291799]",128.393827,36.832088
1,"[0, 0]",0.0,0.0,"[128.451141424479, 37.0311195266744]",128.451141,37.03112
2,"[0, 0]",0.0,0.0,"[128.252547411503, 37.0160534603552]",128.252547,37.016053
3,"[0, 0]",0.0,0.0,"[0, 0]",0.0,0.0
4,"[0, 0]",0.0,0.0,"[0, 0]",0.0,0.0


In [74]:
# Persist the fully merged dataset under the variant-specific name.
if MIN_POLE_LINE_COUNTS == 1:
    filename = 'bd-all-merge'
else:
    filename = 'bd-all-merge-2'
write_data(filename, df_cons_pole_line_path_merge)