# Data Preprocessing

## Crime(보이스피싱) Data

In [2]:
# package load
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd

In [90]:
# Load the three raw CSV files (the original run was annotated as taking 7.1s).
# All of them are read with the CP949 encoding.
KP2020, KP2021, NPA2020 = (
    pd.read_csv(csv_path, encoding='cp949')
    for csv_path in ('data/KP2020.csv', 'data/KP2021.csv', 'data/NPA2020.csv')
)

### 보이스피싱 범죄 Dataset 생성
- 사건코드 : `EVT_CL_CD == 215`
- 동일사건 제외 : `SME_EVT_YN != 'Y'`

In [91]:
# Keep only voice-phishing incidents (EVT_CL_CD == 215) and exclude rows that
# are flagged as the same incident as another record (SME_EVT_YN == 'Y').
# .copy() turns each filtered result into an independent frame, so adding or
# overwriting columns on it later does not trigger SettingWithCopyWarning.
KP2020 = KP2020.loc[(KP2020.EVT_CL_CD == 215) & (KP2020.SME_EVT_YN != 'Y'), :].copy()
KP2021 = KP2021.loc[(KP2021.EVT_CL_CD == 215) & (KP2021.SME_EVT_YN != 'Y'), :].copy()
NPA2020 = NPA2020.loc[(NPA2020.EVT_CL_CD == 215) & (NPA2020.SME_EVT_YN != 'Y'), :].copy()

In [92]:
# Unify the datetime layout: NPA2020 keeps the date and the time in separate
# columns (RECV_CPLT_DT / RECV_CPLT_TM), so merge them into one RECV_CPLT_DM
# string column. The time part is left-padded with zeros to six characters
# before concatenation; both parts are cast to str first.
NPA2020['RECV_CPLT_DM'] = (
    NPA2020['RECV_CPLT_DT'].astype('str')
    + NPA2020['RECV_CPLT_TM'].astype('str').str.zfill(6)
)

# The separate date/time columns are no longer needed once merged.
NPA2020 = NPA2020.drop(columns=['RECV_CPLT_DT', 'RECV_CPLT_TM'])

In [93]:
# Parse the RECV_CPLT_DM column of every dataset into pandas datetimes,
# resolving ambiguous dates as year-first.
KP2020['RECV_CPLT_DM'] = KP2020['RECV_CPLT_DM'].pipe(pd.to_datetime, yearfirst=True)
KP2021['RECV_CPLT_DM'] = KP2021['RECV_CPLT_DM'].pipe(pd.to_datetime, yearfirst=True)
NPA2020['RECV_CPLT_DM'] = NPA2020['RECV_CPLT_DM'].pipe(pd.to_datetime, yearfirst=True)

In [95]:
# Keep only the columns that are actually used, in the same order for all
# three datasets.
cols = ['RECV_CPLT_DM', 'NPA_CL', 'RPTER_SEX', 'HPPN_X', 'HPPN_Y']
KP2020, KP2021, NPA2020 = (frame[cols] for frame in (KP2020, KP2021, NPA2020))

In [108]:
# Merge: stack the three datasets row-wise ...
df_crime = pd.concat([KP2020, KP2021, NPA2020], axis=0)
# ... drop every row that has a missing value in any column ...
df_crime = df_crime.dropna(axis=0)
# ... and order the rows by RECV_CPLT_DM with a fresh 0..n-1 index.
df_crime = df_crime.sort_values(by='RECV_CPLT_DM')
df_crime = df_crime.reset_index(drop=True)

# Convert to a GeoDataFrame: point geometries are built from the
# HPPN_X / HPPN_Y columns with the CRS declared as WGS84, after which the raw
# coordinate columns are dropped.
gdf_crime = gpd.GeoDataFrame(
    df_crime,
    geometry=gpd.points_from_xy(x=df_crime['HPPN_X'], y=df_crime['HPPN_Y']),
    crs='WGS84',
).drop(columns=['HPPN_X', 'HPPN_Y'])

In [109]:
# Write the result as a GeoPackage. The driver is named explicitly because
# older geopandas releases do not infer it from the file extension and would
# default to ESRI Shapefile.
gdf_crime.to_file('data/gdf_crime.gpkg', driver='GPKG')