# CCTV 입지 모델링

## 📌 목표
- 전처리한 데이터를 활용하여 학습 데이터를 생성하고, 2018 ~ 2020년 데이터로 격자별 CCTV 설치 유무 예측 모델 학습
- 예측 모델 학습 시 주간, 야간 신고 모델 따로 만들기
- 학습 모델로 2021년 격자 데이터에 예측
- proba 값 추출
- `낮 * 0.3` / `밤 * 0.7` 가중치로 최종 스코어(설치우선순위) 추출 

In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon, LineString
from tqdm import tqdm 
import folium as f
import warnings
from catboost import CatBoostClassifier
from sklearn import metrics, model_selection
warnings.filterwarnings('ignore')
tqdm.pandas()

In [2]:
# Pre-processed feature tables. The path list and the unpacking targets below
# are kept in the same order so each name receives its own file.
_csv_paths = [
    './data/buildings_status.csv',
    './data/candi_cctv_coverage_area.csv',
    './data/pop_status_total.csv',
    './data/report_class_cnt.csv',
    './data/lights_bells_status.csv',
    './data/facilities_count.csv',
]
(buildings_status,
 candi_cctv_cover_area,
 pop_status,
 report_cnt,
 lights_bells_status,
 facilities_count) = map(pd.read_csv, _csv_paths)

## 학습 데이터 생성 
전처리한 데이터 로드해서 merge

In [3]:
# Grid '마라230983' is an outlier; it is filtered out right after this merge.
# Outer-merge every feature table onto the population table, one at a time,
# letting pandas join on the columns the two frames have in common.
df = pop_status
for feature_table in (report_cnt,
                      candi_cctv_cover_area,
                      buildings_status,
                      lights_bells_status,
                      facilities_count):
    df = pd.merge(df, feature_table, how='outer')

df.head()

Unnamed: 0,gid,year,float_pop,total_pop,total_weak,total_foreign,car_accident_cnt,misdemeanour_cnt,etc_cnt,fire_accident_cnt,...,old_building_total_area,recreation_total_area,security_business_count,bell_cnt,securitylight_cnt,nur_nm,kin_nm,school_nm,child_safety_count,carpark_cnt
0,마라095999,2018.0,20.27,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,,,,,,,,,,
1,마라096990,2018.0,3.14,0.0,0.0,0.0,,,,,...,,,,,,,,,,
2,마라096991,2018.0,34.71,0.0,0.0,0.0,,,,,...,,,,,,,,,,
3,마라096992,2018.0,31.34,0.0,0.0,0.0,,,,,...,,,,,,,,,,
4,마라096993,2018.0,0.0,0.0,0.0,0.0,,,,,...,,,,,,,,,,


In [4]:
# Remove the outlier grid and renumber the rows from 0.
is_outlier = df['gid'] == '마라230983'
df = df[~is_outlier].reset_index(drop=True)

In [5]:
# Facility / building columns that are turned into running totals per grid.
cumulative_cols = [
    'medicare_total_area', 'transport_total_area', 'attraction_total_area',
    'old_building_total_area', 'security_business_count', 'bell_cnt',
    'securitylight_cnt', 'nur_nm', 'kin_nm', 'school_nm',
    'child_safety_count', 'carpark_cnt', 'recreation_total_area',
]

# Sum each column per (gid, year), then accumulate those yearly sums within
# each gid (index level 0) so every year carries the total up to that year.
temp = (
    df[['gid', 'year'] + cumulative_cols]
    .groupby(['gid', 'year'])
    .sum()
    .groupby(level=0)
    .cumsum()
    .reset_index()
)

# Swap the raw columns for their cumulative versions (joined on the shared
# gid / year columns).
df = pd.merge(df.drop(columns=cumulative_cols), temp)

In [6]:
# Candidate grid cells; df is restricted to these gids in the following cell.
candi_grids = gpd.read_file('./data/candidate_grids.geojson')
# Number of distinct candidate grid ids.
candi_grids['gid'].nunique()

1563

In [7]:
# Keep only rows for candidate grids and discard the two area columns that
# are not used as features.
candidate_gids = candi_grids['gid'].unique()
df = df[df['gid'].isin(candidate_gids)].drop(columns=['total_area', 'non_cover_area'])

In [8]:
# Cells left empty by the outer merges are treated as zero.
df = df.fillna(0)

In [9]:
df.columns

Index(['gid', 'year', 'float_pop', 'total_pop', 'total_weak', 'total_foreign',
       'car_accident_cnt', 'misdemeanour_cnt', 'etc_cnt', 'fire_accident_cnt',
       'ordinary_crime_cnt', 'top_5_crime_cnt', 'sucide_accident_cnt',
       'midnight_report_yn', 'daytime_report_yn', 'covering_cctv_cnt',
       'coverage_area', 'medicare_total_area', 'transport_total_area',
       'attraction_total_area', 'old_building_total_area',
       'security_business_count', 'bell_cnt', 'securitylight_cnt', 'nur_nm',
       'kin_nm', 'school_nm', 'child_safety_count', 'carpark_cnt',
       'recreation_total_area'],
      dtype='object')

In [10]:
# Persist the merged feature table (row index not written).
df.to_csv('./data/train.csv', index=False)

---

In [11]:
df['year'].value_counts()

2019.0    1563
2018.0    1563
2021.0    1563
2020.0    1563
2017.0    1563
Name: year, dtype: int64

In [12]:
# from xgboost import XGBClassifier

# model = XGBClassifier()
# model.fit(X_tr, y_tr)
# pred = model.predict(X_te)
# print(metrics.f1_score(pred,y_te))
# print(metrics.accuracy_score(pred,y_te))

---

In [13]:
# Discard the 2017 rows; the remaining years are 2018-2021.
not_2017 = df['year'] != 2017
df = df[not_2017]

In [14]:
# df.corr()[['cctv_yn']].sort_values(by='cctv_yn', ascending=False)

In [15]:
# 유동(float_pop)은 2020년까지만

In [16]:
# df.to_csv('./data/train.csv', index=None)

In [17]:
# Order rows by grid id and renumber the index from 0.
df = df.sort_values("gid").reset_index(drop=True)

In [18]:
# y1 = df['coverage_area']
# y2 = df['covering_cctv_cnt']

In [19]:
# def cctv_classification(x) :
#     if x == 0 :
#         return 0
#     elif x <= 3 :
#         return 1
#     else : 
#         return 2

In [20]:
# Binary target: 1 when at least one CCTV covers the grid, otherwise 0.
df['cctv_yn'] = (df['covering_cctv_cnt'] > 0).astype('int64')

# Disabled 3-class alternative: 0 CCTVs -> 0, 1-3 CCTVs -> 1, 4 or more -> 2.
# df['cctv_class'] = df['covering_cctv_cnt'].apply(cctv_classification)

---

### 로지스틱 회귀 

In [21]:
# Exploratory logistic regression of cctv_yn on the grid features (statsmodels).
import statsmodels.api as sm

# Predictors: every column except the target, the CCTV-derived columns,
# the gid / year identifiers, float_pop and daytime_report_yn.
# NOTE(review): df still holds the 2021 rows here, so they take part in this
# fit — confirm that is intended.
X = df.drop(['coverage_area', 'covering_cctv_cnt', 'gid', 'float_pop', 'year', 'daytime_report_yn','cctv_yn'], axis=1)
y = df[['cctv_yn']]

# NOTE(review): no constant column is added to X, so presumably the model is
# fitted without an intercept — confirm this is deliberate.
lr_model = sm.Logit(y, X)
# Penalised fit with regularisation strength alpha=0.1.
result = lr_model.fit_regularized(alpha=0.1 )
result.summary()

Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.6019789864034376
            Iterations: 316
            Function evaluations: 354
            Gradient evaluations: 316


Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers


0,1,2,3
Dep. Variable:,cctv_yn,No. Observations:,6252.0
Model:,Logit,Df Residuals:,6228.0
Method:,MLE,Df Model:,23.0
Date:,"Fri, 19 Aug 2022",Pseudo R-squ.:,0.1285
Time:,09:28:27,Log-Likelihood:,-3763.0
converged:,True,LL-Null:,-4318.1
Covariance Type:,nonrobust,LLR p-value:,4.913e-220

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
total_pop,0.0020,0.001,3.842,0.000,0.001,0.003
total_weak,-0.0108,0.002,-6.283,0.000,-0.014,-0.007
total_foreign,-0.0057,0.003,-1.915,0.055,-0.012,0.000
car_accident_cnt,-0.0523,0.006,-8.713,0.000,-0.064,-0.041
misdemeanour_cnt,0.0172,0.005,3.251,0.001,0.007,0.028
etc_cnt,0.0019,0.001,2.042,0.041,7.47e-05,0.004
fire_accident_cnt,0.0967,0.050,1.946,0.052,-0.001,0.194
ordinary_crime_cnt,-0.0046,0.004,-1.171,0.242,-0.012,0.003
top_5_crime_cnt,-0.0284,0.008,-3.470,0.001,-0.044,-0.012


---

In [22]:
import numpy as np
# Exponentiate the fitted logit coefficients so each one reads as an odds
# ratio (multiplicative change in the odds per unit increase of the feature).
np.exp(result.params)

total_pop                  1.002017
total_weak                 0.989266
total_foreign              0.994297
car_accident_cnt           0.949081
misdemeanour_cnt           1.017331
etc_cnt                    1.001859
fire_accident_cnt          1.101556
ordinary_crime_cnt         0.995418
top_5_crime_cnt            0.972040
sucide_accident_cnt        0.981202
midnight_report_yn         1.003179
medicare_total_area        0.999791
transport_total_area       1.000442
attraction_total_area      0.999678
old_building_total_area    1.000468
security_business_count    0.956986
bell_cnt                   1.238254
securitylight_cnt          1.237999
nur_nm                     0.897524
kin_nm                     7.200753
school_nm                  1.056813
child_safety_count         0.175725
carpark_cnt                2.921743
recreation_total_area      0.998376
dtype: float64

## 심야시간 모델 학습

In [23]:
# Split features / target for the midnight model using every year except 2021.
# daytime_report_yn is excluded, so midnight_report_yn is the report flag used.
midnight_excluded_cols = ['coverage_area', 'covering_cctv_cnt', 'gid', 'float_pop',
                          'year', 'daytime_report_yn', 'cctv_yn']
train_rows = df[df['year'] != 2021]
X = train_rows.drop(columns=midnight_excluded_cols)
y = train_rows['cctv_yn']

In [24]:
# Hold out 20% of the rows for evaluation; the seed is fixed for repeatability.
# NOTE(review): the split is row-wise, so the same gid can appear in both parts
# through different years — confirm this is acceptable for the reported scores.
split_parts = model_selection.train_test_split(X, y, test_size=0.2, random_state=2022)
X_tr, X_te, y_tr, y_te = split_parts

In [25]:
from sklearn import *
from catboost import CatBoostClassifier,CatBoostRegressor

In [26]:
# Fit a CatBoost classifier (silent logging, fixed seed) on the training part
# and predict labels for the held-out part.
catboost_params = {"logging_level": "Silent", "random_state": 2022}
model = CatBoostClassifier(**catboost_params)
model.fit(X_tr, y_tr)
pred = model.predict(X_te)

In [27]:
# Hold-out F1 and accuracy of the midnight model.
# scikit-learn metric functions take the ground truth first: (y_true, y_pred).
print(metrics.f1_score(y_te, pred))
print(metrics.accuracy_score(y_te, pred))

0.7674897119341564
0.7590618336886994


## 심야시간 Model 예측

In [28]:
# Rows of the year to predict (2021), ordered by gid with a fresh 0..n-1 index.
target = (
    df[df['year'] == 2021]
    .sort_values("gid")
    .reset_index(drop=True)
)

In [29]:
# Remember the grid ids, then strip the same columns that were removed from the
# midnight training features before asking the model for class probabilities.
target_gid = target['gid']
midnight_non_features = ['gid', 'year', 'coverage_area', 'covering_cctv_cnt',
                         'float_pop', 'daytime_report_yn', 'cctv_yn']
target = target.drop(columns=midnight_non_features)

pred_proba = model.predict_proba(target)

In [30]:
# Attach each candidate gid to its predicted probabilities (columns 0 and 1);
# both pieces share the same 0..n-1 index, so they line up row by row.
proba_df = pd.DataFrame(pred_proba)
final_df_midnight = pd.concat([target_gid.to_frame(), proba_df], axis=1)

In [31]:
final_df_midnight

Unnamed: 0,gid,0,1
0,마라122929,0.349213,0.650787
1,마라135954,0.712164,0.287836
2,마라141954,0.288245,0.711755
3,마라143979,0.713986,0.286014
4,마라145881,0.736905,0.263095
...,...,...,...
1558,마마226043,0.556897,0.443103
1559,마마266018,0.785853,0.214147
1560,마마295001,0.499160,0.500840
1561,마마300023,0.093332,0.906668


## 주간 Model 학습

In [32]:
# Split features / target for the daytime model using every year except 2021.
# midnight_report_yn is excluded, so daytime_report_yn is the report flag used.
daytime_excluded_cols = ['coverage_area', 'covering_cctv_cnt', 'gid', 'float_pop',
                         'year', 'midnight_report_yn', 'cctv_yn']
train_rows = df[df['year'] != 2021]
X = train_rows.drop(columns=daytime_excluded_cols)
y = train_rows['cctv_yn']

In [33]:
from sklearn import *
from catboost import CatBoostClassifier,CatBoostRegressor

In [34]:
# Hold out 20% of the rows for evaluation; the seed is fixed for repeatability.
# NOTE(review): the split is row-wise, so the same gid can appear in both parts
# through different years — confirm this is acceptable for the reported scores.
split_parts = model_selection.train_test_split(X, y, test_size=0.2, random_state=2022)
X_tr, X_te, y_tr, y_te = split_parts

In [35]:
# Fit a CatBoost classifier (silent logging, fixed seed) on the training part
# and predict labels for the held-out part. This rebinds `model`, replacing the
# midnight model.
catboost_params = {"logging_level": "Silent", "random_state": 2022}
model = CatBoostClassifier(**catboost_params)
model.fit(X_tr, y_tr)
pred = model.predict(X_te)

In [36]:
# Hold-out F1 and accuracy of the daytime model.
# scikit-learn metric functions take the ground truth first: (y_true, y_pred).
print(metrics.f1_score(y_te, pred))
print(metrics.accuracy_score(y_te, pred))

0.7689161554192228
0.7590618336886994


## 주간 Model 예측

In [37]:
# Rows of the year to predict (2021), ordered by gid with a fresh 0..n-1 index.
target = (
    df[df['year'] == 2021]
    .sort_values("gid")
    .reset_index(drop=True)
)

In [38]:
# Remember the grid ids, then strip the same columns that were removed from the
# daytime training features before asking the model for class probabilities.
target_gid = target['gid']
daytime_non_features = ['gid', 'year', 'coverage_area', 'covering_cctv_cnt',
                        'float_pop', 'midnight_report_yn', 'cctv_yn']
target = target.drop(columns=daytime_non_features)

pred_proba = model.predict_proba(target)

In [39]:
# Attach each candidate gid to its predicted probabilities (columns 0 and 1);
# both pieces share the same 0..n-1 index, so they line up row by row.
proba_df = pd.DataFrame(pred_proba)
final_df_daytime = pd.concat([target_gid.to_frame(), proba_df], axis=1)
final_df_daytime

Unnamed: 0,gid,0,1
0,마라122929,0.249521,0.750479
1,마라135954,0.685026,0.314974
2,마라141954,0.336042,0.663958
3,마라143979,0.619935,0.380065
4,마라145881,0.775450,0.224550
...,...,...,...
1558,마마226043,0.562149,0.437851
1559,마마266018,0.801433,0.198567
1560,마마295001,0.460167,0.539833
1561,마마300023,0.096778,0.903222


## 종합 및 가중치 부여

In [40]:
# Combine the two models' probabilities into one priority score:
#   final_score = midnight_score * 0.7 + daytime_score * 0.3
# Column 1 of each probability frame is used as that model's score
# (presumably P(cctv_yn == 1) — verify against model.classes_).
MIDNIGHT_WEIGHT = 0.7
DAYTIME_WEIGHT = 0.3

final_df_midnight = final_df_midnight.rename(columns={1: 'midnight_score'})
final_df_daytime = final_df_daytime.rename(columns={1: 'daytime_score'})

daytime_part = final_df_daytime[['gid', 'daytime_score']]
midnight_part = final_df_midnight[['gid', 'midnight_score']]
result = daytime_part.merge(midnight_part)  # joined on the shared gid column
result['final_score'] = (result['midnight_score'] * MIDNIGHT_WEIGHT) + (result['daytime_score'] * DAYTIME_WEIGHT)
result.head()

Unnamed: 0,gid,daytime_score,midnight_score,final_score
0,마라122929,0.750479,0.650787,0.680695
1,마라135954,0.314974,0.287836,0.295977
2,마라141954,0.663958,0.711755,0.697416
3,마라143979,0.380065,0.286014,0.314229
4,마라145881,0.22455,0.263095,0.251532


#### 기설치 cctv 입지확인


In [41]:
def geo_transform(DataFrame) :
    """Build a point GeoDataFrame from a table with ``lon`` / ``lat`` columns.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Table containing ``lon`` and ``lat`` columns whose values can be cast
        to float. The caller's frame is left untouched; the work is done on a
        copy.

    Returns
    -------
    geopandas.GeoDataFrame
        Copy of the input with ``lat`` / ``lon`` cast to float and a
        ``geometry`` column of points built as (lon, lat), tagged EPSG:4326.

    Raises
    ------
    KeyError
        If ``lon`` or ``lat`` is missing.
    ValueError
        If a coordinate value cannot be converted to float.
    """
    frame = DataFrame.copy()
    frame['lat'] = frame['lat'].astype(float)
    frame['lon'] = frame['lon'].astype(float)

    # Vectorised point construction; x = lon, y = lat.
    points = gpd.points_from_xy(frame['lon'], frame['lat'])

    # Declare the CRS with the authority string rather than the deprecated
    # {'init': ...} form. The coordinates are only labelled as EPSG:4326, so
    # no reprojection step is needed.
    return gpd.GeoDataFrame(frame, geometry=points, crs='EPSG:4326')

In [42]:
# Installed-CCTV table converted to point geometries, plus the grid polygons.
cctv_status = geo_transform(pd.read_csv('./data/1.김해시_CCTV설치현황.csv'))

grid = gpd.read_file('./data/5.김해시_격자(100X100).geojson')

In [43]:
# Collapse multiple cameras installed at the same spot: round both coordinates
# to 5 decimals and keep the first row for each rounded (lon, lat) pair.
for axis in ('lon', 'lat'):
    cctv_status[f'round_{axis}'] = cctv_status[axis].apply(lambda v: round(v, 5))

cctv_status = (
    cctv_status
    .drop_duplicates(subset=['round_lon', 'round_lat'])
    .reset_index(drop=True)
)

In [44]:
# 50 m radius around every CCTV: reproject to EPSG:5179 so buffer(50) is
# expressed in that CRS's units, then convert the circles back to the CRS the
# frame already uses. CRS are given as authority strings instead of the
# deprecated {'init': ...} dictionaries.
source_crs = cctv_status.crs
cctv_status['geometry'] = cctv_status.to_crs('EPSG:5179').buffer(50).to_crs(source_crs)

# Install year, parsed once per row from characters 1-4 of install_ym
# (assumes the value has one leading character before a 4-digit year — TODO
# confirm format). Anything before 2018 is grouped under 2017.
install_year = cctv_status['install_ym'].apply(lambda x: int(x[1:5]))
cctv_status['year'] = install_year.clip(lower=2017)

# Map the CCTV coverage circles onto the grid cells they intersect.
cctv_overlay = gpd.overlay(cctv_status, grid, how='intersection')
# cctv_overlay.head()

In [45]:
# Grids intersecting the coverage circle of a CCTV whose install year is 2021.
# NOTE(review): only year == 2021 is matched, so grids covered solely by CCTVs
# installed in earlier years are not excluded here — confirm that is intended.
installed_in_2021 = cctv_overlay['year'] == 2021
installed_cctv_gid_list = cctv_overlay.loc[installed_in_2021, 'gid'].unique()
print(len(installed_cctv_gid_list))

# Remove those grids from the scored candidates.
result = result[~result['gid'].isin(installed_cctv_gid_list)]

413


In [46]:
# Keep the 100 highest final scores (the original note says a top 50 is
# filtered later, in the expected-effect analysis).
TOP_N = 100
ranked = result.sort_values(by='final_score', ascending=False)
result = ranked.head(TOP_N).reset_index(drop=True)

In [47]:
# Write the ranked candidates to disk (row index not written).
result.to_csv("./data/final_score.csv", index=False)

In [48]:
result

Unnamed: 0,gid,daytime_score,midnight_score,final_score
0,마마118014,0.977725,0.982696,0.981205
1,마라254937,0.981995,0.977600,0.978919
2,마라232927,0.966124,0.975427,0.972636
3,마라257943,0.969133,0.972981,0.971827
4,마라235940,0.974831,0.970411,0.971737
...,...,...,...,...
95,마라254932,0.896257,0.898342,0.897716
96,마라272938,0.899588,0.894947,0.896339
97,마라273946,0.910378,0.889000,0.895414
98,마라255938,0.873381,0.904706,0.895309
