# Preparation

In [1]:
from itertools import combinations
from math import sin, cos, sqrt, atan2, radians

import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

import matplotlib.pyplot as plt
from scipy.cluster.vq import kmeans2, whiten

from db.mongo import MyMongo

In [2]:
# Read the convenience-store records into a DataFrame; the connection is
# opened and closed by the context manager.
# NOTE(review): the two 'cvs' arguments are presumably (database, collection) — confirm in MyMongo.
with MyMongo() as db:
    cvs = db.get_df_from_table('cvs', 'cvs')

<--Mongo Connected.
Mongo Connection Closed.-->


# Function

In [3]:
def get_distance_from_coords(lat1, lon1, lat2, lon2):
    """Return the great-circle (haversine) distance between two points, in metres.

    Args:
        lat1, lon1: latitude and longitude of the first point, in decimal degrees.
        lat2, lon2: latitude and longitude of the second point, in decimal degrees.

    Returns:
        float: distance in metres on a sphere of radius 6371 km.
    """
    # approximate radius of earth in km
    R = 6371.0

    phi1 = radians(lat1)
    phi2 = radians(lat2)
    dlat = phi2 - phi1
    dlon = radians(lon2) - radians(lon1)

    a = sin(dlat / 2)**2 + cos(phi1) * cos(phi2) * sin(dlon / 2)**2
    # Rounding can push `a` a hair outside [0, 1] for (near-)antipodal or
    # identical points, which would make sqrt() raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    # R is in km; callers expect metres.
    return R * c * 1000

In [4]:
# Mask of rows without a latitude; later cells negate it to keep only
# stores that have coordinates.
idx_lat_empty = cvs['lat'].isnull()
cvs[~idx_lat_empty].iloc[:1]

Unnamed: 0,_id,lat,lng,개방서비스ID,개방서비스명,개방자치단체코드,관리번호,데이터갱신구분,데이터갱신일자,도로명우편번호,...,인허가일자,인허가취소일자,재개업일자,좌표정보(X),좌표정보(Y),지정일자,최종수정시점,폐업일자,휴업시작일자,휴업종료일자
0,5c08bec59085ca65eac3dbc9,37.57559556952654,126.98582687874092,11_43_02_P,담배소매업,3000000,2006300010105600037,I,2018-08-31 23:59:59.0,,...,20060921,,,198658.943470079,452604.254902527,,20080221000000,,,


In [5]:
# Column inventory, then the number of rows that have coordinates.
print(cvs.columns)
cvs[~idx_lat_empty].shape[0]

Index(['_id', 'lat', 'lng', '개방서비스ID', '개방서비스명', '개방자치단체코드', '관리번호', '데이터갱신구분',
       '데이터갱신일자', '도로명우편번호', '도로명전체주소', '민원종류명', '번호', '사업장명', '상세영업상태명',
       '상세영업상태코드', '소재지면적', '소재지우편번호', '소재지전체주소', '소재지전화', '업태구분명', '영업상태구분코드',
       '영업상태명', '인허가일자', '인허가취소일자', '재개업일자', '좌표정보(X)', '좌표정보(Y)', '지정일자',
       '최종수정시점', '폐업일자', '휴업시작일자', '휴업종료일자'],
      dtype='object')


80666

In [6]:
pd.set_option('display.max_columns', None)
cols = ['lat', 'lng', '관리번호', '사업장명', '상세영업상태명', '도로명전체주소']

# Missing road addresses become '' so the string tests below never see NaN.
cvs['도로명전체주소'] = cvs['도로명전체주소'].fillna('')

# Leading whitespace is ignored for matching only; the stored column is untouched.
_road_addr = cvs['도로명전체주소'].str.lstrip()


def _region_mask(*prefixes):
    """Return a boolean mask of rows whose road address starts with any prefix.

    The match is anchored at the start of the address on purpose: a plain
    substring test also hits street and city names further along the string
    (e.g. '세종' inside Seoul's '세종대로', '광주' inside '경기도 광주시').
    """
    mask = pd.Series(False, index=cvs.index)
    for prefix in prefixes:
        mask = mask | _road_addr.str.startswith(prefix)
    return mask


idx_blank = cvs['도로명전체주소']==''
idx_seoul = _region_mask('서울')
idx_busan = _region_mask('부산')
idx_daegu = _region_mask('대구')
idx_incheon = _region_mask('인천')
idx_ulsan = _region_mask('울산')
idx_gwangju = _region_mask('광주')
idx_daejeon = _region_mask('대전')
idx_jeju = _region_mask('제주')
idx_sejong = _region_mask('세종')
idx_kyungki = _region_mask('경기')
idx_gangwon = _region_mask('강원')
# Provinces that appear under both their full and abbreviated names.
idx_cb = _region_mask('충청북도', '충북')
idx_cn = _region_mask('충청남도', '충남')
idx_jb = _region_mask('전라북도', '전북')
idx_jn = _region_mask('전라남도', '전남')
idx_kb = _region_mask('경상북도', '경북')
idx_kn = _region_mask('경상남도', '경남')

idx_regions = [idx_seoul, idx_busan, idx_daegu, idx_incheon, idx_ulsan, idx_gwangju, idx_daejeon, idx_jeju, idx_sejong, idx_kyungki,
               idx_gangwon, idx_cb, idx_cn, idx_jb, idx_jn, idx_kb, idx_kn,]

# Rows accounted for by some region mask or by an empty address.
idx_all = idx_blank.copy()
for idx in idx_regions:
    idx_all = idx_all | idx

# Each per-region mask also admits rows with no address at all.
idx_regions = [idx_blank |idx  for idx in idx_regions]






In [7]:
# Coverage check: total rows, rows claimed by any region/blank mask, Seoul rows.
for row_mask in (slice(None), idx_all, idx_seoul):
    print(cvs.loc[row_mask, cols].shape[0])

# Rows that no mask claimed.
cvs.loc[~idx_all, cols].head(3)

81607
81607
16539


Unnamed: 0,lat,lng,관리번호,사업장명,상세영업상태명,도로명전체주소


In [8]:
# Number of rows selected by the Jeju mask.
print(cvs.loc[idx_jeju].shape[0])

1921


# kmeans

In [9]:
# coordinates = np.array(cvs.loc[~idx_lat_empty & idx_busan, ['lat', 'lng']].astype(float))
# print(len(coordinates))
# # print(whiten(coordinates))

# x, y = kmeans2(whiten(coordinates), 3, iter = 20)
# plt.scatter(coordinates[:,0], coordinates[:,1], c=y);
# plt.show()

In [10]:
# y

# geohash

In [11]:
import pygeohash as pgh
from collections import defaultdict

# One row per store that has coordinates: (management number, lat, lng, store name).
geos = np.array(cvs.loc[~idx_lat_empty, ['관리번호', 'lat', 'lng', '사업장명']])

# (geohash, management number) for every store, in `geos` order.
geo_hash_list = []
for store_id, lat, lng, _store_name in geos:
    geo_hash_list.append((pgh.encode(float(lat), float(lng)), store_id))

# geohash -> management numbers of all stores encoded to that hash.
geo_hash_table = defaultdict(list)
for geohash, store_id in geo_hash_list:
    geo_hash_table[geohash].append(store_id)

# Peek at the first three buckets.
for geohash, store_ids in list(geo_hash_table.items())[:3]:
    print(geohash, store_ids)

wydmc8km8nxq ['2006300010105600037', '2006300007605600015']
wydmf1bkwpyb ['1999300007605600150']
wydmc255070b ['2007300010105600043']


In [12]:
# management number -> (lat, lng, store name); a repeated management number
# keeps the last row seen.
id_geo = {
    row[0]: (float(row[1]), float(row[2]), row[3])
    for row in geos
}

In [13]:
# Show the first three entries of the id -> location lookup.
for store_id, store_geo in list(id_geo.items())[:3]:
    print(store_id, store_geo)

2006300010105600037 (37.57559556952654, 126.98582687874092, '세븐일레븐 종로허브점')
1999300007605600150 (37.58366929825265, 127.0025632117959, 'GS25동숭점')
2007300010105600043 (37.573773352199254, 126.97312606247175, '이마트24 광화문시대점')


In [14]:
# Tabular view of the (geohash, management number) pairs.
tmp = pd.DataFrame(geo_hash_list, columns=['geohash', '관리번호'])
tmp.head(2)

Unnamed: 0,geohash,관리번호
0,wydmc8km8nxq,2006300010105600037
1,wydmf1bkwpyb,1999300007605600150


In [15]:
# Management numbers per geohash; a count above 1 means several records
# encode to the same hash.
tmp.groupby('geohash').count()

Unnamed: 0_level_0,관리번호
geohash,Unnamed: 1_level_1
wvctpu8rcr9d,1
wvctrq31gdnr,2
wvctrq929gn2,3
wvctrqdye8t1,1
wvctwfrq30zp,1
wvctxr5v3q3n,3
wvctxrw1ysrn,1
wvctz9e0ndp8,1
wvcu3kbxg3rn,1
wvcu3m2cqju7,1


In [16]:
# Geohash of every store, duplicates kept, in geo_hash_list order.
hashes = [geohash for geohash, _store_id in geo_hash_list]
# Lazy iterator over every position pair (i < j) of `hashes`. It can be
# consumed only once; re-create it before iterating a second time.
comb_hash = combinations(hashes, 2)
comb_hash

# for idx_region in idx_regions:
#     ids = cvs.loc[~idx_lat_empty & idx_region]['관리번호'].tolist()
#     print(len(ids))
#     comb_list.append(combinations(ids, 2))

# print(comb_list)

<itertools.combinations at 0x7f146647a598>

In [17]:
# Keep every pair of hashes whose approximate distance is under 200.
#
# pgh.geohash_approximate_distance depends only on how many leading
# characters two hashes share, so a pair can pass the filter only when both
# hashes share a long enough prefix. Bucketing by that prefix avoids walking
# all len(hashes)**2 / 2 pairs, and does not rely on a one-shot iterator
# created in another cell.
# NOTE(review): prefix length 7 comes from pygeohash's precision table
# (7 shared characters -> 118, 6 -> 610); the asserts below re-check it
# against the installed version.
MAX_APPROX_DISTANCE = 200
SHARED_PREFIX_LEN = 7

_probe = 'wydmc8km8nxq'
assert pgh.geohash_approximate_distance(
    _probe, _probe[:SHARED_PREFIX_LEN] + 'z' * (len(_probe) - SHARED_PREFIX_LEN)
) < MAX_APPROX_DISTANCE
assert pgh.geohash_approximate_distance(
    _probe, _probe[:SHARED_PREFIX_LEN - 1] + 'z' * (len(_probe) - SHARED_PREFIX_LEN + 1)
) >= MAX_APPROX_DISTANCE

# prefix -> [(position in `hashes`, hash), ...] in original order.
_buckets = {}
for _pos, _hash in enumerate(hashes):
    _buckets.setdefault(_hash[:SHARED_PREFIX_LEN], []).append((_pos, _hash))

_indexed_pairs = []
for _members in _buckets.values():
    for (_i, _src), (_j, _dst) in combinations(_members, 2):
        if pgh.geohash_approximate_distance(_src, _dst) < MAX_APPROX_DISTANCE:
            _indexed_pairs.append((_i, _j, _src, _dst))

# Restore the order combinations(hashes, 2) would have produced, so that
# downstream row order and (src, dst) direction are unchanged.
_indexed_pairs.sort()
result_hash = [(_src, _dst) for _i, _j, _src, _dst in _indexed_pairs]


In [18]:
# Number of hash pairs that passed the distance filter; identical-hash pairs
# and repeats are still included at this point.
len(result_hash)
# previous run: 60588

99179

In [19]:
# Candidate pairs as a frame; the three prints show the size after each step.
pairs_all = pd.DataFrame(result_hash, columns=['src', 'dst'])
print(len(pairs_all))

# Drop pairs whose two hashes are identical.
pairs_distinct_hashes = pairs_all[pairs_all['src'] != pairs_all['dst']]
print(len(pairs_distinct_hashes))

# Drop repeated (src, dst) rows, keeping the first occurrence and its index label.
src_dst = pairs_distinct_hashes.drop_duplicates()
print(len(src_dst))
src_dst.head(1)

# prev: 60588, 24296, 12251

99179
43378
18651


Unnamed: 0,src,dst
1,wydmf1bkwpyb,wydmf1b673r1


In [20]:
# Expand each hash pair into every (src id, dst id) combination of the
# stores stored under those two hashes.
result_ids = [
    (src_id, dst_id)
    for src_hash, dst_hash in zip(src_dst['src'], src_dst['dst'])
    for src_id in geo_hash_table[src_hash]
    for dst_id in geo_hash_table[dst_hash]
]

In [21]:
# Size of the id-pair list and its first three entries.
print(len(result_ids))
print(result_ids[:3])
# previous run: 37493

66827
[('1999300007605600150', '2009300012905600010'), ('1999300007605600150', '2016300016905600088'), ('1999300007605600150', '2007300010105600048')]


In [22]:
# Id pairs as a two-column frame.
df_result_ids = pd.DataFrame(result_ids, columns=['src', 'dst'])
df_result_ids.head(1)

Unnamed: 0,src,dst
0,1999300007605600150,2009300012905600010


In [23]:
# with MyMongo() as db:
#     db.delete_and_insert_df('cvs', 'src_dst', df_result_ids)

In [24]:
# For every id pair, look up both stores and attach the straight-line
# distance returned by get_distance_from_coords.
src_dst_with_line_distance = []
for src_id, dst_id in result_ids:
    lat1, lon1, src_name = id_geo[src_id]
    lat2, lon2, dst_name = id_geo[dst_id]
    line_distance = get_distance_from_coords(lat1, lon1, lat2, lon2)
    src_dst_with_line_distance.append(
        (src_name, src_id, lat1, lon1, dst_name, dst_id, lat2, lon2, line_distance)
    )
print(len(src_dst_with_line_distance))

66827


In [25]:
# Inspect the last ten (src, dst, distance) records.
src_dst_with_line_distance[-10:]

[('훼미리마트탑동파크점',
  '2004549012305600104',
  33.51627081203372,
  126.52439883355675,
  'GS25 제주탑동점',
  '2012651005205600003',
  33.51636685509181,
  126.52499245493942,
  56.059140453556516),
 ('훼미리마트탑동파크점',
  '2004549012305600104',
  33.51627081203372,
  126.52439883355675,
  'LG25탑동점',
  '2002549007505600126',
  33.51636685509181,
  126.52499245493942,
  56.059140453556516),
 ('훼미리마트탑동파크점',
  '2004549012305600104',
  33.51627081203372,
  126.52439883355675,
  'LG25탑동점',
  '2003549012305600252',
  33.51636685509181,
  126.52499245493942,
  56.059140453556516),
 ('훼미리마트 제주로아점',
  '2008651005805600031',
  33.489854569463915,
  126.49267145966195,
  '씨유 신제주점',
  '2004549012305600219',
  33.48959685447329,
  126.49314641117076,
  52.546406195787746),
 ('훼미리마트 제주로아점',
  '2008651005805600031',
  33.489854569463915,
  126.49267145966195,
  '훼미리마트신제주점',
  '2001549007505600172',
  33.48959685447329,
  126.49314641117076,
  52.546406195787746),
 ('훼미리마트 제주로아점',
  '2008651005805600031',
  33.4898

In [26]:

# Same field order as the tuples in src_dst_with_line_distance.
distance_columns = [
    'src_name', 'src', 'src_lat', 'src_lng',
    'dst_name', 'dst', 'dst_lat', 'dst_lng',
    'line_distance',
]
new_distance = pd.DataFrame(src_dst_with_line_distance, columns=distance_columns)
print(len(new_distance))
new_distance.head(1)

66827


Unnamed: 0,src_name,src,src_lat,src_lng,dst_name,dst,dst_lat,dst_lng,line_distance
0,GS25동숭점,1999300007605600150,37.583669,127.002563,세븐일레븐 종로대학로점,2009300012905600010,37.583248,127.002445,47.934733


In [27]:
# Load the pairs already stored in 'src_dst' so they can be compared with new_distance.
with MyMongo() as db:
    prev_distance = db.get_df_from_table('cvs', 'src_dst')

<--Mongo Connected.
Mongo Connection Closed.-->


In [28]:
# Row count and first record of the stored pairs.
print(len(prev_distance))
prev_distance.head(1)

42125


Unnamed: 0,_id,dst,dst_lat,dst_lng,dst_name,line_distance,naver_distance,src,src_lat,src_lng,src_name
0,5c0d43659085ca6d107fa1f6,2009300012905600010,37.583248,127.002445,세븐일레븐 종로대학로점,47.934733,44.0,1999300007605600150,37.583669,127.002563,GS25동숭점


In [29]:
# 42125 is len(prev_distance) as printed above; 37493 is the result_ids count
# noted for the previous run.
42125 - 37493

4632

In [30]:
# Left-join the stored line_distance onto the new pairs. A left merge keeps
# one row per new_distance row, in the same order, so `exc` can be used as a
# row-aligned mask on new_distance. An outer merge also appends pairs that
# exist only in prev_distance and does not guarantee that order.
# Stored keys are de-duplicated first so the join cannot multiply rows.
prev_line_distance = (
    prev_distance[['src', 'dst', 'line_distance']]
    .drop_duplicates(subset=['src', 'dst'])
)
exc = new_distance[['src', 'dst']].merge(prev_line_distance, on=['src', 'dst'], how='left')

# Pairs with no stored line_distance, i.e. not yet in the collection.
print(len(exc.loc[exc['line_distance'].isna()]))

24702


In [31]:
# Rows of new_distance whose (src, dst) has no stored line_distance.
# The membership flag is computed from new_distance itself with a left merge
# (one output row per input row, same order) and applied by position, so the
# selection does not depend on the row order of any other merged frame.
_stored_keys = (
    prev_distance.loc[prev_distance['line_distance'].notna(), ['src', 'dst']]
    .drop_duplicates()
)
_already_stored = (
    new_distance[['src', 'dst']]
    .merge(_stored_keys, on=['src', 'dst'], how='left', indicator=True)['_merge']
    .eq('both')
    .to_numpy()
)
assert len(_already_stored) == len(new_distance)

exc_distance = new_distance.loc[~_already_stored]
exc_distance.head(5)

Unnamed: 0,src_name,src,src_lat,src_lng,dst_name,dst,dst_lat,dst_lng,line_distance
2,GS25동숭점,1999300007605600150,37.583669,127.002563,훼미리마트 혜화역점,2007300010105600048,37.583096,127.002101,75.60947
16,GS25 종로안국점,2008300012905600029,37.576936,126.985926,훼미리마트 안국스카이,2009300012905600059,37.576306,126.985578,76.566394
17,GS25 종로안국점,2003300007605600011,37.576936,126.985926,훼미리마트 안국스카이,2009300012905600059,37.576306,126.985578,76.566394
19,훼미리마트 종로성대점,2006300010105600052,37.584437,126.99702,(주)코리아세븐 성대점,1999300007605600094,37.584621,126.997147,23.292839
21,씨유 종로성대점,2010300012905600034,37.584437,126.99702,바이더웨이 종로성대점,2007300012905600012,37.584289,126.997672,59.766675


# DB: upsert the pairs that exist only in the new distance table

In [32]:
# One dict per row of exc_distance (column name -> value).
docs = exc_distance.to_dict(orient='records')
with MyMongo() as db:
    # Writes to the 'src_dst' collection each time this cell is run.
    # NOTE(review): 'src' and 'dst' are presumably the match keys of the upsert — confirm in MyMongo.update_one_bulk.
    db.update_one_bulk('cvs', 'src_dst', docs, 'src', 'dst')

<--Mongo Connected.
- bulk_write result:
match, insert, modify, upsert
0 0 0 24702
Mongo Connection Closed.-->


## Validation

In [33]:
# Spot check: display the source records for two management numbers.
# Other ids of interest: 2013300012905600009 2010300012905600088
tmp_list = ['2007300010105600014', '2015300016905600063',]
cvs.loc[(cvs['관리번호'].isin(tmp_list)), cols]

Unnamed: 0,lat,lng,관리번호,사업장명,상세영업상태명,도로명전체주소
29,37.57253163403659,126.9817930397272,2007300010105600014,세븐일레븐 종로두산점,정상영업,"서울특별시 종로구 삼봉로 81 (수송동,두산위브파빌리온 106호)"
392,37.57191549161327,126.98271235969273,2015300016905600063,씨유종로공평점,폐업처리,"서울특별시 종로구 삼봉로 100, 1층 (공평동)"


# bruteforce

In [34]:
# comb_list = []
# for idx_region in idx_regions:
#     ids = cvs.loc[~idx_lat_empty & idx_region]['관리번호'].tolist()
#     print(len(ids))
#     comb_list.append(combinations(ids, 2))

# print(comb_list)

In [35]:
# result = []
# # i = 0
# cvs_region = cvs.loc[idx_regions[-1]]
# for c in tqdm_notebook(comb_list[-1]):
# #     if i == 3:
# #         break
# #     i += 1
   
#     id1 = c[0]
#     id2 = c[1]

#     cvs1 = cvs_region.loc[cvs_region['관리번호']==id1]
#     cvs2 = cvs_region.loc[cvs_region['관리번호']==id2]
    
#     lat1 = cvs1['lat']
#     lon1 = cvs1['lng']
#     lat2 = cvs2['lat']
#     lon2 = cvs2['lng']
    
#     distance = get_distance_from_coords(lat1, lon1, lat2, lon2)
#     if distance <= 1000:
#         result.append((id1, id2, distance))
    
# #     if len(result) > 5:
# #         break

# #     print(cvs0.loc[::, cols])
# #     print(cvs1.loc[::, cols])
# print(len(result))