In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
import time
import statsmodels.api as sm
from sklearn import linear_model
from tqdm import tnrange, tqdm_notebook
from sklearn.metrics import mean_squared_error
from statsmodels.regression.quantile_regression import QuantReg

from sklearn.preprocessing import StandardScaler

import os

from sklearn.model_selection import train_test_split

In [2]:
# Data directory, kept with its trailing separator so file names can be appended.
path = 'data' + os.path.sep
csv_file = path + 'market_info_soul.csv'
# The file is read as pipe-delimited, UTF-8 encoded text.
df = pd.read_csv(csv_file, sep='|', encoding='utf-8')

In [3]:
# Narrow the frame to the columns this notebook works with.
keep_columns = [
    '상호명', '상가업소번호', '지점명',
    '상권업종대분류명', '상권업종중분류명', '상권업종소분류명',
    '표준산업분류명', '시도명', '법정동명',
    '경도', '위도',
]
df = df[keep_columns]

In [4]:
# Distinct '상권업종중분류명' values among rows whose '상권업종대분류명' is '학문/교육'.
is_education = df['상권업종대분류명'] == '학문/교육'
df.loc[is_education, '상권업종중분류명'].unique()

array(['학원-어학', '학원-음악미술무용', '학원-보습교습입시', '학원-예능취미체육', '유아교육', '학원기타',
       '학문교육기타', '학원-자격/국가고시', '학원-컴퓨터', '도서관/독서실', '학원-창업취업취미'],
      dtype=object)

In [5]:
# Per-column counts for each ('상권업종중분류명', '상권업종소분류명') pair within '학문/교육'.
education = df[df['상권업종대분류명'] == '학문/교육']
education.groupby(['상권업종중분류명', '상권업종소분류명']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,상호명,상가업소번호,지점명,상권업종대분류명,표준산업분류명,시도명,법정동명,경도,위도
상권업종중분류명,상권업종소분류명,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
도서관/독서실,고시원,853,853,47,853,853,853,853,853,853
도서관/독서실,독서실,914,914,56,914,914,914,914,914,914
도서관/독서실,이동도서관운영,18,18,1,18,18,18,18,18,18
유아교육,놀이방,116,116,1,116,116,116,116,116,116
유아교육,동화구연,2,2,0,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...
학원기타,학원-말더듬,7,7,0,7,7,7,7,7,7
학원기타,학원-실내운전,11,11,1,11,11,11,11,11,11
학원기타,학원-심리변론,39,39,4,39,39,39,39,39,39
학원기타,학원-역학,9,9,2,9,9,9,9,9,9


In [6]:
# Show the column labels of `df`.
df.columns

Index(['상호명', '상가업소번호', '지점명', '상권업종대분류명', '상권업종중분류명', '상권업종소분류명', '표준산업분류명',
       '시도명', '법정동명', '경도', '위도'],
      dtype='object')

#### Filter on '상권업종중분류명' == '학원-컴퓨터' (computer academies)

In [7]:
# Select the rows whose '상권업종중분류명' is '학원-컴퓨터'.
is_computer_academy = df['상권업종중분류명'] == '학원-컴퓨터'
itaca = df[is_computer_academy]

In [8]:
# Drop the name bound to the loaded table; the filtered rows stay available as `itaca`.
del df

In [9]:
# Keep only the name, branch, and coordinate columns.
# The explicit copy makes `itaca` an independent frame, so columns assigned to
# it later (e.g. `itaca['cluster'] = ...`) do not raise pandas'
# SettingWithCopyWarning about writing to a slice of another frame.
itaca = itaca[['상호명', '지점명', '경도', '위도']].copy()

In [11]:
!pip install folium

Collecting folium
  Downloading folium-0.12.1-py2.py3-none-any.whl (94 kB)
Collecting branca>=0.3.0
  Using cached branca-0.4.2-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.12.1


In [12]:
import folium

# Marker palette keyed by integer label.
# 'firebrick' replaces the original 'brick', which is not a valid CSS colour name.
colors = {-1:'gray', 0:'coral', 1:'blue', 2:'green', 3:'red', 4:'purple',
          5:'orange', 6:'brown', 7:'firebrick', 8:'yellow', 9:'magenta', 10:'cyan', 11:'pink'}

# Later cells refer to the filtered frame as `df`; this is an alias of `itaca`, not a copy.
df = itaca

# NOTE(review): confirm the 'Stamen Terrain' tile set is still available for the
# installed folium version.
cluster2_map = folium.Map(location=[37.55, 126.98], tiles='Stamen Terrain',
                          zoom_start=12)

# Every point uses the same colour in this overview map.
marker_color = colors[1]
for name, lat, lng in zip(df.상호명, df.위도, df.경도):
    folium.CircleMarker(
        [lat, lng],
        radius=5,                  # circle radius
        color=marker_color,        # outline colour
        fill=True,
        fill_color=marker_color,   # fill colour
        fill_opacity=0.7,          # fill opacity
        popup=name,
    ).add_to(cluster2_map)
cluster2_map
# To save the map as an HTML file:
# cluster2_map.save('./seoul_mschool_cluster2.html')


In [10]:
import folium
from sklearn import preprocessing
from sklearn import cluster

# Marker palette keyed by cluster label.
# 'firebrick' replaces the original 'brick', which is not a valid CSS colour name.
colors = {-1:'gray', 0:'coral', 1:'blue', 2:'green', 3:'red', 4:'purple',
          5:'orange', 6:'brown', 7:'firebrick', 8:'yellow', 9:'magenta', 10:'cyan', 11:'pink'}

# Standardise the two coordinate columns before clustering.
columns_list = ['경도', '위도']
X3 = itaca.loc[:, columns_list]
x = preprocessing.StandardScaler().fit_transform(X3)

# Alternative: dbm = cluster.DBSCAN()  # eps=0.2, min_samples=5
# A fixed random_state makes the centroid initialisation, and therefore the
# cluster labels and marker colours, identical on every run.
dbm = cluster.KMeans(init='k-means++', n_clusters=5, n_init=10, random_state=42)
dbm.fit(x)

# Store each row's cluster label next to its coordinates.
itaca['cluster'] = dbm.labels_

# NOTE(review): confirm the 'Stamen Terrain' tile set is still available for the
# installed folium version.
cluster3_map = folium.Map(location=[37.55, 126.98], tiles='Stamen Terrain',
                          zoom_start=12)

for name, lat, lng, clus in zip(itaca.상호명, itaca.위도, itaca.경도, itaca.cluster):
    marker_color = colors[clus]    # one colour per cluster label
    folium.CircleMarker(
        [lat, lng],
        radius=5,                  # circle radius
        color=marker_color,        # outline colour
        fill=True,
        fill_color=marker_color,   # fill colour
        fill_opacity=0.7,          # fill opacity
        popup=name,
    ).add_to(cluster3_map)
cluster3_map

#### Detect and remove location outliers

In [11]:
from sklearn.neighbors import LocalOutlierFactor
# Alternative detectors:
# from sklearn.ensemble import IsolationForest
# from sklearn.svm import OneClassSVM

outlier = LocalOutlierFactor()
# outlier = IsolationForest()
# outlier = OneClassSVM()

# Fit the detector on the coordinate columns and keep one label per row.
coords = df[['경도', '위도']]
y_predict = outlier.fit_predict(coords)
# Example of the label array: [ 1  1  1  1  1  1  1 -1  1  1  1]
df['outlier'] = y_predict

In [14]:
# Set the rows labelled -1 aside in `dfout`, then remove them from `df` in place.
is_outlier = df['outlier'] == -1
dfout = df[is_outlier]
df.drop(index=dfout.index, inplace=True)

In [15]:
# Marker palette keyed by integer label.
# 'firebrick' replaces the original 'brick', which is not a valid CSS colour name.
colors = {-1:'gray', 0:'coral', 1:'blue', 2:'green', 3:'red', 4:'purple',
          5:'orange', 6:'brown', 7:'firebrick', 8:'yellow', 9:'magenta', 10:'cyan', 11:'pink'}

# NOTE(review): confirm the 'Stamen Terrain' tile set is still available for the
# installed folium version.
cmap = folium.Map(location=[37.55, 126.98], tiles='Stamen Terrain',
                  zoom_start=12)

# (frame, palette key) pairs in draw order: the removed outliers first with
# key 1, then the remaining points with key -1.
for points, clus in ((dfout, 1), (df, -1)):
    marker_color = colors[clus]
    for name, lat, lng in zip(points.상호명, points.위도, points.경도):
        folium.CircleMarker(
            [lat, lng],
            radius=5,                  # circle radius
            color=marker_color,        # outline colour
            fill=True,
            fill_color=marker_color,   # fill colour
            fill_opacity=0.3,          # fill opacity
            popup=name,
        ).add_to(cmap)
cmap

In [16]:
# Standardise the two coordinate columns of the outlier-free frame before clustering.
columns_list = ['경도', '위도']
X3 = df.loc[:, columns_list]
x = preprocessing.StandardScaler().fit_transform(X3)

# Alternative: dbm = cluster.DBSCAN()  # eps=0.2, min_samples=5
# init='k-means++' initialises the cluster centres with a technique other than plain random choice.
# n_init repeats the centre initialisation and the best result is returned.
# A fixed random_state makes the cluster labels, and therefore the marker
# colours, identical on every run.
dbm = cluster.KMeans(init='k-means++', n_clusters=5, n_init=10, random_state=42)
dbm.fit(x)
df['cluster'] = dbm.labels_

# NOTE(review): confirm the 'Stamen Terrain' tile set is still available for the
# installed folium version.
cluster3_map = folium.Map(location=[37.55, 126.98], tiles='Stamen Terrain',
                          zoom_start=12)

for name, lat, lng, clus in zip(df.상호명, df.위도, df.경도, df.cluster):
    marker_color = colors[clus]    # one colour per cluster label
    folium.CircleMarker(
        [lat, lng],
        radius=5,                  # circle radius
        color=marker_color,        # outline colour
        fill=True,
        fill_color=marker_color,   # fill colour
        fill_opacity=0.7,          # fill opacity
        popup=name,
    ).add_to(cluster3_map)
cluster3_map