In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')
import seaborn

# feature extraction

## `collected_data`

In [30]:
collected_data = pd.read_csv("../data/data_d_final/2d201_1230_collected_data.csv")
collected_data.set_index("지역", inplace=True)
collected_data

Unnamed: 0_level_0,쏘카존 수,녹지,주거,공업,상업,인구,대학교 수,아파트 수,지하철역 수,정류장 수,환승역 수
지역,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
가평군,7,1.0,0.0,0.0,0.0,62197,1,39,4,928,0
고양시 덕양구,48,0.65625,0.34375,0.0,0.0,487874,6,274,9,1205,2
고양시 일산동구,51,0.461538,0.461538,0.0,0.076923,296590,1,133,6,454,0
고양시 일산서구,29,0.5,0.375,0.0,0.125,290738,0,140,4,635,0
과천시,3,0.5,0.5,0.0,0.0,77775,1,15,5,111,0
광명시,28,0.222222,0.666667,0.0,0.111111,288182,0,89,3,445,0
광주시,16,0.923077,0.076923,0.0,0.0,391704,11,129,4,1238,0
구리시,19,0.5,0.5,0.0,0.0,188876,0,111,2,313,0
군포시,10,0.333333,0.444444,0.222222,0.0,266531,7,155,7,438,2
김포시,55,0.4,0.533333,0.066667,0.0,484194,3,229,9,1325,0


## `hackathon_data`

In [31]:
hackathon_data_original = pd.read_csv("../data/20211022_수요예측_hackathon_data.csv", encoding='cp949')
hackathon_data_original

Unnamed: 0,region1,region2,reservation_return_at,reservation_start_at,age_group,gender,car_model
0,울산광역시,남구,2019-09-29 21:25:40+00:00,2019-09-29 19:20:00+00:00,1,male,경형
1,울산광역시,남구,2019-07-13 13:11:21+00:00,2019-07-13 11:00:00+00:00,1,male,준중형
2,울산광역시,남구,2019-09-16 23:43:08+00:00,2019-09-16 19:10:00+00:00,1,male,준중형
3,울산광역시,남구,2019-08-09 06:17:41+00:00,2019-08-09 00:20:00+00:00,1,male,준중형
4,울산광역시,남구,2019-07-24 12:36:29+00:00,2019-07-24 09:00:00+00:00,2,male,경형
...,...,...,...,...,...,...,...
457062,경기도,고양시　일산서구,2019-09-14 12:25:29+00:00,2019-09-14 12:00:00+00:00,1,male,경형
457063,경기도,고양시　일산서구,2019-07-25 15:30:00+00:00,2019-07-24 18:30:00+00:00,4,female,준중형
457064,경기도,고양시　일산서구,2019-07-14 22:38:09+00:00,2019-07-14 21:30:00+00:00,1,male,경형
457065,경기도,고양시　일산서구,2019-06-20 18:25:37+00:00,2019-06-20 18:10:00+00:00,1,unknown,소형SUV


In [32]:
# regions from hackathon data
regions_h= ['남구', '동구', '북구', '중구', '강서구', '광명시', '구리시', '김포시', '세종시', '양평군',
           '울주군', '하남시', '화성시', '남양주시', '의정부시', '고양시 덕양구',
           '성남시 분당구', '성남시 수정구', '성남시 중원구', '안양시 동안구',
           '안양시 만안구', '전주시 덕진구', '전주시 완산구', '고양시 일산동구', '고양시 일산서구']
D = dict(zip(hackathon_data_original['region2'].unique(),regions_h))
hackathon_data_original['region2'] = hackathon_data_original['region2'].map(D)
hackathon_data = pd.DataFrame(hackathon_data_original['region2'].value_counts())
hackathon_data.columns = ['이용수']
hackathon_data

Unnamed: 0,이용수
성남시 분당구,46782
화성시,35249
고양시 일산동구,31906
남양주시,26546
고양시 덕양구,26253
의정부시,25225
안양시 동안구,22918
광명시,22645
남구,21836
김포시,21500


## intersected

In [33]:
intersected_rows = set.intersection(set(collected_data.index), set(hackathon_data.index))
intersected_rows = list(intersected_rows)
intersected_rows

['하남시',
 '화성시',
 '안양시 만안구',
 '고양시 일산서구',
 '남양주시',
 '의정부시',
 '김포시',
 '고양시 덕양구',
 '광명시',
 '구리시',
 '안양시 동안구',
 '양평군',
 '성남시 분당구',
 '성남시 수정구',
 '성남시 중원구',
 '고양시 일산동구']

In [34]:
collected_data_intersected = collected_data[collected_data.index.isin(intersected_rows)]
collected_data_intersected.sort_index(ascending=True, inplace=True)
hackathon_data_intersected = hackathon_data[hackathon_data.index.isin(intersected_rows)]
hackathon_data_intersected.sort_index(ascending=True, inplace=True)
intersected = pd.concat([collected_data_intersected, hackathon_data_intersected],axis=1)
intersected['쏘카존수 대비 이용수'] = intersected['이용수']/intersected['쏘카존 수']
intersected

Unnamed: 0,쏘카존 수,녹지,주거,공업,상업,인구,대학교 수,아파트 수,지하철역 수,정류장 수,환승역 수,이용수,쏘카존수 대비 이용수
고양시 덕양구,48,0.65625,0.34375,0.0,0.0,487874,6,274,9,1205,2,26253,546.9375
고양시 일산동구,51,0.461538,0.461538,0.0,0.076923,296590,1,133,6,454,0,31906,625.607843
고양시 일산서구,29,0.5,0.375,0.0,0.125,290738,0,140,4,635,0,16052,553.517241
광명시,28,0.222222,0.666667,0.0,0.111111,288182,0,89,3,445,0,22645,808.75
구리시,19,0.5,0.5,0.0,0.0,188876,0,111,2,313,0,13011,684.789474
김포시,55,0.4,0.533333,0.066667,0.0,484194,3,229,9,1325,0,21500,390.909091
남양주시,57,0.526316,0.473684,0.0,0.0,737366,2,385,13,1803,0,26546,465.719298
성남시 분당구,58,0.5,0.5,0.0,0.0,482026,1,231,12,694,3,46782,806.586207
성남시 수정구,18,0.631579,0.368421,0.0,0.0,232125,11,91,9,436,2,18773,1042.944444
성남시 중원구,13,0.555556,0.222222,0.111111,0.111111,206211,1,79,1,251,0,16238,1249.076923


## pearson correlation

In [35]:
intersected.corr()

Unnamed: 0,쏘카존 수,녹지,주거,공업,상업,인구,대학교 수,아파트 수,지하철역 수,정류장 수,환승역 수,이용수,쏘카존수 대비 이용수
쏘카존 수,1.0,-0.275398,0.305132,0.033706,-0.106532,0.812603,0.112405,0.791505,0.380477,0.598342,0.410081,0.80623,-0.537069
녹지,-0.275398,1.0,-0.931103,-0.022967,-0.313009,-0.154536,0.393989,-0.142347,0.155343,0.197761,0.185389,-0.369392,-0.148646
주거,0.305132,-0.931103,1.0,-0.237979,-0.008059,0.175273,-0.366502,0.196848,-0.06033,-0.176481,-0.090934,0.381949,0.060353
공업,0.033706,-0.022967,-0.237979,1.0,0.255624,0.15901,0.164733,0.048032,-0.289476,0.22093,-0.043676,-0.004037,0.237283
상업,-0.106532,-0.313009,-0.008059,0.255624,1.0,-0.169417,-0.242572,-0.215756,-0.155862,-0.242405,-0.323283,-0.011755,0.166735
인구,0.812603,-0.154536,0.175273,0.15901,-0.169417,1.0,0.466642,0.950637,0.30006,0.875202,0.400453,0.649762,-0.390717
대학교 수,0.112405,0.393989,-0.366502,0.164733,-0.242572,0.466642,1.0,0.382193,-0.015686,0.627799,0.401665,0.153327,0.012843
아파트 수,0.791505,-0.142347,0.196848,0.048032,-0.215756,0.950637,0.382193,1.0,0.46703,0.771214,0.332206,0.623385,-0.388349
지하철역 수,0.380477,0.155343,-0.06033,-0.289476,-0.155862,0.30006,-0.015686,0.46703,1.0,0.17159,0.185562,0.2862,-0.312177
정류장 수,0.598342,0.197761,-0.176481,0.22093,-0.242405,0.875202,0.627799,0.771214,0.17159,1.0,0.328058,0.33551,-0.491441


In [43]:
X = np.array(collected_data.drop(columns=['대학교 수']))

# Clustering

## K=2

In [56]:
clusterer = KMeans(n_clusters=2, n_init="auto")
cluster_labels = clusterer.fit_predict(X)
cluster_labels

array([1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0])

In [57]:
regions = np.array(collected_data.index)
print("군집0 : ", regions[np.where(cluster_labels==0)])
print()
print("군집1 : ", regions[np.where(cluster_labels==1)])

군집0 :  ['고양시 덕양구' '김포시' '남양주시' '부천시' '성남시 분당구' '시흥시' '용인시 기흥구' '의정부시' '파주시' '평택시'
 '화성시']

군집1 :  ['가평군' '고양시 일산동구' '고양시 일산서구' '과천시' '광명시' '광주시' '구리시' '군포시' '동두천시'
 '성남시 수정구' '성남시 중원구' '수원시 권선구' '수원시 영통구' '수원시 장안구' '수원시 팔달구' '안산시 단원구'
 '안산시 상록구' '안성시' '안양시 동안구' '안양시 만안구' '양주시' '양평군' '여주시' '연천군' '오산시'
 '용인시 수지구' '용인시 처인구' '의왕시' '이천시' '포천시' '하남시']


## K=3

In [58]:
clusterer = KMeans(n_clusters=3, n_init="auto")
cluster_labels = clusterer.fit_predict(X)
cluster_labels

array([1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 2, 1, 2, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 2])

In [59]:
regions = np.array(collected_data.index)
print("군집0 : ", regions[np.where(cluster_labels==0)])
print()
print("군집1 : ", regions[np.where(cluster_labels==1)])
print()
print("군집2 : ", regions[np.where(cluster_labels==2)])

군집0 :  ['고양시 덕양구' '광주시' '김포시' '성남시 분당구' '수원시 권선구' '수원시 영통구' '시흥시' '안산시 상록구'
 '용인시 기흥구' '용인시 수지구' '의정부시' '파주시' '평택시']

군집1 :  ['가평군' '고양시 일산동구' '고양시 일산서구' '과천시' '광명시' '구리시' '군포시' '동두천시' '성남시 수정구'
 '성남시 중원구' '수원시 장안구' '수원시 팔달구' '안산시 단원구' '안성시' '안양시 동안구' '안양시 만안구' '양주시'
 '양평군' '여주시' '연천군' '오산시' '용인시 처인구' '의왕시' '이천시' '포천시' '하남시']

군집2 :  ['남양주시' '부천시' '화성시']


## K=4

In [60]:
clusterer = KMeans(n_clusters=4, n_init="auto")
cluster_labels = clusterer.fit_predict(X)
cluster_labels

array([1, 0, 3, 3, 1, 3, 0, 1, 3, 0, 2, 1, 2, 0, 3, 3, 3, 3, 3, 1, 0, 3,
       3, 1, 3, 3, 3, 1, 1, 1, 3, 0, 0, 3, 1, 0, 3, 0, 0, 1, 3, 2])

In [61]:
regions = np.array(collected_data.index)
print("군집0 : ", regions[np.where(cluster_labels==0)])
print()
print("군집1 : ", regions[np.where(cluster_labels==1)])
print()
print("군집2 : ", regions[np.where(cluster_labels==2)])
print()
print("군집3 : ", regions[np.where(cluster_labels==3)])

군집0 :  ['고양시 덕양구' '광주시' '김포시' '성남시 분당구' '시흥시' '용인시 기흥구' '용인시 수지구' '의정부시' '파주시'
 '평택시']

군집1 :  ['가평군' '과천시' '구리시' '동두천시' '수원시 팔달구' '안성시' '양평군' '여주시' '연천군' '의왕시' '포천시']

군집2 :  ['남양주시' '부천시' '화성시']

군집3 :  ['고양시 일산동구' '고양시 일산서구' '광명시' '군포시' '성남시 수정구' '성남시 중원구' '수원시 권선구' '수원시 영통구'
 '수원시 장안구' '안산시 단원구' '안산시 상록구' '안양시 동안구' '안양시 만안구' '양주시' '오산시' '용인시 처인구'
 '이천시' '하남시']


## K=5

In [63]:
clusterer = KMeans(n_clusters=5, n_init="auto")
cluster_labels = clusterer.fit_predict(X)
cluster_labels

array([1, 0, 4, 4, 1, 4, 4, 3, 3, 0, 2, 1, 2, 0, 3, 3, 4, 4, 3, 3, 0, 4,
       4, 3, 4, 3, 3, 1, 1, 1, 3, 0, 4, 3, 1, 0, 3, 0, 0, 1, 4, 2])

In [64]:
regions = np.array(collected_data.index)
print("군집0 : ", regions[np.where(cluster_labels==0)])
print()
print("군집1 : ", regions[np.where(cluster_labels==1)])
print()
print("군집2 : ", regions[np.where(cluster_labels==2)])
print()
print("군집3 : ", regions[np.where(cluster_labels==3)])
print()
print("군집4 : ", regions[np.where(cluster_labels==4)])

군집0 :  ['고양시 덕양구' '김포시' '성남시 분당구' '시흥시' '용인시 기흥구' '의정부시' '파주시' '평택시']

군집1 :  ['가평군' '과천시' '동두천시' '양평군' '여주시' '연천군' '의왕시' '포천시']

군집2 :  ['남양주시' '부천시' '화성시']

군집3 :  ['구리시' '군포시' '성남시 수정구' '성남시 중원구' '수원시 장안구' '수원시 팔달구' '안성시' '안양시 만안구' '양주시'
 '오산시' '용인시 처인구' '이천시']

군집4 :  ['고양시 일산동구' '고양시 일산서구' '광명시' '광주시' '수원시 권선구' '수원시 영통구' '안산시 단원구' '안산시 상록구'
 '안양시 동안구' '용인시 수지구' '하남시']


## K=6

In [77]:
clusterer = KMeans(n_clusters=6, n_init="auto")
cluster_labels = clusterer.fit_predict(X)
cluster_labels

array([0, 1, 3, 3, 0, 3, 3, 4, 4, 1, 5, 0, 5, 1, 4, 4, 3, 3, 4, 4, 1, 3,
       3, 4, 3, 4, 4, 0, 0, 0, 4, 1, 3, 4, 0, 1, 4, 1, 1, 0, 3, 2])

In [69]:
regions = np.array(collected_data.index)
print("군집0 : ", regions[np.where(cluster_labels==0)])
print()
print("군집1 : ", regions[np.where(cluster_labels==1)])
print()
print("군집2 : ", regions[np.where(cluster_labels==2)])
print()
print("군집3 : ", regions[np.where(cluster_labels==3)])
print()
print("군집4 : ", regions[np.where(cluster_labels==4)])
print()
print("군집5 : ", regions[np.where(cluster_labels==5)])

군집0 :  ['고양시 덕양구' '김포시' '성남시 분당구' '시흥시' '용인시 기흥구' '의정부시' '파주시' '평택시']

군집1 :  ['구리시' '성남시 중원구' '수원시 팔달구' '안성시' '의왕시' '포천시']

군집2 :  ['남양주시' '부천시' '화성시']

군집3 :  ['고양시 일산동구' '고양시 일산서구' '광명시' '군포시' '성남시 수정구' '수원시 장안구' '안산시 단원구' '안양시 만안구'
 '양주시' '오산시' '용인시 처인구' '이천시']

군집4 :  ['광주시' '수원시 권선구' '수원시 영통구' '안산시 상록구' '안양시 동안구' '용인시 수지구' '하남시']

군집5 :  ['가평군' '과천시' '동두천시' '양평군' '여주시' '연천군']


시각화

In [85]:
import folium
import geopandas

In [88]:
#Import Libraries
import geopandas as gpd
import pandas as pd
import numpy as np
import folium
from folium.features import GeoJsonTooltip

#Read the geoJSON file using geopandas
geojson = gpd.read_file('georef-united-states-of-america-county.geojson')
geojson=geojson[['coty_code','geometry']] #only select 'coty_code' (country fips) and 'geometry' columns

#Read the covid-19 data using pandas
covid_df=pd.read_csv('United_States_COVID-19_County_Level.csv')


FileNotFoundError: [Errno 2] No such file or directory: 'United_States_COVID-19_County_Level.csv'

In [82]:
us_map = folium.Map(location=[40, -96], zoom_start=4,tiles='openstreetmap')
us_map

In [83]:
#Create the choropleth map add it to the base map
custom_scale = (df_final['new_cases_7days'].quantile((0,0.2,0.4,0.6,0.8,1))).tolist()
folium.Choropleth(
            geo_data=r'...\georef-united-states-of-america-county.geojson',
            data=df_final,
            columns=['fips_code', 'new_cases_7days'],  #Here we tell folium to get the county fips and plot new_cases_7days metric for each county
            key_on='feature.properties.coty_code', #Here we grab the geometries/county boundaries from the geojson file using the key 'coty_code' which is the same as county fips
            threshold_scale=custom_scale, #use the custom scale we created for legend
            fill_color='YlOrRd',
            nan_fill_color="White", #Use white color if there is no data available for the county
            fill_opacity=0.7,
            line_opacity=0.2,
            legend_name='New Cases Past 7 Days (Per 100K Population) ', #title of the legend
            highlight=True,
            line_color='black').add_to(us_map) 

us_map


NameError: name 'df_final' is not defined