In [None]:
import json
import os
import sys
from collections import defaultdict

import geopandas as gpd
import pandas as pd
import osmnx as ox
import numpy as np
import matplotlib.pyplot as plt
from pyproj import Geod
import seaborn as sns
from tqdm import tqdm
from scipy.stats import gaussian_kde
from scipy.integrate import quad
from matplotlib.colors import LinearSegmentedColormap, to_hex

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn deprecation and
# copy warnings raised by later cells — confirm that is intended.
warnings.filterwarnings('ignore')

# pyproj geodesic helper on the WGS84 ellipsoid.
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
geod = Geod(ellps='WGS84')

In [None]:
# Mapping of group key -> list of city names. The group key doubles as the
# sub-directory under data_index/bldg/ that holds each city's index CSV.
# A context manager is used so the file handle is closed deterministically.
with open('data_index/bldg/cities.json') as cities_file:
    cities = json.load(cities_file)

In [None]:

# Exclusive upper bound on the complexity score kept for the distribution
# plots; set to None to keep every row.
FILTER = 50

city_frames = []
for key, cities_list in cities.items():
    for i, city in enumerate(cities_list):
        # Reading one index CSV takes roughly 8 s.
        csv_path = f'data_index/bldg/{key}/buildings_index_{city}.csv'
        indexes = pd.read_csv(csv_path)
        indexes['city'] = city
        if FILTER is not None:
            indexes = indexes.loc[indexes['cplx'] < FILTER]
        # Only the city label, complexity and area columns are needed here.
        df_cplx = indexes.loc[:, ['city', 'cplx', 'area']]
        city_frames.append(df_cplx)
# One long table with a row per building across all cities.
df_cplxs = pd.concat(city_frames)


In [None]:

# One fully saturated base colour per city group; the loop below takes them in
# order and derives each group's per-city shades from its base colour.
base_colors = ['#FF0000', '#00FF00', '#0000FF', '#FFFF00', '#FF00FF']


def generate_shades(base_color, n):
    """Return ``n`` hex colours running from a light tint up to ``base_color``.

    A white -> ``base_color`` colormap with ``n + 5`` discrete levels is built
    and its five lightest levels are discarded, so the returned list starts at
    a visible tint rather than at pure white.
    """
    n_skipped = 5
    shade_map = LinearSegmentedColormap.from_list(
        "shade", ["#FFFFFF", base_color], N=n + n_skipped
    )
    return [to_hex(shade_map(level)) for level in range(n_skipped, shade_map.N)]


# Build one colour per city, group by group in the iteration order of
# `cities`: every group gets its own base colour and each of its cities a
# distinct shade of it.
if len(cities) > len(base_colors):
    # Fail with an explicit message instead of running out of colours mid-loop.
    raise ValueError(
        f'{len(cities)} city groups but only {len(base_colors)} base colours'
    )

colors = []
# Pair groups with base colours without consuming `base_colors`, so the
# palette is still intact after this loop and the loop can be re-run as is.
for (key, city), base_color in zip(cities.items(), base_colors):
    colors += generate_shades(base_color, len(city))


# Box plot of the complexity score per city, one box per city, coloured with
# the per-city palette in `colors`; the figure is written to a PDF.
axis_label_style = {'fontsize': 16, 'fontname': 'Times New Roman', 'weight': 'bold'}

plt.figure(figsize=(16, 5))
sns.boxplot(
    data=df_cplxs,
    x='city',
    y='cplx',
    palette=colors,
    width=0.5,
    dodge=True,
    showfliers=False,  # outlier points are not drawn
)

plt.xticks(rotation=45, ha='right', fontname='Times New Roman')
plt.xlabel('Cities', **axis_label_style)
plt.ylabel('Complexity', **axis_label_style)
plt.ylim(0, 50)
plt.savefig('data_index/bldg/cplx_boxplot.pdf', bbox_inches='tight', dpi=300)

In [None]:
# Cache file holding, per city, [cplx_entropy, cplx_IoD, area_entropy, area_IoD].
IDX_CITIES_PATH = 'data_index/bldg/idx_cities.json'


def entropy_integral(x, density_fn=None):
    """Evaluate ``p(x) * log(p(x))`` for a density estimate ``p``.

    Parameters
    ----------
    x : float or array-like
        Point(s) at which the integrand is evaluated.
    density_fn : callable, optional
        Density estimate to evaluate. When omitted, the module-level ``kde``
        is used, which is what the original one-argument form relied on; a
        ``NameError`` is raised if no such global exists.

    Returns
    -------
    numpy.ndarray
        ``p * log(p)`` element-wise, with the value defined as 0 wherever the
        estimated density is not positive.
    """
    if density_fn is None:
        density_fn = kde  # backward-compatible fallback to the global estimate
    density = np.asarray(density_fn(x), dtype=float)
    # Far from the data the KDE can underflow to exactly 0, and 0 * log(0)
    # evaluates to NaN, which would turn the whole integral into NaN. Use the
    # limit value 0 there instead (log(1) == 0 makes the product 0).
    safe_density = np.where(density > 0, density, 1.0)
    return density * np.log(safe_density)


def _kde_entropy(values, lower_bound, upper_bound):
    """Integrate ``-p log p`` over [lower_bound, upper_bound] for a Gaussian KDE of ``values``."""
    density_fn = gaussian_kde(values)

    def integrand(x):
        # quad expects a plain float; the KDE returns a length-1 array.
        return -float(np.ravel(entropy_integral(x, density_fn))[0])

    entropy_value, _ = quad(integrand, lower_bound, upper_bound)
    return entropy_value


def _index_of_dispersion(values):
    """Return variance / mean of ``values`` as a plain float (JSON-serialisable)."""
    return float(values.var() / values.mean())


if os.path.exists(IDX_CITIES_PATH):
    # The indices are already cached on disk; skip the expensive recomputation
    # so that a full top-to-bottom run does not stop here.
    print(f'{IDX_CITIES_PATH} already exists; skipping recomputation')
else:
    idx_cities = {}
    for city, df_city in tqdm(df_cplxs.groupby('city')):
        cplx_values = df_city['cplx'].values
        area_values = df_city['area'].values
        # Order matters: it must match the `idx_column` list used when the
        # JSON file is loaded back into a DataFrame.
        idx_cities[city] = [
            _kde_entropy(cplx_values, 0, 50),
            _index_of_dispersion(cplx_values),
            _kde_entropy(area_values, 0, 10000),
            _index_of_dispersion(area_values),
        ]

    # Context manager guarantees the file is flushed and closed before any
    # later cell reads it back.
    with open(IDX_CITIES_PATH, 'w') as idx_file:
        json.dump(idx_cities, idx_file)


In [None]:
# Column order of the per-city index vectors stored in idx_cities.json.
idx_column = [
    "cplx_entropy",
    "cplx_IoD",
    "area_entropy",
    "area_IoD",
]
# Read the cached indices; the context manager closes the file handle.
with open('data_index/bldg/idx_cities.json') as idx_file:
    idx_cities = json.load(idx_file)
# One row per city (index = city name), one column per index.
df_idx = pd.DataFrame.from_dict(idx_cities, orient='index', columns=idx_column)

X = df_idx.area_entropy.values
Y = df_idx.cplx_entropy.values


from sklearn.cluster import KMeans
# Cluster the cities into 4 groups in the (area entropy, complexity entropy)
# plane; the fixed random_state keeps the labelling reproducible.
kmeans = KMeans(n_clusters=4, random_state=0).fit(np.array([X, Y]).T)

plt.figure(figsize=(8, 8))
# One scatter call per k-means cluster, each with its own marker; colours come
# from matplotlib's default colour cycle.
for cluster_id, marker in enumerate(['s', 'o', '^', '*']):
    in_cluster = kmeans.labels_ == cluster_id
    plt.scatter(X[in_cluster], Y[in_cluster], marker=marker)

entropy_label_font = dict(fontsize=16, fontname='Times New Roman', weight='bold')
plt.xlabel('Area Entropy', **entropy_label_font)
plt.ylabel('Complexity Entropy', **entropy_label_font)

# Hand-tuned label placement per city (presumably to keep names from
# overlapping); cities not listed get the default "above" placement.
labels_right = ('Bangkok', 'Lisbon', 'Melbourne', 'Seoul', 'San Francisco', 'Munich', 'Singapore', 'Tokyo')
labels_below = ('Kiev', "Kampala", 'Los Angeles', 'Shanghai', 'Vancouver', 'Tianjin', 'Delft')
labels_left = ('Manchester', 'Sao Paulo', 'Paris')
labels_above_wide = ('Guangzhou', 'Mexico City', 'New York')

for city in df_idx.index:
    x_pt = df_idx.loc[city, 'area_entropy']
    y_pt = df_idx.loc[city, 'cplx_entropy']
    if city in labels_right:          # to the right of the marker
        label_xy = (x_pt + 0.025, y_pt - 0.009)
    elif city in labels_below:        # below the marker
        label_xy = (x_pt - 0.01 * len(city), y_pt - 0.036)
    elif city in labels_left:         # to the left of the marker
        label_xy = (x_pt - 0.031 * len(city), y_pt - 0.009)
    elif city in labels_above_wide:   # above, with a larger offset
        label_xy = (x_pt - 0.013 * len(city), y_pt + 0.02)
    else:                             # default: just above the marker
        label_xy = (x_pt - 0.01 * len(city), y_pt + 0.015)
    plt.annotate(city, label_xy, fontname='Times New Roman')
# plt.xlim(4, 8)
plt.savefig('data_index/bldg/area_cplx_entropy.pdf', bbox_inches='tight', dpi=300)

In [None]:
# Column order of the per-city index vectors stored in idx_cities.json.
idx_column = [
    "cplx_entropy",
    "cplx_IoD",
    "area_entropy",
    "area_IoD",
]
# Read the cached indices; the context manager closes the file handle.
with open('data_index/bldg/idx_cities.json') as idx_file:
    idx_cities = json.load(idx_file)
# One row per city (index = city name), one column per index.
df_idx = pd.DataFrame.from_dict(idx_cities, orient='index', columns=idx_column)

X = df_idx.area_IoD.values
Y = df_idx.cplx_IoD.values
# Divide each axis by its own maximum before clustering, so both indices are
# on a comparable scale; the unscaled X / Y are kept for plotting.
X_ = X / X.max()
Y_ = Y / Y.max()

# K-means
from sklearn.cluster import KMeans
# 4 clusters on the scaled (area IoD, complexity IoD) pairs, fixed seed.
kmeans = KMeans(n_clusters=4, random_state=0).fit(np.array([X_, Y_]).T)
plt.figure(figsize=(8, 8))
# Draw each k-means cluster with its own marker and colour, using the
# unscaled IoD values as coordinates.
cluster_styles = [('s', 'b'), ('o', 'orange'), ('^', 'green'), ('*', 'red')]
for cluster_id, (marker, marker_color) in enumerate(cluster_styles):
    in_cluster = kmeans.labels_ == cluster_id
    plt.scatter(X[in_cluster], Y[in_cluster], marker=marker, color=marker_color)

iod_label_font = dict(fontsize=16, fontname='Times New Roman', weight='bold')
plt.xlabel('Area IoD', **iod_label_font)
plt.ylabel('Complexity IoD', **iod_label_font)
# Label every point with its city name, anchored at the point itself.
for city in df_idx.index:
    point_xy = (df_idx.loc[city, 'area_IoD'], df_idx.loc[city, 'cplx_IoD'])
    plt.annotate(city, point_xy, fontname='Times New Roman')


In [None]:
import joypy

plt.rcParams['font.sans-serif'] = ['Times New Roman']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['font.size'] = 16

# Ridgeline plot of the complexity distribution for the first 24 cities of
# df_idx (the remaining cities are drawn in the next cell).
first_cities = df_idx.index.values[:24]
df_first = df_cplxs[df_cplxs['city'].isin(first_cities)]

# joypy draws one ridge per group in sorted city-name order and, when given a
# list, takes color[i] for ridge i. `colors` however is ordered like
# cities.json (group by group), so slicing it positionally would give cities
# the shade of a different city/group. Look each colour up by city name.
color_by_city = dict(zip((name for group in cities.values() for name in group), colors))
ridge_colors = [color_by_city[name] for name in sorted(df_first['city'].unique())]

fig, axes = joypy.joyplot(df_first, by="city", column="cplx", figsize=(10, 10), legend=False, x_range=[0, 50], overlap=0.5, linewidth=1, color=ridge_colors)
plt.savefig('data_index/bldg/cplx_joyplot-1.pdf', bbox_inches='tight', dpi=300)

In [None]:
import joypy

plt.rcParams['font.sans-serif'] = ['Times New Roman']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['font.size'] = 16

# Ridgeline plot of the complexity distribution for the cities of df_idx from
# position 24 onwards (the first 24 are drawn in the previous cell).
remaining_cities = df_idx.index.values[24:]
df_remaining = df_cplxs[df_cplxs['city'].isin(remaining_cities)]

# joypy draws one ridge per group in sorted city-name order and, when given a
# list, takes color[i] for ridge i. `colors` however is ordered like
# cities.json (group by group), so slicing it positionally would give cities
# the shade of a different city/group. Look each colour up by city name.
color_by_city = dict(zip((name for group in cities.values() for name in group), colors))
ridge_colors = [color_by_city[name] for name in sorted(df_remaining['city'].unique())]

fig, axes = joypy.joyplot(df_remaining, by="city", column="cplx", figsize=(10, 10), legend=False, x_range=[0, 50], overlap=0.5, linewidth=1, color=ridge_colors)

# Saving of this second figure is currently disabled:
# plt.savefig('data_index/bldg/cplx_joyplot-2.pdf', bbox_inches='tight', dpi=300)

In [None]:
# NOTE: FILTER is (re)set here but deliberately not applied in this cell —
# the building-level feature table keeps every row.
FILTER = 50

feature_frames = []
for key, cities_list in cities.items():
    for city in cities_list:
        # Reading one index CSV takes roughly 8 s.
        indexes = pd.read_csv(f'data_index/bldg/{key}/buildings_index_{city}.csv')
        indexes['city'] = city
        # Keep the columns at positions 1..9 and attach the city label.
        # `.assign` returns a new frame instead of writing into a frame that
        # was derived from a selection of `indexes`, which avoids the
        # chained-assignment (SettingWithCopy) pattern.
        df_ = indexes[list(indexes.columns)[1:10]].assign(city=city)
        feature_frames.append(df_)

# One row per building across all cities; `city` is the label column.
df = pd.concat(feature_frames)

df.head()

In [None]:
# Preview the first rows of the combined per-building feature table.
df.head()

In [None]:
# classification
from sklearn.cluster import KMeans

# Every column of df except the last one, as a plain array.
data = df.iloc[:, :-1].values

data_shape_raw = data.shape
print(data.shape)
# Trim outliers column by column: keep only rows strictly between the 1st and
# 99th percentile of the current column. Percentiles are recomputed on the
# rows that survived the previous columns. Column 4 is left untrimmed.
for i in range(data.shape[1]):
    if i != 4:
        column = data[:, i]
        low = np.percentile(column, 1)
        high = np.percentile(column, 99)
        data = data[(column > low) & (column < high)]
print(data.shape)
print(data_shape_raw[0] - data.shape[0], 'samples removed')


In [None]:
# Cluster the rows of `data` into 4 groups with a fixed seed. This rebinds
# `kmeans`, replacing the city-level model fitted in earlier cells.
# NOTE(review): the features are clustered on their raw scales (the min-max
# normalisation in the previous cell is commented out) — confirm that is intended.
kmeans = KMeans(n_clusters=4, random_state=0).fit(data)

In [None]:
# Number of rows assigned to each of the 4 clusters, as a tuple.
tuple(data[kmeans.labels_ == cluster_id].shape[0] for cluster_id in range(4))

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from math import pi

# data generation
# Radar axes: every column of df except the last one.
categories = list(df.columns)[:-1]
N = len(categories)

# Mean feature vector of each of the 4 clusters, computed on the rows of `data`.
cluster_means = [data[kmeans.labels_ == cluster_id].mean(axis=0) for cluster_id in range(4)]

d = pd.DataFrame(cluster_means, columns=categories)
# Min-max scale every feature across the four cluster means, so each radar
# axis runs from 0 (lowest cluster mean) to 1 (highest cluster mean).
column_min = d.min()
column_max = d.max()
d_ = (d - column_min) / (column_max - column_min)

scaled_rows = [d_.iloc[row, :].values for row in range(4)]

print(*(len(values) for values in scaled_rows))

# Repeat the first value at the end of each series so the radar polygon closes.
data_type1, data_type2, data_type3, data_type4 = (
    np.append(values, values[0]) for values in scaled_rows
)

In [None]:
# Display the four closed radar series (scaled cluster means with the first value repeated).
data_type1, data_type2, data_type3, data_type4

In [None]:


# Evenly spaced spoke angles, one per category; the first angle is repeated at
# the end so that every polygon closes.
angles = [n / float(N) * 2 * pi for n in range(N)]
angles.append(angles[0])

fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(polar=True))

# (values, legend label, outline colour, fill colour) for each cluster.
radar_series = [
    (data_type1, 'Building Type 1', 'blue', 'b'),
    (data_type2, 'Building Type 2', 'orange', 'orange'),
    (data_type3, 'Building Type 3', 'green', 'green'),
    (data_type4, 'Building Type 4', 'red', 'red'),
]
for values, legend_label, line_color, fill_color in radar_series:
    ax.plot(angles, values, linewidth=2, linestyle='solid', label=legend_label, color=line_color)
    ax.fill(angles, values, fill_color, alpha=0.1)

# Put the first category at the top and run clockwise.
ax.set_theta_offset(pi / 2)
ax.set_theta_direction(-1)

ax.set_rgrids([0.2, 0.4, 0.6, 0.8, 1.0], angle=0)

plt.xticks(angles[:-1], categories, fontsize=16, ha='center', fontname='Times New Roman', weight='bold')
# Align each category label according to its angle: centred at angle 0 and pi,
# left-aligned for angles in (0, pi), right-aligned otherwise.
for label, angle in zip(ax.get_xticklabels(), angles):
    if angle in [0, pi]:
        alignment = 'center'
    elif 0 < angle < pi:
        alignment = 'left'
    else:
        alignment = 'right'
    label.set_horizontalalignment(alignment)

ax.tick_params(pad=10)
# Start the radial axis slightly below 0 so the minimum values stay off the centre point.
ax.set_ylim(-0.1, 1)

legend_font = {'family': 'Times New Roman', 'weight': 'bold', 'size': 16}
plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1), prop=legend_font)

plt.show()


In [None]:
# Number of samples that received a cluster label from the current `kmeans` model.
len(kmeans.labels_)

In [None]:
# Preview the first rows of the per-building feature table `df`.
df.head()

In [None]:
# (rows, columns) of the complexity/area table used for the distribution plots.
df_cplxs.shape

In [None]:
# Names of the columns at positions 1..9 of `indexes` (the most recently loaded
# index CSV) — the same positional slice used to build the feature table.
list(indexes.columns)[1:10]