In [1]:
import numpy as np
from scipy import stats
import math
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist
import sys
from haversine import haversine
import pandas as pd
import geopandas as gpd

In [70]:
# Load the cluster shapefile of each of the four quadrants.
quadrant_paths = [
    '../data/processed-labels/clusters/clusters_q1.shp',
    '../data/processed-labels/clusters/clusters_q2.shp',
    '../data/processed-labels/clusters/clusters_q3.shp',
    '../data/processed-labels/clusters/clusters_q4.shp',
]
q1, q2, q3, q4 = [gpd.read_file(path) for path in quadrant_paths]

In [71]:
#function to fix the problem that cluster_ids are not unique across label_types
def fix_cluster_ids(df):
    """Return a copy of ``df`` with cluster ids that are unique across label types.

    Rows are sorted by ``label_type`` and ``cluster_id``. Every distinct
    (label_type, cluster_id) pair then receives one new id: the 1-based position
    of the first row of that pair in the sorted frame. The new ids are therefore
    unique and increasing, but not necessarily consecutive.

    Parameters
    ----------
    df : DataFrame (or GeoDataFrame)
        Must contain the columns ``label_type`` and ``cluster_id``.

    Returns
    -------
    DataFrame
        A new frame, sorted by (label_type, cluster_id), in which the original
        ``cluster_id`` column is replaced by the new ids (the column ends up as
        the last column). The input frame is not modified.
    """
    key_cols = ['label_type', 'cluster_id']
    # sort_values returns a new frame, so the caller's frame is left untouched
    ordered = df.sort_values(by=key_cols)
    # 1-based position of every row in the sorted order
    ordered['new_cluster_id'] = range(1, len(ordered) + 1)
    if len(ordered) > 0:
        # Every row takes the position of the first row of its group. This is a
        # vectorised replacement for a per-row dictionary lookup and yields the
        # same ids; the guard keeps empty frames from reaching groupby at all.
        ordered['new_cluster_id'] = (
            ordered.groupby(key_cols, sort=False)['new_cluster_id'].transform('min')
        )
    # swap the old ids for the new ones
    ordered = ordered.drop(columns=['cluster_id'])
    return ordered.rename(columns={'new_cluster_id': 'cluster_id'})

In [72]:
# Make the cluster ids of every quadrant unique across label types.
q1, q2, q3, q4 = [fix_cluster_ids(quadrant) for quadrant in (q1, q2, q3, q4)]

In [73]:
# For each quadrant, record how many labels share each row's cluster.
for quadrant in (q1, q2, q3, q4):
    per_cluster = quadrant.groupby('cluster_id')['cluster_id']
    quadrant['cluster_label_count'] = per_cluster.transform('count')

In [74]:
# Shift each quadrant's cluster ids past the largest id of the previous quadrant
# so that ids are unique across quadrants. The order matters: each shift uses the
# already-shifted ids of the quadrant before it.
for previous, current in ((q1, q2), (q2, q3), (q3, q4)):
    offset = previous['cluster_id'].max()
    current['cluster_id'] = current['cluster_id'] + offset

In [75]:
# Stack the four quadrant frames into one frame with a fresh 0..n-1 index.
quadrant_frames = [q1, q2, q3, q4]
df = pd.concat(quadrant_frames, ignore_index=True)

In [76]:
# Restrict the combined frame to the label id, the label type and the three
# clustering columns (cluster_id, clustered, cluster_label_count).
df = df[['label_id', 'label_type', 'cluster_id', 'clustered','cluster_label_count']]

In [77]:
# If a label_id appears more than once (presumably because it was written to
# several quadrant files), keep exactly one of its rows: a clustered row
# (clustered = 1) wins over an unclustered one, and among those the row with the
# largest cluster wins. Picking a whole row keeps cluster_id, clustered and
# cluster_label_count consistent with each other; a column-wise max() could pair
# the cluster_id of one row with the clustered/count values of another.
# label_id becomes the (sorted) index of the result.
df = (
    df.sort_values(['clustered', 'cluster_label_count'], ascending=False)
      .drop_duplicates(subset='label_id', keep='first')
      .set_index('label_id')
      .sort_index()
)

In [79]:
# label_id is the index after the de-duplication step; move it back into a
# regular column and restore a default integer index.
df = df.reset_index()

In [80]:
# Preview the combined frame (one row per label_id).
df

Unnamed: 0,label_id,label_type,cluster_id,clustered,cluster_label_count
0,9,CurbRamp,90627,1,2
1,10,CurbRamp,90041,1,2
2,11,Occlusion,132310,0,1
3,12,SurfaceProblem,142576,0,1
4,13,Occlusion,132301,0,1
...,...,...,...,...,...
195496,233233,Obstacle,188873,0,1
195497,233234,Obstacle,190754,0,1
195498,233235,SurfaceProblem,193878,0,1
195499,233236,CurbRamp,147536,1,3


In [85]:
# Load the label shapefile (presumably every Seattle label, judging by the file
# name); it is merged with the clustered labels on label_id below.
labels = gpd.read_file('../data/processed-labels/seattle_labels_all/seattle_labels_all.shp')

In [88]:
# Outer-merge the clustered labels with the full label list so that labels which
# are absent from the cluster frame are kept as well. Only the key column of
# `labels` is needed: merging the whole frame would carry its other columns
# (including geometry) along for nothing, and any column name shared with `df`
# would be suffixed and break the selection below.
df1 = pd.merge(df, labels[['label_id']], on='label_id', how='outer')
# keep the label id and the three clustering columns
df1 = df1[['label_id', 'cluster_id', 'clustered', 'cluster_label_count']]
# labels without cluster information have NaN here; 0 marks "no cluster"
df1 = df1.fillna(0)
# the NaNs turned the columns into floats; convert everything back to int
df1 = df1.astype(int)

In [89]:
# Preview the merged frame; rows that are all zeros are labels without cluster data.
df1

Unnamed: 0,label_id,cluster_id,clustered,cluster_label_count
0,9,90627,1,2
1,10,90041,1,2
2,11,132310,0,1
3,12,142576,0,1
4,13,132301,0,1
...,...,...,...,...
195538,116833,0,0,0
195539,37995,0,0,0
195540,37983,0,0,0
195541,37989,0,0,0


In [90]:
# Write the per-label clustering table to CSV, leaving out the DataFrame index.
df1.to_csv('../data/features/clustered.csv', index=False)