In [24]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
%matplotlib inline
import seaborn as sns; sns.set_theme(color_codes=True)
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.colors
import warnings
warnings.filterwarnings('ignore')
from scipy import stats
import math
import numpy as np
from scipy import stats
import math
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist
import sys
from haversine import haversine

In [25]:
# Load the Project Sidewalk label attributes and preview the first five rows.
labels = gpd.read_file(filename='attributes_filtered.json')
labels.head(n=5)

Unnamed: 0,attribute_id,label_type,street_edge_id,osm_street_id,neighborhood,avg_image_capture_date,avg_label_date,severity,is_temporary,agree_count,disagree_count,notsure_count,geometry
0,70839935,CurbRamp,14822,98239835,Harbor Island,2017-09-01,2019-04-26 21:46:21.100,3.0,False,1,0,0,POINT (-122.35248 47.58456)
1,70839937,CurbRamp,14809,98239825,Harbor Island,2017-09-01,2019-04-22 00:58:33.914,2.0,False,1,0,0,POINT (-122.35236 47.57236)
2,70839941,CurbRamp,14809,98239825,Harbor Island,2017-09-01,2019-04-26 11:45:42.880,3.0,False,2,0,0,POINT (-122.35294 47.57284)
3,70839942,CurbRamp,14809,98239825,Harbor Island,2016-01-16,2019-04-26 11:45:33.643,3.0,False,2,0,0,POINT (-122.35302 47.57291)
4,70839944,CurbRamp,14833,98239870,Harbor Island,2017-09-01,2019-04-21 23:21:35.297,2.0,False,4,0,0,POINT (-122.35578 47.57989)


In [26]:
# Keep only the rows labelled as a curb ramp or a missing curb ramp.
curbs = labels.loc[labels.label_type.isin(['CurbRamp', 'NoCurbRamp'])]
curbs.head(n=5)

Unnamed: 0,attribute_id,label_type,street_edge_id,osm_street_id,neighborhood,avg_image_capture_date,avg_label_date,severity,is_temporary,agree_count,disagree_count,notsure_count,geometry
0,70839935,CurbRamp,14822,98239835,Harbor Island,2017-09-01,2019-04-26 21:46:21.100,3.0,False,1,0,0,POINT (-122.35248 47.58456)
1,70839937,CurbRamp,14809,98239825,Harbor Island,2017-09-01,2019-04-22 00:58:33.914,2.0,False,1,0,0,POINT (-122.35236 47.57236)
2,70839941,CurbRamp,14809,98239825,Harbor Island,2017-09-01,2019-04-26 11:45:42.880,3.0,False,2,0,0,POINT (-122.35294 47.57284)
3,70839942,CurbRamp,14809,98239825,Harbor Island,2016-01-16,2019-04-26 11:45:33.643,3.0,False,2,0,0,POINT (-122.35302 47.57291)
4,70839944,CurbRamp,14833,98239870,Harbor Island,2017-09-01,2019-04-21 23:21:35.297,2.0,False,4,0,0,POINT (-122.35578 47.57989)


In [27]:
#cluster function
def custom_dist(u, v):
    """Distance metric for `pdist`: haversine distance between two coordinate pairs.

    Only the first two elements of each argument are used. `cluster` calls this
    with rows taken from the ['lat', 'lng'] columns, so each pair is (lat, lng).
    The result is in haversine's default unit (kilometres per the package
    documentation; a comment in `cluster` also treats distances as kilometres).
    """
    first_point = [u[0], u[1]]
    second_point = [v[0], v[1]]
    return haversine(first_point, second_point)
    
# Distance cut-off per label type, handed to fcluster(criterion='distance') in `cluster`.
# Presumably kilometres (the unit of the haversine distances being clustered) — confirm.
thresholds = {
    'CurbRamp': 0.0035,
    'NoCurbRamp': 0.0035,
    # Every label type below shares the same 0.01 cut-off.
    **dict.fromkeys(
        ['SurfaceProblem', 'Obstacle', 'NoSidewalk', 'Crosswalk', 'Signal', 'Occlusion', 'Other'],
        0.01,
    ),
    # Combined CurbRamp + NoCurbRamp pseudo-type.
    'Curbs': 0.0030,
}

def cluster(labels, label_type, thresholds):
    """Assign a spatial cluster id to every label using complete-linkage clustering.

    Parameters
    ----------
    labels : DataFrame
        Must contain 'lat' and 'lng' columns. Needs at least two rows; the
        caller `cluster_label_type_at_index` handles the 0- and 1-row cases.
    label_type : str
        Key into `thresholds`; also recorded as the type of every cluster.
    thresholds : dict
        Maps a label type to the distance at which the linkage tree is cut.

    Returns
    -------
    (cluster_df, labelsCopy)
        cluster_df has one row per cluster with columns ['label_type', 'cluster_id'].
        labelsCopy is a copy of `labels` with an added 'cluster_id' column.
    """
    # Condensed pairwise distance matrix over the (lat, lng) pairs, using custom_dist.
    coords = labels[['lat', 'lng']].to_numpy()
    dist_matrix = pdist(coords, custom_dist)
    link = linkage(dist_matrix, method='complete')

    # Cut the tree at the threshold for this label type; the input frame is left
    # untouched and the ids are attached to a copy.
    labelsCopy = labels.copy()
    labelsCopy.loc[:, 'cluster_id'] = fcluster(link, t=thresholds[label_type], criterion='distance')

    # One (label_type, cluster_id) row per cluster. The cluster's type is the
    # requested label_type; previously the whole per-label type array was stored
    # in every row of this column.
    cluster_list = [(label_type, clust_num) for clust_num, _ in labelsCopy.groupby('cluster_id')]
    cluster_df = pd.DataFrame(cluster_list, columns=['label_type', 'cluster_id'])

    return (cluster_df, labelsCopy)


# Label types selectable by index in cluster_label_type_at_index. 'Curbs' is a
# combined type that gathers every label whose type is listed in curb_types.
label_types = [
    'Curbs',
    'CurbRamp',
    'NoSidewalk',
    'Occlusion',
    'SurfaceProblem',
    'Obstacle',
    'Other',
    'NoCurbRamp',
    'Crosswalk',
    'Signal',
]
curb_types = ['CurbRamp', 'NoCurbRamp']

# Column layouts used for the empty / single-label result frames of
# cluster_label_type_at_index (labels and clusters, respectively).
label_cols = ['label_id', 'label_type', 'cluster_id']
cluster_cols = ['label_type', 'cluster_id', 'lat', 'lng', 'severity', 'temporary']


def cluster_label_type_at_index(label_data, i):
    """Cluster the labels of the type found at position `i` of `label_types`.

    Parameters
    ----------
    label_data : DataFrame with a 'label_type' column (plus whatever `cluster` needs).
    i : int index into the module-level `label_types` list.

    Returns
    -------
    (label_type, clusters_for_type_i, labels_for_type_i)
        With no matching labels both frames are empty, with the `cluster_cols` /
        `label_cols` layouts. With exactly one label it becomes cluster 1. With
        more than one, the result of `cluster` is returned.
    """
    label_type = label_types[i]

    # 'Curbs' is a combined type that selects every label listed in curb_types.
    if label_type == 'Curbs':
        type_mask = label_data.label_type.isin(curb_types)
    else:
        type_mask = label_data.label_type == label_type
    type_data = label_data[type_mask]

    n_labels = type_data.shape[0]
    if n_labels > 1:
        # Enough labels to build a linkage tree.
        clusters_for_type_i, labels_for_type_i = cluster(type_data, label_type, thresholds)
    elif n_labels == 1:
        # A lone label forms its own cluster with id 1.
        labels_for_type_i = type_data.copy()
        labels_for_type_i.loc[:, 'cluster_id'] = 1
        # NOTE(review): this overwrites the row's label_type with the requested type
        # (e.g. 'Curbs'), which the multi-label branch does not do — confirm intended.
        labels_for_type_i.loc[:, 'label_type'] = label_type
        clusters_for_type_i = labels_for_type_i.filter(items=cluster_cols)
    else:
        # Nothing to cluster: hand back empty frames with the expected columns.
        clusters_for_type_i = pd.DataFrame(columns=cluster_cols)
        labels_for_type_i = pd.DataFrame(columns=label_cols)

    return (label_type, clusters_for_type_i, labels_for_type_i)

In [28]:
# Split the point geometry into separate lat / lng columns for clustering.
# `curbs` is a filtered view of `labels`, so take an explicit copy before adding
# columns; this avoids pandas' SettingWithCopyWarning on the assignments below.
curbs = curbs.copy()
# GeoSeries.y / GeoSeries.x read the coordinates of every point in one vectorised call.
curbs['lat'] = curbs['geometry'].y
curbs['lng'] = curbs['geometry'].x

In [29]:
# Split the curb labels into four rectangular windows with the .cx coordinate indexer.
# Each tuple is (lng_start, lng_stop, lat_start, lat_stop) for q1, q2, q3 and q4 in turn.
# NOTE(review): the latitude ranges overlap between 47.649 and 47.651, so a label in
# that band can fall into two quadrants — confirm this is intended.
q1, q2, q3, q4 = (
    curbs.cx[lng_start:lng_stop, lat_start:lat_stop]
    for lng_start, lng_stop, lat_start, lat_stop in (
        (-122.323, -122.418, 47.734, 47.649),
        (-122.239, -122.323, 47.734, 47.649),
        (-122.239, -122.323, 47.651, 47.495),
        (-122.323, -122.418, 47.651, 47.495),
    )
)

In [30]:
# i=0 selects 'Curbs' in label_types; element [2] of the result is the labels frame with cluster ids.
clustered_q1 = cluster_label_type_at_index(label_data=q1, i=0)[2]

In [35]:
# i=0 selects 'Curbs' in label_types; element [2] of the result is the labels frame with cluster ids.
clustered_q2 = cluster_label_type_at_index(label_data=q2, i=0)[2]

In [36]:
# i=0 selects 'Curbs' in label_types; element [2] of the result is the labels frame with cluster ids.
clustered_q3 = cluster_label_type_at_index(label_data=q3, i=0)[2]

In [37]:
# i=0 selects 'Curbs' in label_types; element [2] of the result is the labels frame with cluster ids.
clustered_q4 = cluster_label_type_at_index(label_data=q4, i=0)[2]

In [33]:
def filter_conflicts(df):
    """Resolve clusters that contain more than one distinct label type.

    Clusters whose rows all share one label_type are kept unchanged. A cluster
    with several label types is reduced to the single row with the latest
    'avg_label_date'.

    Parameters
    ----------
    df : DataFrame with 'cluster_id', 'label_type' and 'avg_label_date' columns.
        It is not modified; the date conversion happens on an internal copy.

    Returns
    -------
    DataFrame with a fresh 0..n-1 index: the resolved conflict rows first, then
    all rows of the single-type clusters. 'avg_label_date' is datetime in the result.
    """
    # Work on a copy so the caller's frame keeps its original avg_label_date values.
    labels = df.copy()
    labels['avg_label_date'] = pd.to_datetime(labels['avg_label_date'])

    # Count the distinct label types per cluster and find the single-type clusters.
    types_per_cluster = labels.groupby('cluster_id')['label_type'].nunique()
    single_type_ids = types_per_cluster[types_per_cluster == 1].index
    in_single_type_cluster = labels['cluster_id'].isin(single_type_ids)

    consistent = labels[in_single_type_cluster]
    conflicting = labels[~in_single_type_cluster]

    # Within each conflicting cluster the most recent label wins: sort by date,
    # then keep only the last row per cluster.
    latest_per_conflict = (
        conflicting
        .sort_values(['cluster_id', 'avg_label_date'])
        .drop_duplicates('cluster_id', keep='last')
    )

    return pd.concat([latest_per_conflict, consistent], ignore_index=True)



In [38]:
# Resolve label-type conflicts within each quadrant's clusters.
filtered_q1, filtered_q2, filtered_q3, filtered_q4 = [
    filter_conflicts(quadrant)
    for quadrant in (clustered_q1, clustered_q2, clustered_q3, clustered_q4)
]

In [43]:
# Stack the four filtered quadrants into a single frame (row-wise concatenation).
quadrant_frames = [filtered_q1, filtered_q2, filtered_q3, filtered_q4]
curbs_all = pd.concat(quadrant_frames, axis=0)
del quadrant_frames

In [49]:
# A label can appear in more than one quadrant; keep the first row per attribute_id.
curbs_all = curbs_all.drop_duplicates(subset='attribute_id', keep='first')

In [56]:
# Write the processed curb labels to disk as GeoJSON.
curbs_all.to_file(filename="curbs_processed.json", driver="GeoJSON")