# Segmenting CycleAtlanta Users/Trips
This notebook segments CycleAtlanta users and trips by running k-means clustering
on detour rate and/or speed.

In [20]:
#https://keurfonluu.github.io/stochopy/api/optimize.html

from pathlib import Path
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import geopandas as gpd
import networkx as nx
from tqdm import tqdm
import json
import pickle

# Folder that holds user_data_definition.json and the segments/ CSV exports.
fp = Path.home() / "Documents/GitHub/Impedance-Calibration"

#import matched traces
# Directory containing the pickled map-matching results.
export_fp = Path.home() / 'Documents/BikewaySimData/Projects/gdot/gps_traces'

# NOTE: unpickling can execute arbitrary code, so only load files produced locally.
matches_path = export_fp / 'test_matches.pkl'
trips_df = pickle.loads(matches_path.read_bytes())


In [21]:
# Show which columns the matched-trips frame provides.
trips_df.columns

Index(['tripid', 'start', 'end', 'start_end_dist_ft', 'match_ratio', 'linkids',
       'geometry', 'length_ft', 'shortest_length_ft', 'shortest_linkids',
       'shortest_geo', 'shortest_exact_overlap_length',
       'shortest_exact_overlap_prop', 'shortest_buffer_overlap',
       'shortest_intersect_geo'],
      dtype='object')

In [22]:
#add these earlier in the process
# Detour percentage: extra length of the chosen route relative to the shortest
# path, so 100 means the chosen route was twice as long as the shortest one.
# Non-positive shortest lengths are masked to NaN so the ratio cannot become inf.
_shortest_ft = trips_df['shortest_length_ft'].where(trips_df['shortest_length_ft'] > 0)
trips_df['detour_pct'] = (
    (trips_df['length_ft'] - _shortest_ft) / _shortest_ft * 100
).round(1)

In [76]:
# Folder with the raw CycleAtlanta CSV exports; the files are read without a header row.
data_fp = Path.home() / 'Documents/ridership_data/CycleAtlantaClean/9-10-16 Trip Lines and Data/raw data'

# Column names are assigned by position after reading.
col_names = ['tripid','userid','trip_type','description','starttime','endtime','notsure']
user_col = ['userid','created_date','device','email','age','gender','income','ethnicity','homeZIP','schoolZip','workZip','cyclingfreq','rider_history','rider_type','app_version']

#%% import trip info
trip = pd.read_csv(data_fp/"trip.csv", header = None)
trip.columns = col_names
# the last trip column is unidentified, so it is discarded
trip = trip.drop(columns=['notsure'])

#not sure what to do with the notes yet
#note = pd.read_csv(data_fp/'note.csv',header=None)

# import user info and filter columns
user = pd.read_csv(data_fp/"user.csv", header=None)
user.columns = user_col
user = user.drop(columns=['device','app_version','email'])

# merge trip and users
# Inner join on userid: each trip row gains the attributes of the user who recorded it.
trip_and_user = trip.merge(user, on='userid')

In [77]:
#import mapping dictionary
# Read the lookup tables inside a context manager so the file handle is closed
# as soon as parsing finishes.
with open(fp/'user_data_definition.json') as definitions_file:
    user_data_definitions = json.load(definitions_file)

# Decode every column that has a lookup table. JSON object keys are always
# strings, so values are cast to str before the lookup; values without an
# entry in the table become NaN.
# NOTE(review): presumably the tables map raw codes to labels, in which case
# re-running this cell on already-decoded columns would blank them - run once.
for col in sorted(set(trip_and_user.columns.tolist()) & set(user_data_definitions.keys())):
    trip_and_user[col] = trip_and_user[col].astype(str)
    trip_and_user[col] = trip_and_user[col].map(user_data_definitions[col])

In [81]:
# replace NAs
# Each placeholder the data uses for "missing" is swapped for NaN, one marker at a time.
for missing_marker in (-1, '-1', 'no data'):
    trip_and_user = trip_and_user.replace(missing_marker, np.nan)

In [82]:
# Spot-check a single cell (row label 4, schoolZip) after the NaN replacement.
trip_and_user.loc[4,'schoolZip']

nan

In [83]:
# Display the combined trip/user table.
trip_and_user

Unnamed: 0,tripid,userid,trip_type,description,starttime,endtime,created_date,age,gender,income,ethnicity,homeZIP,schoolZip,workZip,cyclingfreq,rider_history,rider_type
0,63,10,Commute,Lovely ride home the long way for excercise,2012-10-10 14:26:28,2012-10-10 17:25:55,2012-10-10 14:26:23,35-44,Male,"$100,000 or greater",White,30306,,30308,Several times per week,Since childhood,Strong & fearless
1,67,10,Commute,,2012-10-11 05:50:06,2012-10-11 08:49:49,2012-10-10 14:26:23,35-44,Male,"$100,000 or greater",White,30306,,30308,Several times per week,Since childhood,Strong & fearless
2,70,10,Commute,,2012-10-11 11:35:03,2012-10-11 14:34:44,2012-10-10 14:26:23,35-44,Male,"$100,000 or greater",White,30306,,30308,Several times per week,Since childhood,Strong & fearless
3,77,10,Commute,,2012-10-12 05:47:47,2012-10-12 08:47:21,2012-10-10 14:26:23,35-44,Male,"$100,000 or greater",White,30306,,30308,Several times per week,Since childhood,Strong & fearless
4,297,10,Commute,The long way for exercise.,2012-10-17 05:43:50,2012-10-17 08:42:34,2012-10-10 14:26:23,35-44,Male,"$100,000 or greater",White,30306,,30308,Several times per week,Since childhood,Strong & fearless
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34038,34335,1734,Social,,2016-09-02 04:28:43,2016-09-02 07:28:40,2016-08-12 04:19:24,25-34,Female,,White,37405,,37402,Daily,Several years,Enthused & confident
34039,34337,1734,Social,,2016-09-02 05:07:35,2016-09-02 08:07:31,2016-08-12 04:19:24,25-34,Female,,White,37405,,37402,Daily,Several years,Enthused & confident
34040,34155,1735,Other,,2016-08-22 13:22:23,2016-08-22 16:22:22,2016-08-22 13:22:23,25-34,Male,"$20,000 to $39,999",White,,,,,Several years,"Comfortable, but cautious"
34041,34276,1737,Commute,,2016-08-30 05:43:39,2016-08-30 08:43:38,2016-08-30 05:43:39,25-34,Male,"$100,000 or greater",White,30317,30332,,Daily,Since childhood,Strong & fearless


In [84]:
# Persist the combined trip/user table in the same folder as the matched traces.
trip_and_user.to_pickle(export_fp/'trip_and_user.pkl')

In [38]:

# merge to trips_df
# Joins on tripid using merge's default inner join, so only matched trips that
# also appear in trip_and_user are kept.
# NOTE(review): this rebinds trips_df, so re-running the cell merges a second
# time - run it once per fresh load of the matched traces.
trips_df = trips_df.merge(trip_and_user,on='tripid')
trips_df

Unnamed: 0,tripid,start,end,start_end_dist_ft,match_ratio,linkids,geometry,length_ft,shortest_length_ft,shortest_linkids,...,age,gender,income,ethnicity,homeZIP,schoolZip,workZip,cyclingfreq,rider_history,rider_type
0,8429,69464795,69282152,5532.278624,0.668571,"{50954.0, 50955.0, 39564.0, 39565.0, 39566.0, ...","MULTILINESTRING ((2239413.649 1375914.706, 223...",6854.630428,6043.405276,"{50954, 50955, 39564, 39565, 39566, 39567, 395...",...,25-34,Male,"$75,000 to $99,999",White,30307.0,30332,30332.0,no data,Since childhood,Enthused & confident
1,14941,1928281303,69191157,3199.67715,0.825,"{35526.0, 39015.0, 39016.0, 39017.0, 39018.0, ...","MULTILINESTRING ((2237909.462 1369925.713, 223...",5064.53796,3995.127609,"{38956, 38959, 38960, 38964, 38965, 38966, 389...",...,no data,Male,no data,no data,,,,no data,Several years,Enthused & confident
2,12708,69365780,3582046274,6063.648914,1.0,"{33536.0, 33541.0, 33560.0, 33472.0, 33475.0, ...","MULTILINESTRING ((2230345.963 1373229.917, 223...",6964.087559,6180.880607,"{33536, 33540, 33541, 33560, 33472, 33475, 334...",...,no data,no data,"$100,000 or greater",White,30308.0,,30303.0,Daily,Several years,Enthused & confident
3,30000,69365800,11204019840,6903.309982,1.0,"{32641.0, 33174.0, 33175.0, 35233.0, 35234.0, ...","MULTILINESTRING ((2233023.600 1373054.796, 223...",9094.985905,7620.85259,"{34304, 34307, 34308, 34310, 34311, 34312, 343...",...,25-34,Male,"$20,000 to $39,999",White,30308.0,30318,30318.0,Several times per week,Since childhood,"Comfortable, but cautious"
4,17052,7841654198,69182898,9527.541296,0.67451,"{51842.0, 33795.0, 33796.0, 51843.0, 33798.0, ...","MULTILINESTRING ((2229918.855 1365200.427, 222...",13265.968891,10651.686457,"{33796, 35593, 35347, 38940, 35360, 35361, 353...",...,no data,no data,no data,no data,30030.0,Im going to be 15-20 minutes lat,30308.0,no data,no data,no data


In [None]:
#export

In [40]:
#drop trips more than five miles
# length_ft is compared against five miles expressed in feet (5 * 5280).
less_than_five_miles = trips_df['length_ft'].lt(5*5280)
print(less_than_five_miles.sum())  # number of trips that are kept
trips_df = trips_df.loc[less_than_five_miles]

#for now, throw out trips mentioning group rides and those with detour rate above 100 (twice the distance)
# NOTE: the keyword list is only defined here; nothing in this cell applies it.
removal_words = ['critical mass','mobile social','dikov ride']

291


In [None]:
#%%
def export_segments(column_name,categorical,trips_df,values_to_exclude,user_data_definitions,sample_size=200):
    """Write one random sample of trips to CSV for each distinct value of a column.

    Parameters
    ----------
    column_name : str
        Column of ``trips_df`` whose distinct values define the segments.
    categorical : bool
        When True, the column is cast to ``str`` and translated through
        ``user_data_definitions[column_name]`` before segmenting.
    trips_df : pandas.DataFrame
        Trips to segment. It is not modified; any translation is applied to a
        derived frame.
    values_to_exclude : container
        Segment values to skip.
    user_data_definitions : dict
        Lookup tables keyed by column name; only read when ``categorical`` is True.
    sample_size : int, default 200
        Number of rows drawn per segment. Segments with fewer rows are
        reported on stdout and skipped.

    Returns
    -------
    None
        Each sample is written to ``fp/segments/{column_name}-{value}.csv``.
    """
    segments_df = trips_df
    if categorical:
        decoded = trips_df[column_name].astype(str).map(user_data_definitions[column_name])
        # assign() returns a new frame, so the caller's DataFrame keeps its
        # original values and repeated calls see the same input.
        segments_df = trips_df.assign(**{column_name: decoded})

    for value in segments_df[column_name].dropna().unique():
        if value in values_to_exclude:
            continue
        to_sample = segments_df[segments_df[column_name]==value]

        # Test the group size explicitly instead of catching every exception,
        # so unrelated failures (bad path, permissions) are not misreported
        # as a segment that is too small.
        if len(to_sample) < sample_size:
            print(value,'did not have enough values')
            continue

        out_dir = fp/'segments'
        # Create the export folder on first use so the write cannot fail on a missing directory.
        out_dir.mkdir(parents=True, exist_ok=True)
        sample = to_sample.sample(sample_size)
        sample.to_csv(out_dir/f'{column_name}-{value}.csv',index=False)
    
# (column, needs decoding through user_data_definitions) pairs, exported in this order.
segment_specs = [
    ('gender', True),
    ('ethnicity', True),
    ('age', True),
    ('income', True),
    ('trip_type', False),
    ('rider_type', False),
]
for segment_column, is_categorical in segment_specs:
    export_segments(segment_column,is_categorical,trips_df,['no data'],user_data_definitions)

#%%


# K-means

Variables:
trip distance
detour %
dist to: work, home, school

aim for 3-4 clusters

On longer trips people are more likely to detour; for short trips, is directness preferred?
Casual riders travel shorter distances and may be more avoidant of certain roads.


Some of the really high-detour trips are still loops.
They are valid, but pause detection needs to be improved.


most of the data is just winding up in one cluster, so i need to think harder about what i am clustering/grouping on


In [None]:

import matplotlib.pyplot as plt

#cluster using trip distance and detour %

# The notebook stores the detour percentage in 'detour_pct'. Missing values
# are dropped first so that both the histogram and the median are well defined.
detour_values = trips_df['detour_pct'].dropna()

fig, axis = plt.subplots(figsize =(10, 5))
# 5-point-wide bins with edges at 0, 5, ..., 295
bins = np.arange(0, 300, 5)
axis.hist(detour_values, bins = bins)
axis.set_xlabel('Percent Detour')
axis.set_ylabel('Frequency')

# Calculate the median
median_value = detour_values.median()

# Draw a vertical line at the median
axis.axvline(median_value, color='red', linestyle='dashed', linewidth=2, label=f'Median = {median_value}')

# Label the vertical line
#plt.text(median_value + 10, 20, f'Median = {median_value}', rotation=90, color='red')

# Displaying the graph
axis.legend()
plt.show()

#%%


#turn to array
# Cluster on the columns the notebook actually has: detour percentage and
# chosen-route length. Rows with missing or infinite values are left out
# because KMeans rejects them.
cluster_columns = ['detour_pct','length_ft']
cluster_features = trips_df[cluster_columns].replace([np.inf, -np.inf], np.nan).dropna()
X = np.asarray(cluster_features, dtype=float)

# Standardise each feature. Without this, length in feet (thousands) swamps
# the detour percentage (tens) in the Euclidean distance k-means minimises.
feature_mean = X.mean(axis=0)
feature_std = X.std(axis=0)
feature_std[feature_std == 0] = 1.0  # guard against a constant column
X_scaled = (X - feature_mean) / feature_std

# do clustering
# Fixed seed and explicit n_init so the labels are reproducible across runs.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X_scaled)

# Rows that were excluded above keep a missing cluster label.
trips_df.loc[cluster_features.index, 'cluster_label'] = kmeans.labels_
# Report the cluster centres back in the original units.
results = pd.DataFrame(kmeans.cluster_centers_ * feature_std + feature_mean, columns = cluster_columns)
print(results)

# for cluster_label in trips_df['cluster_label'].dropna().unique():
#     to_sample = trips_df[trips_df['cluster_label']==cluster_label]
#     sample = to_sample.sample(50)
#     sample.to_csv(fp/f'segments/cluster_{cluster_label}.csv',index=False)



#cluster using euclidean distance to work/home/school too


'''
come back to, right now most of the data is just in one cluster

K-prototypes (accepts both numerical and categorical)

Trip purpose
ethnicity
gender
income

'''







