# Segmenting CycleAtlanta Users/Trips
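This notebook performs k-means clustering of CycleAtlanta users and trips using detour rate and/or speed.
It also joins the matched GPS traces with the trip and user survey tables and exports per-segment samples.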

In [None]:
#https://keurfonluu.github.io/stochopy/api/optimize.html

from pathlib import Path
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import geopandas as gpd
import networkx as nx
from tqdm import tqdm
import json
import pickle

# Root of the Impedance-Calibration repository; the user-data definitions JSON
# is read from here and the segment CSVs are written beneath it.
fp = Path.home() / "Documents/GitHub/Impedance-Calibration"

#import matched traces
# Folder holding the matched GPS traces (also receives the pickled trip/user table).
export_fp = Path.home() / 'Documents/BikewaySimData/Projects/gdot/gps_traces'

# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
with open(export_fp / 'test_matches.pkl', mode='rb') as fh:
    trips_df = pickle.load(fh)


In [None]:
# list the columns available in the matched-trips table
trips_df.columns

In [None]:
#add these earlier in the process
# Percent detour: how much longer the travelled route is than the shortest
# route, as a percentage of the shortest route's length.
# NOTE(review): a shortest length of 0 would give inf/NaN here -- confirm none exist.
trips_df['detour_pct'] = (
    (trips_df['length_ft'] - trips_df['shortest_length_ft'])
    / trips_df['shortest_length_ft']
    * 100
).round(1)

In [None]:
# Folder containing the raw CycleAtlanta CSV tables.
data_fp = Path.home() / 'Documents/ridership_data/CycleAtlantaClean/9-10-16 Trip Lines and Data/raw data'

#%% import trip info
# trip.csv is read without a header row, so column names are assigned here.
# The final column's meaning is unknown ('notsure'); it is discarded right away.
col_names = ['tripid','userid','trip_type','description','starttime','endtime','notsure']
trip = pd.read_csv(data_fp/"trip.csv", header = None)
trip.columns = col_names
trip = trip.drop(columns=['notsure'])

#not sure what to do with the notes yet
#note = pd.read_csv(data_fp/'note.csv',header=None)

# import user info and filter columns
# user.csv is read without a header row, so column names are assigned positionally.
user = pd.read_csv(data_fp/"user.csv", header=None)
user_col = ['userid','created_date','device','email','age','gender','income','ethnicity','homeZIP','schoolZip','workZip','cyclingfreq','rider_history','rider_type','app_version']
user.columns = user_col
# Drop the device, app version and email columns.
user.drop(columns=['device','app_version','email'], inplace=True)

# merge trip and users
# Attach each trip's user attributes through the shared 'userid' key. The
# default merge is an inner join, so trips without a matching user row are dropped.
trip_and_user = trip.merge(user, on='userid')

In [None]:
#import mapping dictionary
# Loaded with a context manager so the file handle is closed promptly.
# NOTE(review): assumed layout is {column name: {code-as-string: label}},
# inferred from how it is indexed below -- confirm against the JSON file.
with open(fp/'user_data_definition.json', encoding='utf-8') as fh:
    user_data_definitions = json.load(fh)

# Decode every column that has a lookup. Values are cast to str first because
# JSON object keys are always strings; codes absent from the lookup become NaN.
for col in set(trip_and_user.columns) & set(user_data_definitions):
    trip_and_user[col] = trip_and_user[col].astype(str).map(user_data_definitions[col])

In [None]:
# replace NAs
# Markers treated as missing: -1 as a number, '-1' as text, and the
# 'no data' string (presumably the decoded label for a missing answer).
for missing_marker in (-1, '-1', 'no data'):
    trip_and_user.replace(missing_marker, np.nan, inplace=True)

In [None]:
# spot-check a single cell (row label 4, column 'schoolZip')
trip_and_user.loc[4,'schoolZip']

In [None]:
# display the merged trip/user table
trip_and_user

In [None]:
# save the merged trip/user table as a pickle in the GPS-traces export folder
trip_and_user.to_pickle(export_fp/'trip_and_user.pkl')

In [None]:

# merge to trips_df
# Inner join on 'tripid': only matched trips that also have trip/user info remain.
trips_df = pd.merge(trips_df, trip_and_user, on='tripid')
trips_df

In [None]:
#export

In [None]:
#drop trips more than five miles
less_than_five_miles = trips_df['length_ft'] < (5 * 5280)  # five miles expressed in feet
print(less_than_five_miles.sum())  # number of trips that pass the filter
# Take an explicit copy so that columns added to trips_df later are written to
# an independent frame (avoids pandas' SettingWithCopyWarning).
trips_df = trips_df[less_than_five_miles].copy()

#for now, throw out trips mentioning group rides and those with detour rate above 100 (twice the distance)
# NOTE(review): the list below is only defined in this cell; no filtering on
# these words or on the detour rate is applied here.
removal_words = ['critical mass','mobile social','dikov ride']

In [None]:
#%%
def export_segments(column_name, categorical, trips_df, values_to_exclude, user_data_definitions, sample_size=200):
    """Write one random sample of trips for each distinct value of a column.

    For every non-null value of ``column_name`` that is not listed in
    ``values_to_exclude``, ``sample_size`` rows are drawn at random and saved
    to ``fp/segments/{column_name}-{value}.csv`` (``fp`` is the module-level
    repository path). Values with fewer than ``sample_size`` rows are skipped
    with a printed message.

    Args:
        column_name: Column of ``trips_df`` to segment on.
        categorical: When true, the column is decoded to labels using
            ``user_data_definitions[column_name]`` before segmenting.
        trips_df: Trips table. It is not modified.
        values_to_exclude: Segment values to skip.
        user_data_definitions: Mapping of column name to a
            ``{code-as-string: label}`` dict; only read when ``categorical``.
        sample_size: Number of rows to draw per segment (default 200).

    Returns:
        None. Results are written to disk and progress is printed.
    """
    segment_values = trips_df[column_name]
    if categorical:
        lookup = user_data_definitions[column_name]
        as_text = segment_values.astype(str)
        decoded = as_text.map(lookup)
        # If the column was already decoded upstream, a second lookup would
        # turn every label into NaN; keep entries that are already labels.
        already_decoded = as_text.isin(list(lookup.values()))
        segment_values = decoded.fillna(as_text.where(already_decoded))
        # Build a new frame so the caller's DataFrame is left untouched while
        # the exported CSV still carries the decoded labels.
        trips_df = trips_df.assign(**{column_name: segment_values})

    for value in segment_values.dropna().unique():
        if value in values_to_exclude:
            continue
        to_sample = trips_df[segment_values == value]

        # Sampling without replacement needs at least sample_size rows.
        if len(to_sample) < sample_size:
            print(value,'did not have enough values')
            continue

        out_dir = fp / 'segments'
        out_path = out_dir / f'{column_name}-{value}.csv'
        try:
            out_dir.mkdir(parents=True, exist_ok=True)
            to_sample.sample(sample_size).to_csv(out_path, index=False)
        except OSError as err:
            # Stay best-effort across segments, but report the real cause
            # (e.g. a value that is not a valid file name).
            print(f'could not write {out_path}: {err}')
    
# Export a sample per segment for each attribute of interest. The flag says
# whether the column is passed through the label lookup (categorical=True)
# or used as stored.
segment_columns = [
    ('gender', True),
    ('ethnicity', True),
    ('age', True),
    ('income', True),
    ('trip_type', False),
    ('rider_type', False),
]
for segment_column, is_categorical in segment_columns:
    export_segments(segment_column, is_categorical, trips_df, ['no data'], user_data_definitions)

#%%


# K-means

Variables:
trip distance
detour %
dist to: work, home, school

aim for 3-4 clusters

On longer trips people are more likely to detour; for short trips, is directness preferred?
Casual riders are travelling shorter distances and may be more avoidant of certain roads.


Some of the really high-detour trips are still loops.
These are valid, but pauses need better detection.


Most of the data is just winding up in one cluster, so I need to think harder about what I am clustering/grouping on.


In [None]:

import matplotlib.pyplot as plt

#cluster using trip distance and detour %

# Histogram of percent detour using bin edges every 5 units from 0 to 295.
fig, axis = plt.subplots(figsize=(10, 5))
bins = np.arange(0, 300, 5)
axis.hist(trips_df['detour_rate'], bins=bins)
plt.xlabel('Percent Detour')
plt.ylabel('Frequency')

# Mark the median with a dashed red vertical line; its value is shown in the legend.
median_value = np.median(trips_df['detour_rate'])
plt.axvline(
    median_value,
    color='red',
    linestyle='dashed',
    linewidth=2,
    label=f'Median = {median_value}',
)

# Displaying the graph
plt.legend()
plt.show()

#%%


#turn to array
# Standardise both features before clustering. k-means uses Euclidean distance,
# so without scaling the feature with the larger numeric range (length in feet)
# dominates and the detour rate barely influences the clusters.
feature_cols = ['detour_rate','chosen_length_ft']
features = trips_df[feature_cols]
feature_mean = features.mean()
feature_std = features.std(ddof=0).replace(0, 1)  # guard against a constant column
X = ((features - feature_mean) / feature_std).to_numpy()

# do clustering
# Fixed seed and explicit n_init so cluster labels are reproducible between runs.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X)

trips_df['cluster_label'] = kmeans.labels_
# Report the cluster centres back in the original units so they stay interpretable.
results = pd.DataFrame(
    kmeans.cluster_centers_ * feature_std.to_numpy() + feature_mean.to_numpy(),
    columns=feature_cols,
)
print(results)

# for cluster_label in trips_df['cluster_label'].dropna().unique():
#     to_sample = trips_df[trips_df['cluster_label']==cluster_label]
#     sample = to_sample.sample(50)
#     sample.to_csv(fp/f'segments/cluster_{cluster_label}.csv',index=False)



#cluster using euclidean distance to work/home/school too


'''
come back to, right now most of the data is just in one cluster

K-prototypes (accepts both numerical and catagorical)

Trip purpose
ethnicity
gender
income

'''







