This notebook aggregates, for all users, the number of times user input is requested in a day, in a month, and in January 2021. The scatter plot at the bottom shows, for all users, the relationship between the homogeneity score and the median of the daily user input request proportion on valid common trips after the first round of clustering.

In [None]:
import logging

# Our imports
import emission.core.get_database as edb
import emission.analysis.modelling.tour_model.cluster_pipeline as pipeline
import emission.analysis.modelling.tour_model.similarity as similarity
import emission.analysis.modelling.tour_model.featurization as featurization
import emission.analysis.modelling.tour_model.representatives as representatives
import emission.storage.decorations.analysis_timeseries_queries as esda
import pandas as pd
from numpy import *
import confirmed_trips_eval_bins_clusters as evaluation
from sklearn import metrics
from pandas.testing import assert_frame_equal
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import numpy as np
from matplotlib import cm

In [None]:
# Fetch only the user_id field of every profile in the "participant" install group.
participant_query = {"install_group": "participant"}
user_id_projection = {"user_id": 1, "_id": 0}
participant_uuid_obj = list(edb.get_profile_db().find(participant_query, user_id_projection))

# Flatten the profile documents into a plain list of user ids.
all_users = [profile["user_id"] for profile in participant_uuid_obj]

In [None]:
# Radius passed to the evaluation helpers used below (get_user_ls, filter_data,
# v_measure_bins). Presumably the trip-similarity radius in meters - TODO confirm units.
radius = 100

In [None]:
# Accumulator filled by the per-user loop below: one entry per user in all_users,
# either a list of request counts per day or a NaN placeholder for filtered-out users.
req_day = []

In [None]:
# Accumulator filled by the per-user loop below: one entry per user in all_users,
# either a list of request counts per month or a NaN placeholder for filtered-out users.
req_month = []

In [None]:
# Accumulator filled by the per-user loop below: one median request proportion per
# valid user only (filtered-out users get no placeholder here).
req_propor_median = []

In [None]:
# Collect, for every user, the request counts per day and per month, the median
# daily request proportion, and the request counts per date in January 2021.

# Rebind the accumulators here so that re-running this cell on its own does not
# append a second copy of every user's results.
req_day = []
req_month = []
req_propor_median = []

# build a base dataframe for request times in January; 'drop_col' is only a
# placeholder column and is dropped once the per-user columns have been joined
date_df = pd.DataFrame(data={'date': np.arange(1, 32), 'drop_col': np.arange(1, 32)})
date_df.set_index(['date'], inplace=True)

# get valid user list
user_ls, valid_users = evaluation.get_user_ls(all_users, radius)

for a, user in enumerate(all_users):
    filter_trips, sim, trips = evaluation.filter_data(user, radius)
    logging.debug("len(filter_trips)is %s "% len(filter_trips))

    # filter out users that don't have enough valid labeled trips
    if not evaluation.valid_user(filter_trips, trips):
        # placeholders keep req_day/req_month aligned with all_users;
        # np.nan is used because the bare `NaN` alias no longer exists in NumPy 2
        req_day.append(np.nan)
        req_month.append(np.nan)
        continue
    sim.bin_data()
    sim.delete_bins()
    bins = sim.bins

    # collect requested trips and common trips(no need to request) indices above cutoff
    ab_trip_ls = []
    no_req_trip_ls = []
    for trip_bin in bins:
        early_trip_index, index = evaluation.find_first_trip(filter_trips, trip_bin)
        ab_trip_ls.append(early_trip_index)
        # every other trip in the bin is recorded as not needing a request
        no_req_trip_ls.extend(trip_bin[k] for k in range(len(trip_bin)) if k != index)

    # bins below cutoff
    bl_bins = sim.below_cutoff

    # collect requested trips indices below cutoff
    bl_trip_ls = [trip_index for below_bin in bl_bins for trip_index in below_bin]

    # whole list of requested trips indices
    req_trips_ls = ab_trip_ls + bl_trip_ls

    # collect request times in a day
    bin_day = evaluation.bin_date(req_trips_ls, filter_trips, day=True)
    req_day_ls = [len(day_bin) for day_bin in bin_day]

    # collect 0 request days
    # NOTE(review): one 0 is appended per unmatched no-request trip, so several such
    # trips on the same day presumably count that day more than once - confirm intent.
    for trip_index in no_req_trip_ls:
        trip = filter_trips[trip_index]
        if not any(evaluation.match_day(trip, day_bin, filter_trips) for day_bin in bin_day):
            req_day_ls.append(0)

    # collect request times in a day for every user
    req_day.append(req_day_ls)

    # collect user input request proportion in a day
    filter_trips_df = pd.DataFrame(filter_trips)
    filter_trips_idx_ls = filter_trips_df.index.values.tolist()
    bin_filter_trips_day = evaluation.bin_date(filter_trips_idx_ls, filter_trips, day=True)
    propor_single_user = []
    for valid_trips_bin in bin_filter_trips_day:
        match = False
        for req_trips_bin in bin_day:
            # the first requested trip of the bin is used as the day's representative
            req_trip = filter_trips[req_trips_bin[0]]
            if evaluation.match_day(req_trip, valid_trips_bin, filter_trips):
                proportion = round(len(req_trips_bin)/len(valid_trips_bin), 2)
                propor_single_user.append(proportion)
                match = True
                break
        if not match:
            propor_single_user.append(0)

    # get user input request proportion median in a day and collect it for every user
    # (named propor_median so it does not shadow the star-imported numpy `median`)
    propor_median = np.median(propor_single_user)
    req_propor_median.append(propor_median)

    # collect request times in a month
    bin_month = evaluation.bin_date(req_trips_ls, filter_trips, month=True)
    req_month_ls = [len(month_bin) for month_bin in bin_month]

    # collect request times in a month for every user
    req_month.append(req_month_ls)

    # select the requested trips that are in Jan 2021 and keep their day of month
    jan_date = []
    for trip_index in req_trips_ls:
        start_local_dt = filter_trips[trip_index]['data']['start_local_dt']
        if start_local_dt['year'] == 2021 and start_local_dt['month'] == 1:
            jan_date.append(start_local_dt['day'])

    # create the data frame for request times in Jan 2021
    user_col = 'user' + str(a + 1)
    if jan_date:
        # built once per user, after all January dates have been collected
        new_date_df = pd.DataFrame(data=jan_date)
        new_date_df = new_date_df.value_counts(sort=False).rename_axis('date').to_frame(user_col).reset_index()
        new_date_df.set_index(['date'], inplace=True)
        date_df = date_df.join(new_date_df, how='outer')
    else:
        # no January requests for this user: add an empty column instead of
        # re-joining a stale (or undefined) frame left over from a previous user
        date_df[user_col] = np.nan

date_df = date_df.drop(columns=['drop_col'])

In [None]:
# Display the raw per-user request counts per day (NaN marks filtered-out users).
req_day

In [None]:
# Display the raw per-user request counts per month (NaN marks filtered-out users).
req_month

### Plot request times in a day

In [None]:
# show request times in a data frame: one column per valid user, indexed by the
# number of requests in a day; each cell counts the days with that many requests
req_day_df = None
for i, user_req_day in enumerate(req_day):
    # filtered-out users are stored as a NaN placeholder rather than a list
    if not isinstance(user_req_day, list):
        continue
    user_day_df = pd.DataFrame(data=user_req_day)
    user_day_df = user_day_df.value_counts(sort=False).rename_axis('request times').to_frame('user'+str(i+1)).reset_index()
    user_day_df.set_index(['request times'], inplace=True)
    if req_day_df is None:
        # the first valid user seeds the frame
        req_day_df = user_day_df
    else:
        # outer join keeps request counts that only some users have
        req_day_df = req_day_df.join(user_day_df, how='outer', sort=True)

req_day_df

In [None]:
# Grouped bar chart of the per-day request counts for all valid users.
yticks_max = req_day_df.max().max()
day_plot_options = dict(
    kind='bar',
    figsize=(14,16),
    title='request times in a day',
    fontsize=18,
    # y ticks every 4 days, extending one step past the tallest bar
    yticks=np.arange(0, yticks_max+4, step=4),
)
graph_day = req_day_df.plot(**day_plot_options)
graph_day.title.set_size(20)
graph_day.set_xlabel('request times', fontsize=16)
graph_day.set_ylabel('days', fontsize=16)

In [None]:
# subplots: one bar chart per valid user, 4 panels per row.
# The grid is at least 2x4 and grows by whole rows, so more than 8 users no
# longer overflow a fixed layout; the figure height scales at 4 inches per row.
day_panel_cols = 4
day_panel_rows = max(2, int(np.ceil(len(req_day_df.columns) / day_panel_cols)))
day_ax_arr = req_day_df.plot(subplots=True,layout=(day_panel_rows,day_panel_cols),kind='bar',figsize=(16,4*day_panel_rows),fontsize=15,sharey=True)

# label the x axis on the bottom row only
for ax in day_ax_arr[-1]:
    ax.set_xlabel('query times', fontsize=16)

# label the y axis (with integer ticks) on the first panel of every row
for ax_arr in day_ax_arr:
    ax_arr[0].yaxis.set_major_locator(MaxNLocator(integer=True))
    ax_arr[0].set_ylabel('days', fontsize=16)
plt.tight_layout()

### Plot request times in a month

Note: users do not all have data covering the same number of months.

In [None]:
# put all request times in a month for all valid users in a data frame:
# one column per valid user, indexed by the position of the month in that user's data
req_month_df = None
for i, user_req_month in enumerate(req_month):
    # filtered-out users are stored as a NaN placeholder rather than a list
    if not isinstance(user_req_month, list):
        continue
    user_month_df = pd.DataFrame({'user'+str(i+1):user_req_month}).rename_axis('month indicies').reset_index()
    user_month_df.set_index(['month indicies'], inplace=True)
    if req_month_df is None:
        # the first valid user seeds the frame
        req_month_df = user_month_df
    else:
        # outer join keeps month indices that only some users reach
        req_month_df = req_month_df.join(user_month_df, how='outer')

req_month_df

In [None]:
# Grouped bar chart of the per-month request counts for all valid users.
month_plot_options = dict(
    kind='bar',
    figsize=(12,8),
    title='request times in a month',
    fontsize=18,
)
graph_month = req_month_df.plot(**month_plot_options)
graph_month.title.set_size(20)
graph_month.set_xlabel('month indicies', fontsize=16)
graph_month.set_ylabel('request times', fontsize=16)
# request counts are whole numbers, so force integer y ticks
graph_month.yaxis.set_major_locator(MaxNLocator(integer=True))

### Get request times for all valid users in January

In [None]:
# Display the January 2021 request counts: one row per date, one column per user.
date_df

In [None]:
# Grouped bar chart of the request counts per January date for all users in date_df.
date_plot_options = dict(
    kind='bar',
    figsize=(20,10),
    title='request times in January',
    fontsize=18,
)
graph_date = date_df.plot(**date_plot_options)
graph_date.title.set_size(20)
graph_date.set_xlabel('date', fontsize=16)
graph_date.set_ylabel('request times', fontsize=16)
# request counts are whole numbers, so force integer y ticks
graph_date.yaxis.set_major_locator(MaxNLocator(integer=True))

In [None]:
# subplots on January request times: one bar chart per user, 4 panels per row.
# The grid is at least 2x4 and grows by whole rows, so more than 8 users no
# longer overflow a fixed layout; the figure height scales at 4 inches per row.
date_panel_cols = 4
date_panel_rows = max(2, int(np.ceil(len(date_df.columns) / date_panel_cols)))
date_ax_arr = date_df.plot(subplots=True,layout=(date_panel_rows,date_panel_cols),kind='bar',figsize=(16,4*date_panel_rows),fontsize=15,sharey =True)

# label the x axis on the bottom row only
for ax in date_ax_arr[-1]:
    ax.set_xlabel("date", fontsize=16)

# label the y axis (with integer ticks) on the first panel of every row
for ax_arr in date_ax_arr:
    ax_arr[0].yaxis.set_major_locator(MaxNLocator(integer=True))
    ax_arr[0].set_ylabel("request times", fontsize=16)

# show only every 5th date so the tick labels stay readable
date_ax_arr[-1][0].set_xticks(list(range(0,len(date_df.index),5)))
date_ax_arr[-1][0].set_xticklabels(list(range(date_df.index[0],date_df.index[-1]+1,5)))

## Scatter(homogeneity score on valid trips above cutoff, user input request proportion median)

In [None]:
# Here we use homogeneity score on bins above cutoff after changing language and converting purposes and modes
homo_score_cvt, comp_score_cvt, v_score_cvt = evaluation.v_measure_bins(
    all_users, radius, cvt_pur_mo=True, cutoff=True
)

# Drop the NaN scores and renumber the rows, then keep the remaining scores as a plain list.
homo_score_df = pd.DataFrame(data={'homo score': homo_score_cvt})
homo_score_df = homo_score_df.dropna().reset_index(drop=True)
valid_homo_score = homo_score_df['homo score'].tolist()

In [None]:
# Scatter of each valid user's median request proportion (x) against the
# homogeneity score (y), one colored point per user.
# NOTE(review): x, y and v are paired by position - this assumes all three lists
# cover the same valid users in the same order; confirm against the evaluation helpers.
x = req_propor_median
y = valid_homo_score
v = valid_users
# plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
# matplotlib 3.7 and removed in 3.9; the second argument resamples the colormap
cmp = plt.get_cmap('Dark2', len(valid_users))

for i in range(len(valid_users)):
    plt.scatter(x[i], y[i], color=cmp.colors[i], label=v[i], s=70, alpha=0.7)
plt.legend(markerscale=0.7, scatterpoints=1)
plt.xlabel('user input request proportion median',fontsize=16)
plt.ylabel('homogeneity score',fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)