In [None]:
import logging

# Our imports
import emission.core.get_database as edb
import emission.analysis.modelling.tour_model.cluster_pipeline as pipeline
import emission.analysis.modelling.tour_model.similarity as similarity
import emission.analysis.modelling.tour_model.featurization as featurization
import emission.analysis.modelling.tour_model.representatives as representatives
import emission.storage.decorations.analysis_timeseries_queries as esda
import pandas as pd
from numpy import *
import confirmed_trips_eval_bins_clusters as evaluation
from sklearn import metrics
from pandas.testing import assert_frame_equal
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import numpy as np
# import matplotlib.ticker as ticker

In [None]:
# Fetch only the user_id field of every profile in the "participant" install group
participant_uuid_obj = list(
    edb.get_profile_db().find(
        {"install_group": "participant"},
        {"user_id": 1, "_id": 0},
    )
)
# Flatten the returned documents into a plain list of user ids
all_users = list(map(lambda profile: profile["user_id"], participant_uuid_obj))

In [None]:
# Radius handed to evaluation.filter_data for every user
# (presumably in meters -- TODO confirm the unit against the evaluation module)
radius = 100

In [None]:
# One entry per user, filled by the per-user loop below: a list of per-day query
# counts, or NaN for users without enough valid labeled trips
query_day = []

In [None]:
# One entry per user, filled by the per-user loop below: a list of per-month query
# counts, or NaN for users without enough valid labeled trips
query_month = []

In [None]:
def match_day(trip,bin,trips=None):
    """Check whether `trip` starts on the same calendar day as the first trip of `bin`.

    Args:
        trip: trip entry whose ['data']['start_local_dt'] holds 'year', 'month' and 'day'.
        bin: list of indices into the trip list; only the first index is inspected.
            (The name shadows the builtin `bin`; it is kept so existing callers still work.)
        trips: list that the indices in `bin` refer to. Defaults to the notebook-level
            `filter_trips` of the user currently being processed.

    Returns:
        True if year, month and day all match; False otherwise, including when `bin` is empty.
    """
    if not bin:
        return False
    if trips is None:
        # fall back to the notebook-level list set by the per-user loop
        trips = filter_trips
    trip_dt = trip['data']['start_local_dt']
    bin_dt = trips[bin[0]]['data']['start_local_dt']
    return all(trip_dt[field] == bin_dt[field] for field in ('year', 'month', 'day'))

In [None]:
def match_month(trip,bin,trips=None):
    """Check whether `trip` starts in the same calendar month as the first trip of `bin`.

    Args:
        trip: trip entry whose ['data']['start_local_dt'] holds 'year' and 'month'.
        bin: list of indices into the trip list; only the first index is inspected.
            (The name shadows the builtin `bin`; it is kept so existing callers still work.)
        trips: list that the indices in `bin` refer to. Defaults to the notebook-level
            `filter_trips` of the user currently being processed.

    Returns:
        True if year and month both match; False otherwise, including when `bin` is empty.
    """
    if not bin:
        return False
    if trips is None:
        # fall back to the notebook-level list set by the per-user loop
        trips = filter_trips
    trip_dt = trip['data']['start_local_dt']
    bin_dt = trips[bin[0]]['data']['start_local_dt']
    return all(trip_dt[field] == bin_dt[field] for field in ('year', 'month'))

In [None]:
# Build a base data frame for the query times in January 2021, indexed by day of month (1-31).
# 'drop_col' is only a placeholder column so there is something to join onto; it is removed at the end.
date_df = pd.DataFrame(data = {'date':np.arange(1,32),'drop_col':np.arange(1,32)})
date_df.set_index(['date'], inplace=True)

# Start from empty per-user result lists so that re-running this cell does not
# append every user a second time.
query_day = []
query_month = []


def _start_date(trip):
    """Return the local start date of a trip as a comparable (year, month, day) tuple."""
    start_dt = trip['data']['start_local_dt']
    return (start_dt['year'], start_dt['month'], start_dt['day'])


def _group_trips(trip_indices, same_period):
    """Group trip indices so that all trips of a group satisfy `same_period(trip, group)`.

    `same_period` is match_day or match_month. Groups are returned in first-seen
    order. Trips are looked up in the notebook-level `filter_trips` of the user
    currently being processed.
    """
    groups = []
    for trip_index in trip_indices:
        trip = filter_trips[trip_index]
        group = next((g for g in groups if same_period(trip, g)), None)
        if group is None:
            groups.append([trip_index])
        else:
            group.append(trip_index)
    return groups


for a, user in enumerate(all_users):
    filter_trips,sim,trips = evaluation.filter_data(user,radius)
    logging.debug("len(filter_trips)is %s "% len(filter_trips))

    # filter out users that don't have enough valid labeled trips;
    # NaN keeps their slot so that list positions still line up with all_users
    if not evaluation.valid_user(filter_trips,trips):
        query_day.append(np.nan)
        query_month.append(np.nan)
        continue

    sim.bin_data()
    sim.delete_bins()

    # For every bin above the cutoff, only the earliest trip needs a query;
    # the remaining trips of the bin are "common" trips that need none.
    ab_trip_ls = []
    no_query_trip_ls = []
    for trip_bin in sim.bins:
        # min() keeps the first position among equal dates
        earliest_pos = min(range(len(trip_bin)),
                           key=lambda pos: _start_date(filter_trips[trip_bin[pos]]))
        ab_trip_ls.append(trip_bin[earliest_pos])
        no_query_trip_ls.extend(trip_index for pos, trip_index in enumerate(trip_bin)
                                if pos != earliest_pos)

    # every trip in a bin below the cutoff needs a query
    bl_trip_ls = [trip_index for below_bin in sim.below_cutoff for trip_index in below_bin]

    # whole list of query trips indices
    query_trips_ls = ab_trip_ls + bl_trip_ls

    # collect query times in a day: one count per day that has at least one query
    bin_day = _group_trips(query_trips_ls, match_day)
    query_day_ls = [len(day_bin) for day_bin in bin_day]

    # collect 0 query days: days that only contain trips needing no query.
    # The no-query trips are grouped by day first, so that a day with several
    # such trips is counted as one day rather than once per trip.
    unqueried_trip_ls = [
        trip_index for trip_index in no_query_trip_ls
        if not any(match_day(filter_trips[trip_index], day_bin) for day_bin in bin_day)
    ]
    zero_query_days = _group_trips(unqueried_trip_ls, match_day)
    query_day_ls.extend([0] * len(zero_query_days))

    # collect query times in a day for every user
    query_day.append(query_day_ls)

    # collect query times in a month; the list is built unconditionally, so a
    # user without any query trip gets an empty list
    bin_month = _group_trips(query_trips_ls, match_month)
    query_month_ls = [len(month_bin) for month_bin in bin_month]

    # collect query times in a month for every user
    query_month.append(query_month_ls)

    # day of month of every query trip that started in Jan 2021
    jan_days = []
    for trip_index in query_trips_ls:
        start_dt = filter_trips[trip_index]['data']['start_local_dt']
        if start_dt['year']==2021 and start_dt['month']==1:
            jan_days.append(start_dt['day'])

    # Count the queries per day once per user. The explicit dtype keeps this
    # working for a user without January trips, who gets an all-NaN column.
    new_date_df = (
        pd.Series(jan_days, dtype='int64')
        .value_counts(sort = False)
        .rename_axis('date')
        .to_frame('user'+str(a+1))
    )
    date_df = date_df.join(new_date_df,how='outer')
date_df=date_df.drop(columns=['drop_col'])

In [None]:
# Display the per-user lists of per-day query counts (NaN marks a filtered-out user)
query_day

In [None]:
# Display the per-user lists of per-month query counts (NaN marks a filtered-out user)
query_month

### Plot query times in a day

In [None]:
# Show query times in a data frame: rows are the number of queries in a day
# ('query times'), columns are users, values are the number of days with that count.
day_count_columns = []
for i, user_counts in enumerate(query_day):
    # users filtered out as invalid are stored as NaN instead of a list
    if not isinstance(user_counts, list):
        continue
    day_count_columns.append(
        # explicit dtype so that an empty list still yields an (empty) integer tally
        pd.Series(user_counts, dtype='int64')
        .value_counts(sort = False)
        .rename('user'+str(i+1))
    )

if day_count_columns:
    # axis=1 aligns the users on the union of their query counts (an outer join)
    query_day_df = pd.concat(day_count_columns, axis=1).sort_index().rename_axis('query times')
else:
    # no valid user at all: keep the name defined instead of failing with a NameError
    query_day_df = pd.DataFrame(index=pd.Index([], name='query times'))

query_day_df

In [None]:
# Grouped bar chart for all valid users: one group per query count, one bar per user
yticks_max = query_day_df.max().max()
graph_day = query_day_df.plot(
    kind='bar',
    figsize=(14,16),
    title='query times in a day',
    fontsize=18,
    # y ticks every 4 days, reaching past the tallest bar
    yticks=np.arange(0, yticks_max+4, step=4),
)
graph_day.title.set_size(20)
graph_day.set_xlabel('query times', fontsize=16)
graph_day.set_ylabel('days', fontsize=16)

In [None]:
# subplots: one bar chart per user, four charts per row.
# The row count follows the number of users, so more than eight users still fit
# and the bottom row always holds visible charts.
n_day_rows = max(1, -(-len(query_day_df.columns) // 4))  # ceiling division
day_ax_arr = query_day_df.plot(subplots=True,layout=(n_day_rows,4),kind='bar',figsize=(16,4*n_day_rows),fontsize=15,sharey=True)

# label the x axis only along the bottom row
for ax in day_ax_arr[-1]:
    ax.set_xlabel('query times', fontsize=16)

# the y axis is shared, so it is configured on the first chart of each row
for ax_arr in day_ax_arr:
    ax_arr[0].yaxis.set_major_locator(MaxNLocator(integer=True))
    ax_arr[0].set_ylabel('days', fontsize=16)
plt.tight_layout()

### Plot query times in a month

Note: not all users have data for the same number of months.

In [None]:
# Put all query times in a month for all valid users in a data frame.
# The row index is the position of the month in each user's own list, not a calendar month.
month_count_columns = [
    # explicit dtype so that an empty list still yields an (empty) integer column
    pd.Series(user_counts, dtype='int64', name='user'+str(t+1))
    for t, user_counts in enumerate(query_month)
    # users filtered out as invalid are stored as NaN instead of a list
    if isinstance(user_counts, list)
]

if month_count_columns:
    # axis=1 aligns the users on the union of their month positions (an outer join)
    query_month_df = pd.concat(month_count_columns, axis=1).sort_index().rename_axis('month indicies')
else:
    # no valid user at all: keep the name defined instead of failing with a NameError
    query_month_df = pd.DataFrame(index=pd.Index([], name='month indicies'))

query_month_df

In [None]:
# plot a graph for all valid users: one group per month position, one bar per user
graph_month = query_month_df.plot(kind='bar',figsize=(12,8),title='query times in a month',fontsize=18)
graph_month.title.set_size(20)
graph_month.set_xlabel('month indicies',fontsize=16)
graph_month.set_ylabel('query times', fontsize=16)
# query counts are whole numbers, so force integer ticks on this chart's own y axis
graph_month.yaxis.set_major_locator(MaxNLocator(integer=True))

### Get query times for all valid users in January

In [None]:
# Display the query counts per day of January 2021, one column per user
date_df

In [None]:
# Grouped bar chart of the January query counts: one group per date, one bar per user
graph_date = date_df.plot(
    kind='bar',
    figsize=(20,10),
    title='query times in January',
    fontsize=18,
)
graph_date.title.set_size(20)
graph_date.set_xlabel('date', fontsize=16)
graph_date.set_ylabel('query times', fontsize=16)
# keep the y axis on whole numbers
graph_date.yaxis.set_major_locator(MaxNLocator(integer=True))

In [None]:
# subplots on January query times: one bar chart per user, four charts per row.
# The row count follows the number of users, so more than eight users still fit
# and the bottom row always holds visible charts.
n_date_rows = max(1, -(-len(date_df.columns) // 4))  # ceiling division
date_ax_arr = date_df.plot(subplots=True,layout=(n_date_rows,4),kind='bar',figsize=(16,4*n_date_rows),fontsize=15,sharey =True)

# label the x axis only along the bottom row
for ax in date_ax_arr[-1]:
    ax.set_xlabel("date", fontsize=16)

# the y axis is shared, so it is configured on the first chart of each row
for ax_arr in date_ax_arr:
    ax_arr[0].yaxis.set_major_locator(MaxNLocator(integer=True))
    ax_arr[0].set_ylabel("query times", fontsize=16)

# put a tick on every fifth bar and label it with the matching index value
date_ax_arr[-1][0].set_xticks(list(range(0,len(date_df.index),5)))
date_ax_arr[-1][0].set_xticklabels(list(range(date_df.index[0],date_df.index[-1]+1,5)))