### This notebook compares the results of scipy hierarchical clustering and sklearn KMeans clustering.

We have 2 rounds of clustering. The first test uses only hierarchical clustering for both rounds. The second test adds KMeans clustering in the 2nd round, after running hierarchical clustering. We cannot directly get a model from scipy hierarchical clustering, and the corresponding sklearn method, AgglomerativeClustering, doesn't support separate fit() and predict() functions, so we need a clustering algorithm like KMeans to build and save the model and then use the saved model to predict labels for a new trip.
The result of this notebook shows that adding KMeans doesn't change the result from scipy hierarchical clustering.

We use user 1 from the mini-pilot program here. 

In [None]:
import emission.core.get_database as edb
import emission.analysis.modelling.tour_model.similarity as similarity
import pandas as pd
import numpy as np
import emission.analysis.modelling.tour_model.get_request_percentage as grp
import emission.analysis.modelling.tour_model.get_scores as gs
import emission.analysis.modelling.tour_model.label_processing as lp
import emission.analysis.modelling.tour_model.get_users as gu
import emission.analysis.modelling.tour_model.data_preprocessing as preprocess
import emission.analysis.modelling.tour_model.evaluation_pipeline as ep
import matplotlib.pyplot as plt
import emission.analysis.modelling.tour_model.get_plot as plot
import emission.core.common as ecc

In [None]:
# Print floating point numbers in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)

In [None]:
# Widen cell text and lift the row cap so the result tables are displayed in full.
# The fully qualified option name is used for max_colwidth: pandas resolves
# abbreviated names by pattern matching, which can become ambiguous if a
# similarly named option is added later.
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_rows', None)

In [None]:
# Query the profile database for every profile in the "participant" install
# group, projecting only the user_id field, then reduce the returned documents
# to a plain list of user ids.
participant_uuid_obj = list(
    edb.get_profile_db().find(
        {"install_group": "participant"},
        {"user_id": 1, "_id": 0},
    )
)
all_users = list(map(lambda profile: profile["user_id"], participant_uuid_obj))

In [None]:
# Radius passed to preprocess.filter_data, ep.tune and ep.test in the cells below.
# NOTE(review): presumably a distance in meters — confirm against the tour_model code.
radius = 100

### using scipy hierarchical clustering

In [None]:
# Evaluate the pipeline with hierarchical clustering only (kmeans=False in both
# tuning and testing). Only the first participant is processed; `df` collects
# one row per data split and is displayed at the end of the cell.
for a in range(1):
    user = all_users[a]
    # NOTE: the 'socre' spelling in the column names is kept as-is so the
    # displayed headers stay unchanged.
    df = pd.DataFrame(columns=['user','user_id','percentage of 1st round','homogeneity socre of 1st round',
                               'percentage of 2nd round','homogeneity socre of 2nd round','scores','lower boundary','distance percentage'])
    trips = preprocess.read_data(user)
    filter_trips = preprocess.filter_data(trips, radius)

    # filter out users that don't have enough valid labeled trips
    if not gu.valid_user(filter_trips, trips):
        continue
    tune_idx, test_idx = preprocess.split_data(filter_trips)

    # choose tuning/test set to run the model
    # this step will use KFold (5 splits) to split the data into different subsets
    # - tune: tuning set
    # - test: test set
    # Here we use a bigger part of the data for testing and a smaller part for tuning,
    # which is why the two index sets are swapped in the calls below.
    tune_data = preprocess.get_subdata(filter_trips, test_idx)
    test_data = preprocess.get_subdata(filter_trips, tune_idx)

    # tune data: store the tuned parameters of split j in row j
    for j in range(len(tune_data)):
        low, dist_pct = ep.tune(tune_data[j], radius, kmeans=False)
        df.loc[j, 'lower boundary'] = low
        df.loc[j, 'distance percentage'] = dist_pct

    # testing: reuse the parameters tuned for the same split index
    for k in range(len(test_data)):
        low = df.loc[k, 'lower boundary']
        dist_pct = df.loc[k, 'distance percentage']

        homo_first, percentage_first, homo_second, percentage_second, scores = ep.test(
            test_data[k], radius, low, dist_pct, kmeans=False)
        df.loc[k, 'percentage of 1st round'] = percentage_first
        df.loc[k, 'homogeneity socre of 1st round'] = homo_first
        df.loc[k, 'percentage of 2nd round'] = percentage_second
        df.loc[k, 'homogeneity socre of 2nd round'] = homo_second
        df.loc[k, 'scores'] = scores

    # Label every row once, after all rows exist. These values do not depend on
    # the split, so they need not be reassigned on each test iteration, and
    # they are now set even when there are no test splits.
    df['user_id'] = user
    df['user'] = 'user' + str(a + 1)
df

### using kmeans

In [None]:
# Evaluate the pipeline with KMeans added in the test step. Only the first
# participant is processed; `df1` collects one row per data split and is
# displayed at the end of the cell.
for a in range(1):
    user = all_users[a]
    # NOTE: the 'socre' spelling in the column names is kept as-is so the
    # displayed headers stay unchanged.
    df1 = pd.DataFrame(columns=['user','user_id','percentage of 1st round','homogeneity socre of 1st round',
                                'percentage of 2nd round','homogeneity socre of 2nd round','scores','lower boundary','distance percentage'])
    trips = preprocess.read_data(user)
    filter_trips = preprocess.filter_data(trips, radius)

    # filter out users that don't have enough valid labeled trips
    if not gu.valid_user(filter_trips, trips):
        continue
    tune_idx, test_idx = preprocess.split_data(filter_trips)

    # choose tuning/test set to run the model
    # this step will use KFold (5 splits) to split the data into different subsets
    # - tune: tuning set
    # - test: test set
    # Here we use a bigger part of the data for testing and a smaller part for tuning,
    # which is why the two index sets are swapped in the calls below.
    tune_data = preprocess.get_subdata(filter_trips, test_idx)
    test_data = preprocess.get_subdata(filter_trips, tune_idx)

    # tune data: store the tuned parameters of split j in row j.
    # Tuning stays at kmeans=False; KMeans is only added in the test step below.
    for j in range(len(tune_data)):
        low, dist_pct = ep.tune(tune_data[j], radius, kmeans=False)
        df1.loc[j, 'lower boundary'] = low
        df1.loc[j, 'distance percentage'] = dist_pct

    # testing
    # for testing, we add kmeans to re-build the model. Kmeans is run after hierarchical clustering,
    # passed in n_clusters as a parameter that comes from the result of hierarchical clustering.
    for k in range(len(test_data)):
        low = df1.loc[k, 'lower boundary']
        dist_pct = df1.loc[k, 'distance percentage']

        homo_first, percentage_first, homo_second, percentage_second, scores = ep.test(
            test_data[k], radius, low, dist_pct, kmeans=True)
        df1.loc[k, 'percentage of 1st round'] = percentage_first
        df1.loc[k, 'homogeneity socre of 1st round'] = homo_first
        df1.loc[k, 'percentage of 2nd round'] = percentage_second
        df1.loc[k, 'homogeneity socre of 2nd round'] = homo_second
        df1.loc[k, 'scores'] = scores

    # Label every row once, after all rows exist. These values do not depend on
    # the split, so they need not be reassigned on each test iteration, and
    # they are now set even when there are no test splits.
    df1['user_id'] = user
    df1['user'] = 'user' + str(a + 1)
df1