In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, date, time, timedelta
from tqdm import tqdm, trange
import os
import pickle as pkl
from sklearn.mixture import GaussianMixture
from utils import *

# Load Data

In [10]:
def _read_pickle(path):
    """Open the file at `path` in binary mode and return the unpickled object."""
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # project wrote itself.
    with open(path, 'rb') as handle:
        return pkl.load(handle)


# Each input dictionary is stored in its own pickle under ./data/outputs/.
paid_demand_dict = _read_pickle('./data/outputs/paid_demand_dict.pkl')
txns_corr_dict = _read_pickle('./data/outputs/txns_corr_dict.pkl')
occ_dict = _read_pickle('./data/outputs/occ_dict.pkl')
occ_corr_dict = _read_pickle('./data/outputs/occ_corr_dict.pkl')
lat_dict = _read_pickle('./data/outputs/lat_dict.pkl')
long_dict = _read_pickle('./data/outputs/long_dict.pkl')

In [11]:
# Clustering configuration shared by the cells below.
max_k = 35          # not referenced by any cell shown in this section
num_clusters = 31   # n_components passed to every GaussianMixture fit

# Every element key present in at least one of the two correlation
# dictionaries (set union, so each key appears once).
all_ekeys = [*(set(txns_corr_dict.keys()) | set(occ_corr_dict.keys()))]

# Paid Demand Correlation Clustering

In [5]:
# Build the square matrix of pairwise paid-demand correlations: row i holds
# the values of element i against every element, in `elementkeys` order.
elementkeys = list(txns_corr_dict.keys())
pd_corr_matrix = np.zeros((len(elementkeys), len(elementkeys)))
# Wrap the list itself (not the enumerate iterator) so tqdm knows the total
# and can render a real progress bar instead of a bare iteration counter.
for i, ekey1 in enumerate(tqdm(elementkeys)):
    for j, ekey2 in enumerate(elementkeys):
        # Index 0 of the stored value is used; presumably the coefficient of a
        # (coefficient, p-value) pair -- TODO confirm against the producer.
        pd_corr_matrix[i, j] = txns_corr_dict[ekey1][ekey2][0]

# Each element is clustered on its full row of correlations. A fixed
# random_state makes the mixture initialisation, and therefore the cluster
# labels, reproducible across kernel restarts.
gm_pd_corr = GaussianMixture(n_components=num_clusters, random_state=0).fit(pd_corr_matrix)
labels = gm_pd_corr.predict(pd_corr_matrix)

# element key -> cluster label
pd_corr_clust_dict = dict(zip(elementkeys, labels))

1613it [00:00, 3170.21it/s]


# Occupancy Correlation Clustering

In [6]:
# Build the square matrix of pairwise occupancy correlations: row i holds
# the values of element i against every element, in `elementkeys` order.
elementkeys = list(occ_corr_dict.keys())
occ_corr_matrix = np.zeros((len(elementkeys), len(elementkeys)))
# Wrap the list itself (not the enumerate iterator) so tqdm knows the total
# and can render a real progress bar instead of a bare iteration counter.
for i, ekey1 in enumerate(tqdm(elementkeys)):
    for j, ekey2 in enumerate(elementkeys):
        # Index 0 of the stored value is used; presumably the coefficient of a
        # (coefficient, p-value) pair -- TODO confirm against the producer.
        occ_corr_matrix[i, j] = occ_corr_dict[ekey1][ekey2][0]

# Each element is clustered on its full row of correlations. A fixed
# random_state makes the mixture initialisation, and therefore the cluster
# labels, reproducible across kernel restarts.
gm_occ_corr = GaussianMixture(n_components=num_clusters, random_state=0).fit(occ_corr_matrix)
labels = gm_occ_corr.predict(occ_corr_matrix)

# element key -> cluster label
occ_corr_clust_dict = dict(zip(elementkeys, labels))

1300it [00:00, 4315.24it/s]


# Paid Demand Clustering

In [7]:
# Average each element's paid-demand values over all of its dates, then
# cluster the elements on those average profiles.
#
# Elements with no dates are left out: they have nothing to average, and the
# previous per-date loop would have silently reused the matrix of the element
# processed just before them. Keys left out here are simply absent from
# pd_clust_dict.
elementkeys = [ekey for ekey in paid_demand_dict if len(paid_demand_dict[ekey]) > 0]

pd_profiles = []
for ekey in elementkeys:
    # The comprehension variable is `day`, not `date`, so the `date` class
    # imported from datetime at the top of the notebook is not overwritten.
    daily = [np.asarray(paid_demand_dict[ekey][day]) for day in paid_demand_dict[ekey]]
    # Stack once instead of growing the array with vstack on every date.
    pd_matrix = np.vstack(daily)
    # Rows of 13 values are averaged column-wise (presumably 13 time bins per
    # day -- TODO confirm), giving one 13-value profile per element.
    pd_vect = np.mean(pd_matrix.reshape((-1, 13)), axis=0)
    pd_profiles.append(pd_vect)

# One row per element; stays 2-D even when there is a single element.
pd_avg = np.vstack(pd_profiles)

# A fixed random_state makes the cluster labels reproducible across runs.
gm_pd = GaussianMixture(n_components=num_clusters, random_state=0).fit(pd_avg)
labels = gm_pd.predict(pd_avg)

# element key -> cluster label
pd_clust_dict = dict(zip(elementkeys, labels))

# Occupancy Clustering

In [8]:
# Average each element's occupancy values over all of its dates, then
# cluster the elements on those average profiles.
#
# Elements with no dates are left out: they have nothing to average, and the
# previous per-date loop would have silently reused the matrix of the element
# processed just before them. Keys left out here are simply absent from
# occ_clust_dict.
elementkeys = [ekey for ekey in occ_dict if len(occ_dict[ekey]) > 0]

occ_profiles = []
for ekey in elementkeys:
    # The comprehension variable is `day`, not `date`, so the `date` class
    # imported from datetime at the top of the notebook is not overwritten.
    daily = [np.asarray(occ_dict[ekey][day]) for day in occ_dict[ekey]]
    # Stack once instead of growing the array with vstack on every date.
    occ_matrix = np.vstack(daily)
    # Rows of 13 values are averaged column-wise (presumably 13 time bins per
    # day -- TODO confirm), giving one 13-value profile per element.
    occ_vect = np.mean(occ_matrix.reshape((-1, 13)), axis=0)
    occ_profiles.append(occ_vect)

# One row per element; stays 2-D even when there is a single element.
occ_avg = np.vstack(occ_profiles)

# A fixed random_state makes the cluster labels reproducible across runs.
gm_occ = GaussianMixture(n_components=num_clusters, random_state=0).fit(occ_avg)
labels = gm_occ.predict(occ_avg)

# element key -> cluster label
occ_clust_dict = dict(zip(elementkeys, labels))

In [12]:
# One row per element key, with its coordinates and its label from each of
# the four clusterings.
#
# Lookups go through Series.map on the ElementKey column rather than a
# row-wise DataFrame.apply: apply(axis=1) builds a single-dtype Series for
# every row, which is slow and can upcast integer keys to float once float
# columns (Latitude/Longitude) are present in the frame.
clust_df = pd.DataFrame({'ElementKey': all_ekeys})

# Coordinates: keys missing from the lookup dictionaries yield a missing value.
clust_df['Latitude'] = clust_df['ElementKey'].map(lat_dict.get)
clust_df['Longitude'] = clust_df['ElementKey'].map(long_dict.get)

# Cluster labels: -1 marks an element that was not part of that clustering.
clust_df['PaidDemandCluster'] = clust_df['ElementKey'].map(lambda key: pd_clust_dict.get(key, -1))
clust_df['OccupancyCluster'] = clust_df['ElementKey'].map(lambda key: occ_clust_dict.get(key, -1))
clust_df['PaidDemandCorrCluster'] = clust_df['ElementKey'].map(lambda key: pd_corr_clust_dict.get(key, -1))
clust_df['OccupancyCorrCluster'] = clust_df['ElementKey'].map(lambda key: occ_corr_clust_dict.get(key, -1))

# Inspect the members of one occupancy-correlation cluster (label 5).
clust_df[clust_df.OccupancyCorrCluster==5]

Unnamed: 0,ElementKey,Latitude,Longitude,PaidDemandCluster,OccupancyCluster,PaidDemandCorrCluster,OccupancyCorrCluster
6,81113,47.618142,-122.33683,23,24,0,5
8,79365,47.617551,-122.336685,23,24,0,5
37,31302,47.617598,-122.337668,4,24,29,5
59,77230,47.616249,-122.340383,23,24,16,5
95,81685,47.617841,-122.330403,4,24,0,5
139,54410,47.61826,-122.340075,23,24,21,5
167,78937,47.616559,-122.331057,4,24,12,5
170,53805,,,4,24,0,5
176,57193,,47.617177,4,24,0,5
201,12886,47.616531,-122.333119,4,24,27,5
