In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import scipy.stats as st
import pickle
import utils
import moran_auto
import gmm
import gmm3
import kmeans_utils
import figure_functions
from sklearn.preprocessing import MinMaxScaler
from sklearn import mixture
from sklearn.cluster import KMeans
import csv
from collections import defaultdict

In [2]:
# Project folders, resolved relative to the notebook's working directory.
# Only the data and figure paths carry a trailing slash.
curr_dir = os.getcwd()
data_path = '{0}/../data/'.format(curr_dir)
fig_path = '{0}/../figs/'.format(curr_dir)
results_path = '{0}/../results'.format(curr_dir)
animation_path = '{0}/../animation'.format(curr_dir)

In [3]:
# Load the raw inputs; `params` keeps the full tuple returned by utils.load_data.
params = utils.load_data(data_path)
(gps_loc, avg_loads, park_data,
 N, P,
 idx_to_day_hour, day_hour_to_idx) = params

# Frame used by the analysis cells below, derived from the raw parking data.
park_data_new = utils.load_daily_data(park_data)

In [4]:
# Run gmm3's locational demand analysis on the daily data. The following cell
# iterates `results` as a sequence of indexable per-slot records.
results = gmm3.locational_demand_analysis(park_data_new, gps_loc, N)

In [5]:
# Split the per-slot records into parallel lists (one entry per record).
# NOTE(review): `means` is read from index 3 here, while other cells in this
# notebook read it from index 5 — confirm the tuple layout gmm3 returns.
days, hours, errors, means = [], [], [], []
for result in results:
    days.append(result[0])
    hours.append(result[1])
    errors.append(result[2])
    means.append(result[3])

In [10]:
# Keep only the rows recorded with day code 0 at hour 8, then display them.
on_day_0 = park_data_new['Day'] == 0
at_hour_8 = park_data_new['Hour'] == 8
data_df = park_data_new.loc[on_day_0 & at_hour_8]
data_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Load,Date,Hour,Day
Datetime,ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-01-05 08:00:00,1017,0.000000,2015-01-05,8,0
2015-01-05 08:00:00,1018,0.000000,2015-01-05,8,0
2015-01-05 08:00:00,1021,0.035056,2015-01-05,8,0
2015-01-05 08:00:00,1022,0.000000,2015-01-05,8,0
2015-01-05 08:00:00,1025,0.083241,2015-01-05,8,0
2015-01-05 08:00:00,1026,0.000000,2015-01-05,8,0
2015-01-05 08:00:00,1029,0.089618,2015-01-05,8,0
2015-01-05 08:00:00,1030,0.421007,2015-01-05,8,0
2015-01-05 08:00:00,1033,0.000000,2015-01-05,8,0
2015-01-05 08:00:00,1034,0.000000,2015-01-05,8,0


In [23]:
# Arrange the flat Load vector into N rows.
# NOTE(review): the recorded outputs show row 2 of this layout differing from
# the series of block 1021, whereas reshape((-1, N)).T reproduces that series —
# this layout appears to mix blocks within a row.
data = data_df['Load'].values.reshape((N, -1))

In [25]:
# Inspect row 2 of the current `data` array.
data[2]

array([ 0.11368687,  0.01279915,  0.08666667,  0.18329365,  0.13347222,
        0.        ,  0.        ,  0.        ,  0.        ,  0.04650794,
        0.00211806,  0.08326389])

In [22]:
# Load values for block ID 1021 (second index level) within the selected slot,
# used as the reference when checking the reshaped matrix.
data_df['Load'].xs(1021, level=1).values

array([ 0.03505556,  0.07502778,  0.04002778,  0.09002778,  0.1       ,
        0.18675   ,  0.05666667,  0.07672222,  0.06497222,  0.14838889,
        0.01005556,  0.09841667])

In [16]:
# Reshape to (rows of N values) and transpose, giving N rows in total.
# Assumes the frame is ordered by Datetime and then ID, as in the displayed
# slice, so that each transposed row follows one block — TODO confirm.
data = data_df['Load'].values.reshape((-1, N)).T

In [20]:
# Inspect row 2 of the current `data` array.
data[2]

array([ 0.03505556,  0.07502778,  0.04002778,  0.09002778,  0.1       ,
        0.18675   ,  0.05666667,  0.07672222,  0.06497222,  0.14838889,
        0.01005556,  0.09841667])

In [None]:
# Split the per-slot records into parallel lists (one entry per record):
# day, hour, error, the two Moran's I collections, and the mixture means.
days, hours, errors = [], [], []
morans_mixture, morans_adjacent, means = [], [], []
for result in results:
    days.append(result[0])
    hours.append(result[1])
    errors.append(result[2])
    morans_mixture.append(result[3])
    morans_adjacent.append(result[4])
    means.append(result[5])

In [None]:
# Display the error value collected for each record in `results`.
errors

In [None]:
# View the errors as a 6 x 10 grid; this raises unless there are exactly 60.
# NOTE(review): presumably days x hours — confirm against the data.
np.array(errors).reshape((6,10))

In [None]:
import itertools

In [None]:
# Distinct day and hour codes present in the data, ascending.
days = list(park_data_new['Day'].unique())
days.sort()
hours = list(park_data_new['Hour'].unique())
hours.sort()

# `iteration` selects which (day, hour) pair the following cells analyse.
iteration = 0

# Every (day, hour) pair, with the day varying slowest.
times = list(itertools.product(days, hours))

In [None]:
# Pick the (day, hour) pair chosen by `iteration` and unpack it.
time = times[iteration]
day, hour = time

In [None]:
# Alias the daily frame as `park_data`. This rebinds the `park_data` name that
# was unpacked from utils.load_data earlier in the notebook.
park_data = park_data_new

In [None]:
# Restrict the data to the (day, hour) slot selected above.
data_df = park_data.loc[(park_data['Day'] == day) & (park_data['Hour'] == hour)]

# Block identifiers come from the second index level, sorted ascending.
block_keys = sorted(data_df.index.get_level_values(1).unique().tolist())

# Each row is an element key, and each column is a date.
# The frame lists all blocks for one datetime before moving to the next, so the
# flat Load vector is date-major. reshape((N, -1)) would therefore mix several
# blocks into one row; reshaping to (dates, N) and transposing yields the
# intended (block, date) layout, matching data_df['Load'].xs(key, level=1).
# Assumes every date in the slot has all N blocks, ordered by ID.
data = data_df['Load'].values.reshape((-1, N)).T

# Number of dates available for this slot.
# NOTE(review): this rebinds the `P` unpacked from utils.load_data; cells that
# pass `P` to figure_functions afterwards will see this value instead.
P = data.shape[1]

# Accumulators filled by the fitting loop (one entry per training date).
average_accuracies = []

centers = []

morans_mixture = []
morans_adjacent = []

In [None]:
# Fitting the model for each date for the given day and hour combination.
# Uses print(...) and range(...) so the cell runs on both Python 2 and 3.
for train_time in range(P):

    # Features: the load on the training date followed by the GPS columns.
    train = np.hstack((data[:, train_time, None], gps_loc))

    # Saving the scaling so it can be applied to the test set as well.
    unscaled_loads = train[:, 0]
    scaler = MinMaxScaler().fit(train)
    train = scaler.transform(train)

    # Bound to `mixture_model` rather than `gmm`, so the imported `gmm` module
    # is not overwritten and stays callable in later cells.
    # NOTE(review): no random_state is set, so fits can differ between runs.
    mixture_model = mixture.GaussianMixture(n_init=200, n_components=4,
                                            covariance_type='diag').fit(train)

    # Scaling the mean and covariances back to GPS coordinates.
    # MinMaxScaler applies x * scale_ + min_, inverted here on the GPS columns.
    means = np.vstack([(mean[1:] - scaler.min_[1:]) / scaler.scale_[1:]
                       for mean in mixture_model.means_])
    covs = np.dstack([np.diag(cov[1:] / scaler.scale_[1:]**2)
                      for cov in mixture_model.covariances_]).T

    centers.append(means)

    train_labels = mixture_model.predict(train)

    # Moran's I statistics with weights derived from the mixture labels.
    weights = moran_auto.get_mixture_weights(train_labels, N)
    I = moran_auto.moran_mixture(unscaled_loads, train_labels, N)
    expectation = moran_auto.moran_expectation(N)
    variance = moran_auto.moran_variance(unscaled_loads, weights, N)
    z_score = moran_auto.z_score(I, expectation, variance)
    p_one_sided, p_two_sided = moran_auto.p_value(z_score)

    morans_mixture.append([I, expectation, variance, z_score, p_one_sided, p_two_sided])

    # Same statistics with weights derived from the block keys.
    weights = moran_auto.get_adjacent_weights(block_keys, N)
    I = moran_auto.moran_adjacent(unscaled_loads, block_keys, N)
    expectation = moran_auto.moran_expectation(N)
    variance = moran_auto.moran_variance(unscaled_loads, weights, N)
    z_score = moran_auto.z_score(I, expectation, variance)
    p_one_sided, p_two_sided = moran_auto.p_value(z_score)

    morans_adjacent.append([I, expectation, variance, z_score, p_one_sided, p_two_sided])

    accuracies = []

    # For each other day of data, predict using model that was fit.
    for test_time in range(P):

        if test_time == train_time:
            continue

        test = np.hstack((data[:, test_time, None], gps_loc))

        # Reuse the scaler fitted on the training date.
        test = scaler.transform(test)

        test_labels = mixture_model.predict(test)

        # Fraction of blocks given the same component label on both dates.
        accuracy = float(np.mean(train_labels == test_labels))

        accuracies.append(accuracy)

    print(accuracies)

    # Getting average prediction accuracy over all test sets.
    average_accuracies.append(np.array(accuracies).mean())
    print('These are average')
    print(average_accuracies)

# # Average error for the particular day and hour combination.
# time_avg_accuracy = round(100.0 - np.array(average_accuracies).mean() * 100, 2)

# result = (day, hour, time_avg_accuracy, morans_mixture, morans_adjacent, centers)

In [None]:
# Component labels predicted for the last training date the loop processed.
train_labels

In [None]:
# Component labels predicted for the last test date the loop evaluated.
test_labels

In [None]:
# Produce the model-selection figure from the average loads.
# NOTE(review): `P` is rebound by the analysis cells (P = data.shape[1]);
# confirm which value is intended when this runs after them.
figure_functions.model_selection(avg_loads, gps_loc, P, fig_path)

In [None]:
# Build the animation from the average loads; receives both the figure and
# animation folders.
# NOTE(review): `P` is rebound by the analysis cells (P = data.shape[1]);
# confirm which value is intended when this runs after them.
figure_functions.create_animation(avg_loads, gps_loc, N, P, fig_path, animation_path)

In [None]:
# Mean of avg_loads along axis 1, reshaped into a single column.
avg_loads.mean(axis=1).reshape((-1,1))

In [None]:
# Spatial heterogeneity figure for time index 1, shown immediately.
fig, ax = figure_functions.spatial_heterogeneity(
    loads=avg_loads,
    time=1,
    N=N,
    fig_path=fig_path,
)
plt.show()

In [None]:
# Time index used for the 'friday_6pm' figures. Defined here because this cell
# previously relied on `time2` from a later cell and raised NameError when the
# notebook was run top to bottom.
time2 = 9

# Mixture plot for that single time index, saved as friday_6pm_gmm.png.
fig, ax = figure_functions.mixture_plot(loads=avg_loads, gps_loc=gps_loc,
                                        times=[time2], N=N, fig_path=fig_path,
                                        shape=(1,1), filename='friday_6pm_gmm.png',
                                        title='')

In [None]:
# Time indices used by the figures below.
# NOTE(review): file names and titles label these as Friday 10 AM / 6 PM;
# confirm that indices 2 and 9 map to those slots.
time = time1 = 2
time2 = 9

# Keyword arguments shared by the load-over-map figure helpers.
_map_kwargs = dict(loads=avg_loads, gps_loc=gps_loc, N=N, fig_path=fig_path)

# One single-panel mixture plot per time index.
for _t, _png in [(time1, 'friday_10am_gmm.png'), (time2, 'friday_6pm_gmm.png')]:
    fig, ax = figure_functions.mixture_plot(times=[_t], shape=(1,1),
                                            filename=_png, title='',
                                            **_map_kwargs)

fig, ax = figure_functions.interpolation(time=time, **_map_kwargs)

fig, ax = figure_functions.triangular_grid(time=time, **_map_kwargs)

# Ten-level contour plots for the same two time indices.
for _t, _title, _png in [
        (time1, 'Friday 10:00 AM Average Load Contours', 'friday_10am.png'),
        (time2, 'Friday 6:00 PM Average Load Contours', 'friday_6pm.png')]:
    fig, ax = figure_functions.contour_plot(time=_t, title=_title,
                                            filename=_png, contours=10,
                                            **_map_kwargs)

fig, ax = figure_functions.surface_plot(
    loads=avg_loads, gps_loc=gps_loc, time=time, fig_path=fig_path)

fig, ax = figure_functions.voronoi(gps_loc=gps_loc, N=N, fig_path=fig_path)

fig, ax = figure_functions.spatial_heterogeneity(
    loads=avg_loads, time=time, N=N, fig_path=fig_path)

fig, ax = figure_functions.temporal_heterogeneity(
    loads=avg_loads, time=time, P=P, fig_path=fig_path)

fig, ax = figure_functions.temporal_day_plots(
    loads=avg_loads, P=P, fig_path=fig_path)

fig, ax = figure_functions.temporal_hour_plots(
    loads=avg_loads, fig_path=fig_path)

In [None]:
# Scratch check of the modulo operator (evaluates to 2).
2%10

In [None]:
# Select the first entry of `times`.
time = times[0]

In [None]:
# Full analysis of one (day, hour) slot: fit a mixture model on each date,
# score it on every other date, and collect Moran's I statistics.
# Index with `iteration`; the builtin `iter` is not a valid list index.
time = times[iteration]
day, hour = time

data_df = park_data.loc[(park_data['Day'] == day) & (park_data['Hour'] == hour)]

# Block identifiers come from the second index level, sorted ascending.
block_keys = sorted(data_df.index.get_level_values(1).unique().tolist())

# Each row is an element key, and each column is a date.
# The frame lists all blocks for one datetime before moving to the next, so the
# flat Load vector is date-major. reshape((N, -1)) would mix several blocks
# into one row; reshaping to (dates, N) and transposing gives (block, date).
# Assumes every date in the slot has all N blocks, ordered by ID.
data = data_df['Load'].values.reshape((-1, N)).T

# Number of dates available for this slot.
P = data.shape[1]

average_accuracies = []

centers = []

morans_mixture = []
morans_adjacent = []

# Fitting the model for each date for the given day and hour combination.
for train_time in range(P):

    # Features: the load on the training date followed by the GPS columns.
    train = np.hstack((data[:, train_time, None], gps_loc))

    # Saving the scaling so it can be applied to the test set as well.
    unscaled_loads = train[:, 0]
    scaler = MinMaxScaler().fit(train)
    train = scaler.transform(train)

    # Bound to `mixture_model` rather than `gmm`, so the imported `gmm` module
    # is not overwritten and stays callable in later cells.
    # NOTE(review): no random_state is set, so fits can differ between runs.
    mixture_model = mixture.GaussianMixture(n_init=200, n_components=4,
                                            covariance_type='diag').fit(train)

    # Scaling the mean and covariances back to GPS coordinates.
    # MinMaxScaler applies x * scale_ + min_, inverted here on the GPS columns.
    means = np.vstack([(mean[1:] - scaler.min_[1:]) / scaler.scale_[1:]
                       for mean in mixture_model.means_])
    covs = np.dstack([np.diag(cov[1:] / scaler.scale_[1:]**2)
                      for cov in mixture_model.covariances_]).T

    centers.append(means)

    train_labels = mixture_model.predict(train)

    # Moran's I statistics with weights derived from the mixture labels.
    weights = moran_auto.get_mixture_weights(train_labels, N)
    I = moran_auto.moran_mixture(unscaled_loads, train_labels, N)
    expectation = moran_auto.moran_expectation(N)
    variance = moran_auto.moran_variance(unscaled_loads, weights, N)
    z_score = moran_auto.z_score(I, expectation, variance)
    p_one_sided, p_two_sided = moran_auto.p_value(z_score)

    morans_mixture.append([I, expectation, variance, z_score, p_one_sided, p_two_sided])

    # Same statistics with weights derived from the block keys.
    weights = moran_auto.get_adjacent_weights(block_keys, N)
    I = moran_auto.moran_adjacent(unscaled_loads, block_keys, N)
    expectation = moran_auto.moran_expectation(N)
    variance = moran_auto.moran_variance(unscaled_loads, weights, N)
    z_score = moran_auto.z_score(I, expectation, variance)
    p_one_sided, p_two_sided = moran_auto.p_value(z_score)

    morans_adjacent.append([I, expectation, variance, z_score, p_one_sided, p_two_sided])

    accuracies = []

    # For each other day of data, predict using model that was fit.
    for test_time in range(P):

        if test_time == train_time:
            continue

        test = np.hstack((data[:, test_time, None], gps_loc))

        # Reuse the scaler fitted on the training date.
        test = scaler.transform(test)

        test_labels = mixture_model.predict(test)

        # Fraction of blocks given the same component label on both dates.
        accuracy = float(np.mean(train_labels == test_labels))

        accuracies.append(accuracy)

    # Getting average prediction accuracy over all test sets.
    average_accuracies.append(np.array(accuracies).mean())

# Average error for the particular day and hour combination.
time_avg_accuracy = round(100.0 - np.array(average_accuracies).mean() * 100, 2)

# `return` is a SyntaxError at cell level, so the tuple is left in `result`.
result = (day, hour, time_avg_accuracy, morans_mixture, morans_adjacent, centers)

# Train and Test GMM

In [None]:
# Reload the daily data, rebinding park_data, gps_loc and N.
# NOTE(review): an earlier cell calls utils.load_daily_data with a DataFrame
# and binds a single return value; here it receives a path and is unpacked
# into three values — confirm which signature the current utils module has.
park_data, gps_loc, N = utils.load_daily_data(data_path)

In [None]:
# All Load values as an array ordered from largest to smallest.
load_values = park_data['Load'].values.tolist()
load_values.sort(reverse=True)
check = np.array(load_values)

In [None]:
# Percentage of Load observations strictly greater than 1.5.
np.count_nonzero(check > 1.5) / float(len(check)) * 100

In [None]:
# Scratch frame: one 'check' column holding 0-9, indexed 0-9.
pd.DataFrame(range(10), index=range(10), columns=['check'])

In [None]:
# Run the locational demand analysis from the `gmm` module on the reloaded
# data. `gmm` must still refer to the imported module when this cell runs, not
# to a fitted model object.
results = gmm.locational_demand_analysis(park_data, gps_loc, N)

In [None]:
# Split the per-slot records into parallel lists (one entry per record):
# day, hour, error, the two Moran's I collections, and the mixture means.
days, hours, errors = [], [], []
morans_mix, morans_adj, means = [], [], []
for result in results:
    days.append(result[0])
    hours.append(result[1])
    errors.append(result[2])
    morans_mix.append(result[3])
    morans_adj.append(result[4])
    means.append(result[5])

In [None]:
# Score the fitted means per time slot. This rebinds `times`; the following
# cells treat its elements as integer time indices (they apply / and % to them).
scores, times = kmeans_utils.get_time_scores(means)

In [None]:
# For each hour and each day, remember the first time index encountered in
# `times`. Day picks skip the hours 8, 15, 16 and 17.
best_times = {}
best_days = {}
for time in times:
    # Twelve slots per day, starting at hour 8. Floor division keeps `day` an
    # integer dictionary key on Python 3 as well as Python 2.
    day = time // 12
    hour = time % 12 + 8
    if hour not in best_times:
        best_times[hour] = time
    if day not in best_days and hour not in [8, 15, 16, 17]:
        # Same "day hour" text as the Python 2 print statement produced.
        print('{0} {1}'.format(day, hour))
        best_days[day] = time

In [None]:
# Keep every time index whose hour is neither 8 nor 9, and cluster that slot's
# fitted centres into three groups.
good_times = []

for time in times:
    if (time % 12 + 8) not in (8, 9):
        good_times.append(time)

        # Stack the centres collected for this slot into one array.
        data = np.vstack((means[time]))
        kmeans = KMeans(n_clusters=3).fit(data)
        labels = kmeans.labels_.tolist()

In [None]:
# Replace good_times with the time index stored for each day in best_days.
# NOTE(review): on Python 3 this is a dict view rather than a list — confirm
# the plotting helper accepts it.
good_times = best_days.values()

In [None]:
# Plot the fitted centres for the selected time indices.
# shape=(2,3) is presumably the panel grid — confirm against centroid_plots.
fig, ax = figure_functions.centroid_plots(means, gps_loc, N, times=good_times, fig_path=fig_path, shape=(2,3))