In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import scipy.stats as st
import pickle
import utils
import moran_auto
import gmm
import figure_functions
from sklearn.preprocessing import MinMaxScaler
from sklearn import mixture
from sklearn.cluster import KMeans

In [2]:
# Data and figure directories are siblings of the notebook's working directory.
curr_dir = os.getcwd()
data_path, fig_path = (curr_dir + suffix for suffix in ('/../data/', '/../figs/'))

In [3]:
# Load the daily data set through the project-local helper; it returns the
# three values unpacked here.
# NOTE(review): N is presumably the number of elements/locations (it is used
# as the reshape width and as the sample count below) -- confirm in utils.
park_data, gps_loc, N = utils.load_daily_data(data_path)

In [12]:
# Distinct values of the second index level (level 1) of park_data, as a list.
# NOTE(review): presumably these are the element keys -- confirm against utils.
keys = park_data.index.get_level_values(1).unique().tolist()

In [5]:
# Observations recorded for Day 0 at Hour 10.
data_df = park_data.loc[(park_data['Day'] == 0) & (park_data['Hour'] == 10)]

# Each row is an element key, and each column is a date.
data = data_df['Load'].values.reshape((-1, N)).T

P = data.shape[1]

# Feature matrix: the load column at index 8 of `data`, followed by the GPS columns.
train = np.hstack((data[:, 8, None], gps_loc))

# Loads in their original units, kept for the Moran's I statistics.
unscaled_loads = train[:, 0]

# Saving the scaling so it can be applied to the test set as well.
scaler = MinMaxScaler().fit(train)
train = scaler.transform(train)

# The fitted model is deliberately NOT called `gmm`: that name belongs to the
# project module imported at the top of the notebook, and rebinding it here
# makes any later `gmm.<function>(...)` call fail with an AttributeError.
mixture_model = mixture.GaussianMixture(n_init=200, n_components=4,
                                        covariance_type='diag').fit(train)

# Scaling the mean and covariances back to GPS coordinates.
# MinMaxScaler maps x -> x * scale_ + min_, so means are inverted with
# (m - min_) / scale_ and (diagonal) variances with v / scale_**2.
# Column 0 is the load, so only columns 1: (the GPS columns) are kept.
means = np.vstack(([(mean[1:] - scaler.min_[1:])/(scaler.scale_[1:])
                    for mean in mixture_model.means_]))
covs = np.dstack(([np.diag((cov[1:])/(scaler.scale_[1:]**2))
                   for cov in mixture_model.covariances_])).T

# Component assigned to each training point.
train_labels = mixture_model.predict(train)

In [6]:
# Moran's I with weights derived from the mixture-component labels.
n_obs = len(data)

w = moran_auto.get_mixture_weights(train_labels, n_obs)
I = moran_auto.moran_mixture(unscaled_loads, train_labels, n_obs)
var = moran_auto.moran_variance(unscaled_loads, w, n_obs)
e = moran_auto.moran_expectation(N)

# Standardise the statistic and convert it to one- and two-sided p-values.
z = moran_auto.z_score(I, e, var)
p_one_sided, p_two_sided = moran_auto.p_value(z)

In [7]:
# Report I, its variance, its expectation and the z-score, one per line,
# followed by the one- and two-sided p-values on a single line.
for statistic in (I, var, e, z):
    print('%s' % (statistic,))
print('%s %s' % (p_one_sided, p_two_sided))

0.119248435147
5.37163254567e-05
-0.00392156862745
16.8055142518
1.11187822394e-63 2.22375644789e-63


In [13]:
# Moran's I with weights derived from adjacency between the element keys.
n_obs = len(data)

w = moran_auto.get_adjacent_weights(keys, n_obs, data_path)
I = moran_auto.moran_adjacent(unscaled_loads, keys, n_obs, data_path)
var = moran_auto.moran_variance(unscaled_loads, w, n_obs)
e = moran_auto.moran_expectation(N)

# Standardise the statistic and convert it to one- and two-sided p-values.
z = moran_auto.z_score(I, e, var)
p_one_sided, p_two_sided = moran_auto.p_value(z)

In [14]:
# Report I, its variance, its expectation and the z-score, one per line,
# followed by the one- and two-sided p-values on a single line.
for statistic in (I, var, e, z):
    print('%s' % (statistic,))
print('%s %s' % (p_one_sided, p_two_sided))

0.0894593982559
0.000852057230605
-0.00392156862745
3.19907176873
0.000689354212739 0.00137870842548


In [None]:
# Run the project-local `gmm` module's analysis over the loaded data set.
# NOTE(review): this needs `gmm` to be the imported module, not a fitted model
# object -- confirm that no earlier cell has rebound the name.
results = gmm.locational_demand_analysis(park_data, gps_loc, N)

In [None]:
# Split the per-run result records into one list per field
# (positions 0-4: day, hour, error, Moran statistic, means).
days, hours, errors, morans, means = [], [], [], [], []
for result in results:
    days.append(result[0])
    hours.append(result[1])
    errors.append(result[2])
    morans.append(result[3])
    means.append(result[4])

In [None]:
# Collect the first twelve usable time indices and cluster the entries of
# `means` stored for each of them.
# NOTE(review): `times` is not defined in any cell visible in this notebook;
# presumably it is a sequence of integer indices into `means` -- confirm and
# define it before this cell so the notebook survives Restart & Run All.
good_times = []
for time in times:
    # Skip indices whose position within each block of ten is 0 or 1
    # (the original test was `time % 10 + 8 in [8, 9]`).
    if time % 10 in (0, 1):
        continue

    good_times.append(time)

    # Stack the entries for this time index and split them into four clusters.
    # `data` and `labels` are left holding the values from the last iteration.
    data = np.vstack((means[time]))
    kmeans = KMeans(n_clusters=4).fit(data)
    labels = kmeans.labels_.tolist()

    # No figure is opened here: nothing is plotted in this loop, and an empty
    # figure per iteration would stay alive in pyplot until explicitly closed.

    if len(good_times) == 12:
        break

In [None]:
# Number of time indices kept by the filtering loop (it stops at 12).
len(good_times)

In [None]:
# Draw the centroid plots for the selected time indices via the project helper.
# NOTE(review): shape=(3, 4) presumably lays the panels out on a 3 x 4 grid,
# matching the twelve entries of good_times -- confirm in figure_functions.
fig, ax = figure_functions.centroid_plots(
    means, gps_loc, N,
    times=good_times,
    fig_path=fig_path,
    shape=(3, 4)
)
plt.show()

In [None]:
# Scratch check: sum of a three-element array (evaluates to 8).
np.array([1,2,5]).sum()

In [None]:
# Balanced cluster initialisation:
#   1. pick k centroids from `data` with k-means++ style seeding;
#   2. assign points one at a time, in order of how much they "care" about
#      their choice (gap between nearest and farthest centroid), closing a
#      cluster as soon as it holds n // k points;
#   3. plot the resulting assignment.
# Leaves `k`, `n`, `means` and `old_labels` defined for the following cells.
k = 4

n = len(data)


def _squared_distances(points, centers):
    """Squared Euclidean distance from every row of `points` to every row of `centers`.

    Both arguments are 2-D arrays with the same number of columns; the result
    has shape (len(points), len(centers)).
    """
    return np.linalg.norm(points[:, None, :] - centers[None, :, :], axis=2) ** 2


def _assignment_order(dists):
    """Point indices sorted by decreasing |nearest - farthest| distance.

    NaN entries (closed clusters) are ignored when taking the row min and max.
    """
    gap = np.abs(np.nanmin(dists, axis=1) - np.nanmax(dists, axis=1))
    return gap.argsort()[::-1]


# Re-seed numpy's global generator from fresh entropy; pass an integer here
# instead if a reproducible run is wanted.
np.random.seed()

# Choosing the first cluster centroid uniformly at random.
init = np.random.choice(range(n), 1, replace=False).tolist()
means = data[init]

# Choosing the rest of the cluster centroid initializations.
for step in range(1, k):

    # Finding minimum squared distance for each point to an existing center.
    dist_mins = _squared_distances(data, means).min(axis=1)

    # Sampling with probability proportional to the minimum squared distance.
    prob_weights = dist_mins / dist_mins.sum()
    sample = np.random.multinomial(1, prob_weights).tolist()
    sample_choice = sample.index(1)

    means = np.vstack((means, data[sample_choice]))

# The centroids are fixed from here on, so the point-to-centroid distances are
# computed once and only masked (never recomputed) as clusters fill up.
base_dists = _squared_distances(data, means)

order = _assignment_order(base_dists)
preferences = np.argmin(base_dists, axis=1)

# NaN marks a point that has not been assigned yet.
old_labels = np.nan * np.zeros(n)

# Explicit floor division: every cluster is closed once it holds n // k points.
capacity = n // k
cluster_sizes = [0] * k
removed_labels = []
cursor = 0

for step in range(n):

    # Advance to the highest-priority point that is still unassigned.
    while not np.isnan(old_labels[order[cursor]]):
        cursor += 1

    point = order[cursor]
    old_labels[point] = preferences[point]
    cluster_sizes[int(preferences[point])] += 1
    cursor += 1

    for j in range(k):

        if cluster_sizes[j] == capacity and j not in removed_labels:

            removed_labels.append(j)

            # Once every cluster has been closed, any leftover points
            # (n % k of them) keep the preferences computed for the last
            # open cluster.
            if len(removed_labels) == k:
                break

            # Hide the closed clusters and re-rank all points against the
            # clusters that are still open.
            masked_dists = base_dists.copy()
            masked_dists[:, removed_labels] = np.nan

            order = _assignment_order(masked_dists)
            preferences = np.nanargmin(masked_dists, axis=1)

            cursor = 0

            break

plt.figure(figsize=(18,16))

colors = {0: 'blue', 1: 'red', 2: 'green', 3: 'orange'}

# One scatter call for all points, coloured by the label assigned above
# (labels are stored as floats, hence the int() before the lookup).
point_colors = [colors[int(label)] for label in old_labels]
plt.scatter(data[:, 0], data[:, 1], color=point_colors, s=100)
plt.show()

In [None]:
# Display the balanced initial assignment (one float label per point).
old_labels

In [None]:
not_converged = True

# NOTE: the loop body ends with an unconditional break, so exactly one
# refinement pass is performed and `not_converged` is never updated.
while not_converged:
    # Centroid of each cluster under the current assignment.
    means = np.array([data[old_labels == label].mean(axis=0).tolist()
                      for label in range(k)])

    # Squared distance from every point to every updated centroid.
    new_squared_dists = np.array([[np.linalg.norm(data[point] - means[center])**2
                                   for center in range(k)]
                                  for point in range(n)])

    # Closest centroid for each point.
    new_preferences = np.argmin(new_squared_dists, axis=1)

    # Per-point distances in descending order (largest first).
    sorted_dists = np.sort(new_squared_dists, axis=1)[:, ::-1]

    # Gap between the two largest distances, paired with the point index and
    # ordered from the largest gap to the smallest.
    gap = np.abs(sorted_dists[:, 0] - sorted_dists[:, 1]).reshape((-1, 1))
    delta = np.hstack((gap, np.arange(n).reshape((-1, 1))))
    delta = delta[delta[:, 0].argsort()][::-1]

    # Points whose closest centroid differs from their current label, as
    # [point index, current label, preferred label].
    changes = [[idx, old_labels[int(idx)], new_preferences[int(idx)]]
               for idx in delta[:, 1]
               if new_preferences[int(idx)] != old_labels[int(idx)]]

    break

In [None]:
# Display the candidate reassignments: [point index, current label, preferred label].
changes

In [None]:
# Peek at the first five rows of the point-to-centroid squared distances.
new_squared_dists[:5]

In [None]:
# Display the index of the closest centroid for every point.
new_preferences