In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import math
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
def normalize(df):
    mindf = df.min()
    maxdf = df.max()
    return (df-mindf)/(maxdf-mindf)

In [3]:
def denormalize(norm, _min, _max):
    return [(n * (_max-_min)) + _min for n in norm]

In [4]:
#Set target and input variables 
target_station = 'WTG01'

#All neighbor stations with residual correlation greater than .90
neighbor_stations_90 = ['WTG01','WTG02','WTG03','WTG05','WTG06']

In [5]:
df = pd.read_pickle("df_wind_speed.pkl")
df_ssa_clean = pd.read_pickle("df_wind_speed_ssa_clean.pkl")
df_ssa_residual = pd.read_pickle("df_wind_speed_ssa_residual.pkl")

In [6]:
# Get data form the interval of interest
#interval = ((df.index >= '2017-05') & (df.index <= '2018-05'))
interval = '2017-06'
df = df.loc[interval]
df_ssa_clean = df_ssa_clean.loc[interval]
df_ssa_residual = df_ssa_residual.loc[interval]

In [7]:
#Normalize Data

# Save Min-Max for Denorm
min_raw = df[target_station].min()
min_clean = df_ssa_clean[target_station].min()
min_residual = df_ssa_residual[target_station].min()

max_raw = df[target_station].max()
max_clean = df_ssa_clean[target_station].max()
max_residual = df_ssa_residual[target_station].max()

# Perform Normalization
norm_df = normalize(df)
norm_df_ssa_clean = normalize(df_ssa_clean)
norm_df_ssa_residual = normalize(df_ssa_residual)

## K-Means Clustering

In [8]:
from sklearn.cluster import MiniBatchKMeans

In [9]:
from ggplot import *

You can access Timestamp as pandas.Timestamp
  pd.tslib.Timestamp,
  from pandas.lib import Timestamp
  from pandas.core import datetools


In [10]:
def kmeans_clustering(data, partitions, batch_size, init_size):
    clusterer = MiniBatchKMeans(init='k-means++', n_clusters=partitions, batch_size=batch_size, init_size=init_size,
                                        n_init=1, verbose=False)
    data_labels = clusterer.fit_predict(data)
    centroids = clusterer.cluster_centers_
    
    return data_labels, centroids

In [20]:
data = norm_df[neighbor_stations_90].values

In [21]:
nclusters = 10

In [22]:
labels, centers = kmeans_clustering(data, nclusters, 100, 100)

### PCA Transformation

In [23]:
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
pca_result = pca.fit_transform(data)
pca_df = pd.DataFrame()

pca_df['pca-one'] = pca_result[:,0]
pca_df['pca-two'] = pca_result[:,1] 
#pca_df['pca-three'] = pca_result[:,2]
pca_df['label'] = labels

print ('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))

Explained variation per principal component: [ 0.96547166  0.01799783  0.00696812]


In [31]:
from ggplot import *

chart = ggplot( pca_df, aes(x='pca-one', y='pca-two', color='label') ) \
        + geom_point(size=75,alpha=0.8) \
        + ggtitle("First and Second Principal Components colored by digit")
chart

<ggplot: (-9223371874719619619)>

### TSNE Transformation

In [16]:
import time

from sklearn.manifold import TSNE

time_start = time.time()
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(data)

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 4129
[t-SNE] Computed conditional probabilities for sample 2000 / 4129
[t-SNE] Computed conditional probabilities for sample 3000 / 4129
[t-SNE] Computed conditional probabilities for sample 4000 / 4129
[t-SNE] Computed conditional probabilities for sample 4129 / 4129
[t-SNE] Mean sigma: 0.011691
[t-SNE] Error after 100 iterations with early exaggeration: 1.292907
[t-SNE] Error after 300 iterations: 1.141771


In [17]:
df_tsne = pd.DataFrame()
df_tsne['x-tsne'] = tsne_results[:,0]
df_tsne['y-tsne'] = tsne_results[:,1]
df_tsne['label'] = labels

In [18]:
chart = ggplot( df_tsne, aes(x='x-tsne', y='y-tsne', color='label') ) \
        + geom_point(size=100) \
        + ggtitle("tSNE dimensions colored by cluster") \
        #+ scale_colour_gradient(low="coral", high="steelblue")
    
chart

<ggplot: (162247270895)>

## DBSCAN Clustering

In [19]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler

### K-distance plot

In [20]:
def k_distances2(x, k):
    dim0 = x.shape[0]
    dim1 = x.shape[1]
    p=-2*x.dot(x.T)+np.sum(x**2, axis=1).T+ np.repeat(np.sum(x**2, axis=1),dim0,axis=0).reshape(dim0,dim0)
    p = np.sqrt(p)
    p.sort(axis=1)
    p=p[:,:k]
    pm= p.flatten()
    pm= np.sort(pm)
    return p, pm

In [21]:
tnse_values = df_tsne[['x-tsne', 'y-tsne']].values

In [22]:
#k = 2 ** data.shape[1]
k = math.floor(math.log(len(tnse_values)))
m, m2= k_distances2(tnse_values, k)
plt.plot(m2)
plt.ylabel("k-distances")
plt.grid(True)
plt.show()



In [23]:
db = DBSCAN(eps=0.9, min_samples=k).fit(data)

In [24]:
np.unique(db.labels_)

array([0], dtype=int64)

In [25]:
df_tsne['label'] = db.labels_

In [26]:
chart = ggplot( df_tsne, aes(x='x-tsne', y='y-tsne', color='label') ) \
        + geom_point(size=100) \
        + ggtitle("tSNE dimensions colored by cluster") \
        #+ scale_colour_gradient(low="coral", high="steelblue")
    
chart

  cbook._putmask(xa, xa < 0.0, -1)


<ggplot: (-9223371874719647544)>