In [1]:
from sklearn.cluster import MeanShift, estimate_bandwidth

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import math

import os
import sys
from numpy.fft import fft, ifft
import glob

In [2]:
def remove_periodic(X, df_index, detrending=True, model='additive', frequency_threshold=0.1e12):
    """Split a series into a high-power spectral component and a residual.

    The series is taken to the frequency domain with an FFT, every bin whose
    power (squared magnitude) is not strictly greater than
    ``frequency_threshold`` is zeroed, and the surviving bins are transformed
    back. That reconstruction is returned as ``clean``; ``residual`` is the
    original series minus ``clean``.

    Parameters
    ----------
    X : array-like
        One-dimensional sequence of numeric samples.
    df_index : array-like
        Index labels for the samples. Only used to build an intermediate
        ``pd.Series``, so its length must match ``X``.
    detrending : bool, default True
        When true, the mean of ``X`` is subtracted before the FFT and added
        back to the reconstruction. Only the mean is removed; no slope is
        fitted.
    model : str, default 'additive'
        Unused. Kept so existing callers that pass it keep working.
    frequency_threshold : float, default 0.1e12
        Power threshold; bins with power at or below it are discarded.

    Returns
    -------
    residual : numpy.ndarray
        ``X - clean``.
    clean : numpy.ndarray
        Real part of the inverse FFT of the retained bins (plus the mean when
        ``detrending`` is true).
    """
    rad = np.asarray(X)

    mean_level = np.average(rad) if detrending else None
    det_rad = rad - mean_level if detrending else rad

    spectrum = fft(det_rad)

    # Power spectrum, computed on the whole array instead of element by element.
    power = np.abs(spectrum) ** 2

    # Keep only the bins that carry more power than the threshold.
    filtered_spectrum = np.where(power > frequency_threshold, spectrum, 0)

    # The imaginary part of the inverse transform is discarded.
    reconstructed = ifft(filtered_spectrum).real

    if detrending:
        reconstructed = reconstructed + mean_level

    # Going through a Series keeps the original length check against df_index.
    clean = pd.Series(reconstructed, index=df_index).values
    residual = rad - clean
    return residual, clean

In [3]:
def load_data(path, resampling=None):
    """Load every ``*.txt`` CSV file found under ``path`` into one frame.

    Files are discovered recursively, parsed as comma-separated text with a
    header row, and indexed by their ``datetime`` column. The per-file frames
    are concatenated and sorted by timestamp, so the forward fill below always
    propagates the chronologically previous value regardless of the order in
    which the files were discovered.

    Parameters
    ----------
    path : str
        Root directory to search.
    resampling : str or None, default None
        Optional pandas resampling rule; each bin is reduced with the mean.
        Common rules: 'H' - hourly, '15min' - 15 minutes, 'M' - monthly.
        More options at:
        http://benalexkeen.com/resampling-time-series-data-with-pandas/

    Returns
    -------
    pandas.DataFrame
        The combined, time-sorted data with missing values forward-filled.

    Raises
    ------
    ValueError
        If no ``*.txt`` file exists under ``path``.
    """
    frames = [
        pd.read_csv(file_, index_col="datetime", parse_dates=['datetime'], header=0, sep=",")
        for file_ in glob.iglob(path + "/**/*.txt", recursive=True)
    ]
    if not frames:
        # pd.concat would raise a ValueError here too, but without naming the path.
        raise ValueError("No .txt files found under %r" % (path,))

    # Stable sort: rows sharing a timestamp keep their concatenation order.
    frame = pd.concat(frames).sort_index(kind='mergesort')
    if resampling is not None:
        frame = frame.resample(resampling).mean()
    return frame.ffill()

In [11]:
# Root directory of the processed NREL Oahu files. Set the OAHU_DATA_DIR
# environment variable to point elsewhere without editing this cell; the
# fallback is the original hard-coded location.
path = os.environ.get(
    'OAHU_DATA_DIR',
    '/Users/cseveriano/spatio-temporal-forecasting/data/processed/NREL/Oahu',
)

df = load_data(path)

# Fix the column order: the station names are assigned by position, so the
# list length must match the number of loaded columns.
df.columns = ['DHHL_3','DHHL_4', 'DHHL_5', 'DHHL_10', 'DHHL_11', 'DHHL_9', 'DHHL_2', 'DHHL_1', 'DHHL_1_Tilt', 'AP_6', 'AP_6_Tilt', 'AP_1', 'AP_3', 'AP_5', 'AP_4', 'AP_7', 'DHHL_6', 'DHHL_7', 'DHHL_8']
# The beginning of the record has measurement failures, so it is discarded.
df = df.loc[df.index > '2010-03-20']
# Remove the two *_Tilt columns; reassigning avoids mutating a .loc slice in place.
df = df.drop(['DHHL_1_Tilt', 'AP_6_Tilt'], axis=1)

## Preparação das bases de treinamento e teste

In [12]:
# Decompose every station column into its high-power spectral component
# ("clean") and what is left over ("residual"). The arrays are collected first
# and each frame is built once, instead of preallocating object-dtype frames
# and round-tripping every column through a Python list.
clean_columns = {}
residual_columns = {}

# `col` is deliberately a plain loop variable: a later cell reads the value it
# is left holding after the loop.
for col in df.columns:
    residual, clean = remove_periodic(df[col].values, df.index, frequency_threshold=0.01e12)
    clean_columns[col] = clean
    residual_columns[col] = residual

clean_df = pd.DataFrame(clean_columns, index=df.index, columns=df.columns)
residual_df = pd.DataFrame(residual_columns, index=df.index, columns=df.columns)

In [13]:
def _rows_between(frame, start, end, include_end):
    """Return the rows of ``frame`` whose index lies between ``start`` and ``end``.

    ``start`` is always inclusive; ``end`` is inclusive only when
    ``include_end`` is true.
    """
    stamps = frame.index
    below_end = (stamps <= end) if include_end else (stamps < end)
    return frame[(stamps >= start) & below_end]


# Training window: 2010-09-01 through 2011-09-01, both bounds inclusive.
train_df, train_clean_df, train_residual_df = (
    _rows_between(source, '2010-09-01', '2011-09-01', True)
    for source in (df, clean_df, residual_df)
)

# Test window: 2010-08-05 up to, but not including, 2010-08-06.
# NOTE(review): this day precedes the training window - confirm that is intended.
test_df, test_clean_df, test_residual_df = (
    _rows_between(source, '2010-08-05', '2010-08-06', False)
    for source in (df, clean_df, residual_df)
)

In [20]:
# (latitude, longitude) pairs, one per station.
# NOTE(review): presumably listed in the same order as df.columns - verify.
_station_coordinates = [
    (21.31236, -158.08463),
    (21.31303, -158.08505),
    (21.31357, -158.08424),
    (21.31183, -158.08554),
    (21.31042, -158.0853),
    (21.31268, -158.08688),
    (21.31451, -158.08534),
    (21.31533, -158.087),
    (21.30812, -158.07935),
    (21.31276, -158.08389),
    (21.31281, -158.08163),
    (21.30983, -158.08249),
    (21.31141, -158.07947),
    (21.31478, -158.07785),
    (21.31179, -158.08678),
    (21.31418, -158.08685),
    (21.31034, -158.08675),
]
lat = [latitude for latitude, _ in _station_coordinates]
lon = [longitude for _, longitude in _station_coordinates]

In [21]:
# Lookup table with one row per station: the station name (taken from df's
# column labels) alongside its latitude and longitude.
additional_info = pd.DataFrame(
    dict(
        station=df.columns,
        latitude=lat,
        longitude=lon,
    )
)

In [34]:
# Latitude of the first row whose station equals `col`. `col` is not defined in
# this cell: it is whatever value an earlier `for col in ...` loop left behind.
additional_info.loc[additional_info.station == col, 'latitude'].values[0]

21.31034

In [37]:
# Disabled variant of the feature table: besides latitude, longitude and
# irradiance it also recorded day of year, hour and minute for each sample.
#ll = []
#for ind, row in train_residual_df.iterrows():
#    for col in train_residual_df.columns:
#        lat = additional_info[(additional_info.station == col)].latitude.values[0]
#        lon = additional_info[(additional_info.station == col)].longitude.values[0]
#        doy = ind.dayofyear
#        hour = ind.hour
#        minute = ind.minute
#        irradiance = row[col]
#        ll.append([lat, lon, doy, hour, minute, irradiance])  

#ms_df = pd.DataFrame(columns=['latitude','longitude','dayofyear', 'hour', 'minute','irradiance'], data=ll)

In [58]:
# Build the Mean Shift input: one (latitude, longitude, irradiance) row for
# every timestamp/station pair, ordered timestamp by timestamp with the
# stations in column order inside each timestamp.
#
# This is vectorised: the previous nested Python loop re-filtered
# `additional_info` for every single cell and also overwrote the global
# `lat`/`lon` lists with scalars, which broke re-running the cell that builds
# `additional_info`.
_station_coords = (
    additional_info
    .drop_duplicates('station')   # first match wins, as with `.values[0]`
    .set_index('station')
    .loc[list(train_residual_df.columns), ['latitude', 'longitude']]
)
_n_timestamps = len(train_residual_df)

ms_df = pd.DataFrame(
    {
        # The per-station coordinates repeat once per timestamp.
        'latitude': np.tile(_station_coords['latitude'].values, _n_timestamps),
        'longitude': np.tile(_station_coords['longitude'].values, _n_timestamps),
        # Row-major flattening yields the same timestamp-then-station order.
        'irradiance': train_residual_df.values.ravel(),
    },
    columns=['latitude', 'longitude', 'irradiance'],
)

In [60]:
# Display the assembled (latitude, longitude, irradiance) table.
ms_df

Unnamed: 0,latitude,longitude,irradiance
0,21.31236,-158.08463,41.524779
1,21.31303,-158.08505,37.241017
2,21.31357,-158.08424,57.049132
3,21.31183,-158.08554,27.679583
4,21.31042,-158.08530,47.497538
5,21.31268,-158.08688,50.920165
6,21.31451,-158.08534,72.683784
7,21.31533,-158.08700,101.006040
8,21.30812,-158.07935,36.740957
9,21.31276,-158.08389,-310.405128


## Mean Shift

Normalização dos dados

In [59]:
from sklearn import preprocessing

# Min-max scale the three feature columns before clustering.
min_max_scaler = preprocessing.MinMaxScaler()

# Plain NumPy view of the feature table (columns: latitude, longitude, irradiance).
x = ms_df.values
x_scaled = min_max_scaler.fit_transform(x)

In [62]:
# Display the estimated bandwidth.
# NOTE(review): `bandwidth` is assigned by the estimate_bandwidth cell that
# appears after this one in the file; on a fresh top-to-bottom run this cell
# raises NameError unless that cell is executed first.
bandwidth

0.29400864428464507

In [61]:
# Estimate the kernel bandwidth from a subsample one tenth the size of the
# feature table, using every available core.
_bandwidth_sample_size = int(len(ms_df) * 0.1)
bandwidth = estimate_bandwidth(
    x_scaled,
    quantile=0.2,
    n_samples=_bandwidth_sample_size,
    n_jobs=-1,
)

In [64]:
# Fit Mean Shift on the scaled features with the estimated bandwidth, using
# every available core.
# NOTE(review): the recorded run of this cell ends in KeyboardInterrupt. If the
# fit is too slow, MeanShift's `bin_seeding=True` option reduces the number of
# seeds, but it changes the clustering result - evaluate before enabling.
_meanshift_options = {'bandwidth': bandwidth, 'n_jobs': -1}
ms = MeanShift(**_meanshift_options)
ms.fit(x_scaled)

Process ForkPoolWorker-4:
Process ForkPoolWorker-3:
Process ForkPoolWorker-1:


KeyboardInterrupt: 

In [55]:
# Read the fitted results off the estimator: the cluster centres and the
# cluster index assigned to each sample.
cluster_centers = ms.cluster_centers_
labels = ms.labels_

# The number of clusters is the number of distinct labels.
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

# NOTE(review): the recorded output reports a single cluster - confirm this is
# expected for the chosen bandwidth.
print("number of estimated clusters : %d" % (n_clusters_,))

number of estimated clusters : 1


In [47]:
# Display the per-sample cluster labels taken from the fitted estimator.
labels

array([0, 0, 0, ..., 0, 0, 0])