# Imports

In [1]:
#%matplotlib inline
%matplotlib notebook
#%matplotlib widget
import ipywidgets as widgets
#para encontrar las coordenadas mas cercanas a un punto
from sklearn.metrics.pairwise import nan_euclidean_distances

import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import image
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cm
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.colors as colors
import matplotlib as mpl
from sklearn.cluster import KMeans
from scipy import stats
import seaborn as sns; sns.set()
import os
import ipynb.fs.defs.my_funcs_clusters as myfunc
# noinspection PyCompatibility
import pathlib
import os

# Funciones

In [2]:
def calculate_vectorprob(row):
    """
    la funcion se aplica fila por fila
    se suman todos los valores por fila y se divide entre cada valor, luego se multiplica por 100
    Compute the probabability of every row.

    usage: mprob = dfMaTra.apply(calculate_vectorprob, axis=1)
            mprob.rename(index=idxclnames,inplace=True)
    :param row: row number
    :return: probabilities matrix
    """
    return row.astype(float)/row.sum()*100

def group_state_values(statenum,df):
    """
Agrupa los valores con datos diferentes a nan de un estado en particular. Los datos se agregan a una lista hasta
que salta a otro estado, con esto se puede saber cuanto tiempo duro en un estado.

    Inputs:
            statenum: numero de estado, empieza en 1
            dfclvp: dataframe con los clusters separados de vv y pw
    Outputs:
            lista_valores_estado: una lista con los valores del estado agrupados
    """
    #va a buscar renglon por renglon si el valor es nan y
    #va a agrupar los valores que no son nan junto con su timestamp
    statenum ='C'+str(statenum)
    idx =  df[statenum].first_valid_index() #primer indice que no tiene valores nan
    lista_valores_estado=[] #contiene los valores agrupados de tiempo vv y pw
    row_values=[]# lista temporal para agrupar los valores de tiempo vv y pw
    idxw=0 #indice exclusivo para el ciclo for
    while not idxw ==  df.index[-1]: #si el indice ya llego a la ultima fila

        for row in df[statenum][idx:].itertuples(): #empieza en el ultimo indice que no tuvo valores
            if not np.isnan(row.vViento):
                row_values.append([row[0],row.vViento,row.Pw])
            #si el valor es nan y row_values tiene elementos significa que
            #ya tomo todos los valores del estado y se brinco a otro
            if np.isnan(row.vViento) and len(row_values)>0:
                idx=row[0]
                lista_valores_estado.append(row_values.copy())
                row_values.clear()
                break
            idxw =row[0]#para saber cuando llega al ultimo indice
    return lista_valores_estado



## de matriz (dataframe) a vector

In [3]:
# def mat_to_vector(df):
#     """
#     #convertir de matriz a vector  (debe ser igual que kmeans_labels)
#     #tomo el dataframe con los estados ya indicados y la convierto a vector ignorando
#     #en este caso ceros
#     :param df:
#     :return:
#     """
#     vect_states= []
#     for row in range(len(df)):
#         clust = np.argwhere(df.iloc[row].notnull().values)[0][0]
#         vect_states.append(clust)
#         #else:
#         #es una fila de ceros, entonces se agrega un cero, así si queda igual
#         #que kmeans labels y los otros codigos para hacer la matriz de transicion
#         #pero quiero esto?, no, no lo quiero por que no quiero que de un estado x
#         #se regrese al estado cero solo porque no hay datos, pero de todos modos lo hago
#         #vect_states.append(0)
#     return vect_states


## Transition matrix

In [4]:
def transition_matrix(transitions):
    """
    stack overflow, mas elegante
    the following code takes a list such as
    [1,1,2,6,8,5,5,7,8,8,1,1,4,5,5,0,0,0,1,1,4,4,5,1,3,3,4,5,4,1,1]
    with states labeled as successive integers starting with 0
    and returns a transition matrix, M,
    where M[i][j] is the probability of transitioning from i to j
    Test:
    t = [1,1,2,6,8,5,5,7,8,8,1,1,4,5,5,0,0,0,1,1,4,4,5,1,3,3,4,5,4,1,1]
    m = transition_matrix(vect_states)
    for row in m: print(' '.join('{0:.4f}'.format(x) for x in row))

    :param transitions: list with transitions
    :return:
    """
    n = 1+ max(transitions) #number of states

    M = [[0]*n for _ in range(n)]

    for (i,j) in zip(transitions,transitions[1:]):
        M[i][j] += 1

    #now convert to probabilities:
    for row in M:
        s = sum(row)
        if s > 0:
            row[:] = [f/s for f in row]
    return M


#  Data processing

In [5]:
import importlib
importlib.reload(myfunc)

<module 'ipynb.fs.defs.my_funcs_clusters' (C:\Users\mungu\Documents\GitHub\aero\my_funcs_clusters.ipynb)>

In [6]:
# ###########################################configuración
# columns_to_use = [0,1,3,6]
# columns_id =['day','hour','wdir','vwind']
# home = str(pathlib.Path.home()) #user directory
# xlsPath = home + '\\Dropbox\\Doctorado\\Documentos\\datos\\datos bcs\\DP-PB01-2005.xlsx'
# xlsPathMfgCurve = home + '\\Dropbox\\Doctorado\\Python\\aero\\Curva de potencia vestas 90.xlsx'
# dir_format= 'deg'
# year = '2005' #data year
# ########################################################
# #imprimir a consola
# os.write(1, b"Inciando procesamiento de datos...\n")
#
# dataVDxls = pd.read_excel(xlsPath,usecols=columns_to_use,dtype={'DIA' : str, 'HORA':str})
# dataVDxls.columns =columns_id
# #agrego la columna de potencia instantanea sin filtrar
# #dataVPxls['Pw']= (dataVPxls.iloc[1:,1].values-dataVPxls.iloc[0:-1,1]) * np.pi*45**2
# print('Total de registros: ' + str(len(dataVDxls)))
# #dfMfgCurve = pd.read_excel(xlsPathMfgCurve,usecols=[0,2],index_col=0,names=['pw'])#cambio esto en la nueva version
# dfMfgCurve = pd.read_excel(xlsPathMfgCurve,usecols=[0,2],index_col=0)
# dfMfgCurve.columns = ['pw']
# #marcando los datos faltantes asignando un nan a la fila completa
# datamk = dataVDxls
# datamk.loc[datamk.isnull().any(axis=1), :] = np.nan
# #numero de filas sin datos
# print('Numero de filas sin datos')
# print(datamk.loc[datamk.isnull().any(axis=1), :].isnull().sum())
#
# #eliminando filas con NaN
# cleanData = datamk.dropna()
#
#
# #datos direccion velocidad
# #print(len(dataVP))
# #print(len(dataDir))
# #dataDV = pd.concat([dataDir,dataVP.vViento],axis=1)
# #dataVD = pd.concat([dataVP.vViento,dataDir,axis=1)
# #dataVcD =pd.concat([dataDir,df_comp_vel],axis=1)
# #change hour 24:00 to 00:00
# dataVDxls['hour']=dataVDxls['hour'].str.replace('2400','0000')
# dataVDxls['timeStamp']= dataVDxls.apply(lambda x: myfunc.daymin2date(year,x.day, x.hour), axis=1)
# dataVDxls.set_index('timeStamp',inplace=True,verify_integrity = True)
# dfWD = dataVDxls.drop(columns=['day','hour'])
# dfWD.index =pd.to_datetime(dfWD.index,dayfirst=True)
# del dataVDxls
# #CHECK THE DIRECTION OF THE ANEMOMETER
# if dir_format == 'rad':
#     direcvrad= np.deg2rad(dfWD['wdir'].values)
#     vecVel = [-np.sin(direcvrad)*dfWD['vwind'].values,np.cos(direcvrad)*dfWD['vwind'].values]
# else:
#     vecVel = [-np.sin(dfWD['wdir'].values * np.pi / 180) * dfWD['vwind'].values,
#               np.cos(dfWD['wdir'].values * np.pi / 180)* dfWD['vwind'].values]
#
#
# vecVelnp = np.array(vecVel).transpose()
# #original sin timestamp
# #df_comp_vel = pd.DataFrame(data=vecVelnp,columns=['vx','vy']
# #con timestamp
# df_comp_vel = pd.DataFrame(data=vecVelnp,columns=['vx','vy'],index=dfWD.index)
#
#
# os.write(1, b"Fin del procesamiento de datos\n")


In [7]:
data_path ='datos/DP-PB01-2005.xlsx'
mf_pow_curve_path ='datos/DeWind d8.2.csv'
df_wind_dir, df_comp_vel,df_mf_curve = myfunc.proc_dat_bcs(data_path,mf_pow_curve_path)

Inciando procesamiento de datos...

Total de registros: 52560
Numero de filas sin datos
day      0.0
hour     0.0
wdir     0.0
vwind    0.0
dtype: float64
Fin del procesamiento de datos


# Matriz de transicion de viento

## Clustering

In [9]:
n_clusters= 15
model_kmeans = KMeans( n_clusters=n_clusters,random_state=0)
model_kmeans.fit(df_comp_vel.values)
kmeans_labels = model_kmeans.labels_
centroids = model_kmeans.cluster_centers_#los centroides siguen siendo 46
n_clusters = np.unique(kmeans_labels).size

#crear un dataframe viento, direccion, cluster
df_wind_dir_cl = df_wind_dir.copy()
df_wind_dir_cl['cluster'] = kmeans_labels




## Ordenar clusters

In [10]:
#orden de aparicion de los clusters kmeans
#ordenar el orden de aparicion segun la magnitud de la vv
clmagni = np.zeros(n_clusters)
for i in range(n_clusters):
    vx = df_comp_vel.vx.values[kmeans_labels==i]
    vy = df_comp_vel.vy.values[kmeans_labels==i]
    clmagni[i]= np.round(np.mean(np.sqrt(vx**2 + vy**2)),1) #magnitud de la vv

clord = clmagni.argsort()

columnas=[]
for i in np.arange(1,n_clusters+1):
    columnas.append('C'+str(i))

dfclvv = pd.DataFrame()
for i in range(n_clusters):
    dfclvv = pd.concat([dfclvv,df_wind_dir.vwind[kmeans_labels==clord[i]]], ignore_index=True, axis=1)
dfclvv.columns=columnas[0:n_clusters]
dfclvv

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,C15
2005-01-01 00:30:00,0.893,,,,,,,,,,,,,,
2005-01-01 00:40:00,1.016,,,,,,,,,,,,,,
2005-01-01 00:50:00,1.092,,,,,,,,,,,,,,
2005-01-01 01:20:00,0.333,,,,,,,,,,,,,,
2005-01-01 01:30:00,0.032,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2005-12-16 17:30:00,,,,,,,,,,,,,,,9.80
2005-12-16 17:40:00,,,,,,,,,,,,,,,10.95
2005-12-16 17:50:00,,,,,,,,,,,,,,,9.84
2005-12-16 18:10:00,,,,,,,,,,,,,,,9.62


In [11]:
#probabilidades de cada estado
nregcl=[]
for col in dfclvv.columns:
    nregcl.append(len(dfclvv[dfclvv[col].notna()]))
ntotreg= len(df_wind_dir)
np.array(nregcl)/ntotreg *100


array([24.53433284,  9.57209993,  6.78095093,  4.0430754 ,  9.17825682,
        2.57805514,  0.88091478,  9.08312563,  3.23446032,  6.44799178,
        3.45516467,  8.69499039,  3.4171122 ,  6.09790902,  2.00156015])

In [12]:
#magnitud de los centroides
for i in range(n_clusters):
    print( str(i)+' - '+ str(dfclvv['C'+str(i+1)].mean()))

0 - 0.46604831329972857
1 - 2.1559827072152653
2 - 2.498785914702581
3 - 3.0380216470588235
4 - 4.316563432835821
5 - 4.434022878228782
6 - 6.2067840172786175
7 - 6.388041684122329
8 - 6.993161764705882
9 - 7.607657125995869
10 - 7.6598573788546265
11 - 8.933557986870897
12 - 10.34109131403118
13 - 11.167419656786272
14 - 11.255332699619771


In [13]:
#resultado dudoso
for i in range(n_clusters):
    vx = df_comp_vel.vx.values[kmeans_labels==i]
    vy = df_comp_vel.vy.values[kmeans_labels==i]
    print(np.round(np.mean(np.sqrt(vx**2 + vy**2)),1) )#magnitud de la vv

2.2
8.9
3.0
4.3
10.3
7.0
6.4
0.5
11.2
4.4
11.3
7.6
7.7
6.2
2.5


## Matriz de velocidades de viento

In [14]:
# Matriz sin ordenar
columnas=[]
for i in np.arange(1,n_clusters+1):
    columnas.append('C'+str(i))


dfclvv_sin_ord = pd.DataFrame()
dfclvv_sin_ord.index= df_wind_dir.index
for i in range(n_clusters):
    dfclvv_sin_ord = pd.concat([dfclvv_sin_ord,df_wind_dir.vwind[kmeans_labels==i]], ignore_index=True, axis=1)
dfclvv_sin_ord.columns=columnas[0:n_clusters]


#ordenar
#solo voy a cambiar el orden de las etiquetas
#ordenando etiquetas por valores de centroides de mayor a menor
columnas_ord=[x for _, x in sorted(zip(clord, columnas))]
dfclvv_ord = dfclvv_sin_ord.copy()
dfclvv_ord.columns = columnas_ord
#ordenando nombres de columnas
dfclvv_ord =dfclvv_ord[columnas]

In [15]:
dfclvv_ord

Unnamed: 0_level_0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,C15
timeStamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2005-01-01 00:10:00,,1.898,,,,,,,,,,,,,
2005-01-01 00:20:00,,1.759,,,,,,,,,,,,,
2005-01-01 00:30:00,0.893,,,,,,,,,,,,,,
2005-01-01 00:40:00,1.016,,,,,,,,,,,,,,
2005-01-01 00:50:00,1.092,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2005-12-31 23:10:00,,1.535,,,,,,,,,,,,,
2005-12-31 23:20:00,,,,2.085,,,,,,,,,,,
2005-12-31 23:30:00,1.072,,,,,,,,,,,,,,
2005-12-31 23:40:00,1.254,,,,,,,,,,,,,,


## Cambiar de valor de velocidad de viento a unos

In [16]:
#si quiero cambiar los NaN por ceros, pero hay algunos ceros en los datos
# mejor dejo los NaN
#dfclvv_sin_ord.fillna(0,inplace=True)
#dfclvv_sin_ord[dfclvv_sin_ord!=0]=1
#dfclvv_sin_ord.head(20)

## Step plot

In [17]:
#convetir matriz de estados a vector de estados (o transiciones)
#este vector debe ser igual a kmeans_labels  cuando no se ha modificado el df
vect_states = myfunc.mat_to_vector(dfclvv_sin_ord)


In [18]:
data = df_wind_dir.loc[df_wind_dir.index.month ==1]
plt.figure(figsize=(20,10))
plt.step(data.index,model_kmeans.labels_[:len(data)],where='post')
plt.show()


<IPython.core.display.Javascript object>

In [19]:
suma = []
for i in range(n_clusters):
    suma.append(len(dfclvv_ord[dfclvv_ord['C' + str(i+ 1)].notnull()]['C'+str(i+1)]) /len(dfclvv_sin_ord))
    print(str(i) + '-'+str(suma[i]*100))

0-24.534332844993244
1-9.572099925797675
2-6.780950931334311
3-4.043075400977949
4-9.17825681614947
5-2.5780551380353507
6-0.880914781483666
7-9.083125630244107
8-3.2344603207823592
9-6.447991780665538
10-3.455164672082802
11-8.694990391750224
12-3.417112197720656
13-6.0979090165338
14-2.001560151448848


## Matriz de transicion


In [20]:
# m = transition_matrix(a)
# m= np.multiply(m,100)
# for row in m: print(' '.join('{0:.4f}'.format(x) for x in row))


In [21]:
# #SOLO FUNCIONA PARA FILAS PERO NO PARA COLUMNAS
# plt.figure(figsize=(20,18))
#
# ht =sns.heatmap(m, annot=True,fmt='.2f')
# figure = ht.get_figure()
# figure.savefig('ht_prob_internet.png', dpi=400)

In [22]:
# for i in range(len(centroids)):
#     # print(str(i) + ' - ' +str(np.sqrt(centroids[i][0]**2 + centroids[i][1]**2)))

In [23]:

mat_trans_prob= myfunc.trans_matrix_from_df(dfclvv_ord,mat_mode='prob')
mat_trans_prob

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,C15
C1,77.913061,9.881264,7.325418,3.622459,0.281747,0.794929,0.04025,0.030187,0.04025,0.010062,0.04025,0.020125,0.0,0.0,0.0
C2,21.327768,59.749553,4.432518,0.894454,10.216657,1.649771,0.596303,0.73544,0.019877,0.218644,0.039754,0.099384,0.0,0.019877,0.0
C3,19.668911,6.67789,62.738496,6.257015,0.617284,0.364759,1.71156,0.16835,1.599327,0.028058,0.0,0.140292,0.0,0.0,0.028058
C4,16.329412,2.164706,11.058824,58.588235,0.752941,2.964706,0.094118,0.235294,7.435294,0.188235,0.094118,0.047059,0.0,0.047059,0.0
C5,0.891376,12.189055,0.331675,0.103648,67.578773,2.860697,0.103648,11.567164,0.02073,3.109453,0.787728,0.393864,0.02073,0.041459,0.0
C6,6.199262,5.01845,0.516605,5.97786,7.675277,59.335793,0.0,0.811808,0.295203,0.95941,10.332103,0.369004,2.287823,0.221402,0.0
C7,0.431965,7.12743,11.231102,0.0,1.079914,0.215983,70.62635,5.615551,0.647948,0.647948,0.0,1.943844,0.0,0.431965,0.0
C8,0.041894,0.52367,0.020947,0.0,14.327608,0.125681,0.460829,68.098031,0.0,9.551739,0.209468,6.388773,0.020947,0.230415,0.0
C9,0.117647,0.235294,2.941176,9.411765,0.117647,0.117647,0.117647,0.058824,80.176471,0.0,0.0,0.117647,0.058824,0.0,6.529412
C10,0.0,0.0,0.0,0.0,4.131012,0.206551,0.088522,13.63234,0.0,62.791384,3.068752,13.455297,0.147536,2.478607,0.0


In [24]:

#SOLO FUNCIONA PARA FILAS PERO NO PARA COLUMNAS
plt.figure(figsize=(20,18))

ht =sns.heatmap(mat_trans_prob, annot=True,fmt='.2f')
figure = ht.get_figure()
plt.show()
figure.savefig('ht_prob_mio.png', dpi=400)


<IPython.core.display.Javascript object>

## Cadena de Markov

### Todos los estados

In [25]:

# Plot de direcciones de viento
################################################

#valores menor a 10%
#c = mat_trans_prob.mask(mat_trans_prob<10)

#c.fillna(0,inplace=True)
#c = np.round(c,4)
#c = c[['C8','C12','C1ddd4','C15']]
#revisar por que el .copy() es necesario, si no lo pongo afecta a la variable original
myfunc.create_mc_plot(20,mat_trans_prob.copy(), as_pct=True, replace_zero_to_nan=False)


FileNotFoundError: [Errno 2] No such file or directory: 'graph_mc.jpg'

### Estados con probabilidad mayor a 10%

In [26]:
#elinminar valores menores a 10%
matprob10p = mat_trans_prob.mask(mat_trans_prob<10)
#conviertiendo nan a cero
#matprob10p.fillna(0,inplace=True)
#matprob10p = np.round(matprob10p,4)

myfunc.create_mc_plot(21,matprob10p.copy(), as_pct=True, replace_zero_to_nan=True, layout='circle');


FileNotFoundError: [Errno 2] No such file or directory: 'graph_mc.jpg'

# Combinar clusters

In [None]:
n_clusters = 15
clusters_wind = myfunc.KMData()
clusters_wind.dataframe_to_cluster(df_comp_vel, n_clusters=n_clusters, clusters_data='wind')
#crear un dataframe viento, direccion, cluster
df_wind_dir_cl = df_wind_dir.copy()
#tomando las labels de kmeans y sumandole uno por que empieza en 0
#ademas agregando la letra C y ordenando de menor v a mayor vel
col_clnames = ['C' + str(clusters_wind.cl_ord[c]+1) for c in clusters_wind.kmeans_labels]
df_wind_dir_cl['cluster'] = col_clnames
del col_clnames

## Magnitud de viento

In [28]:
#agrupando los datos de viento en clusters
dfclvv = myfunc.create_clustered_data(df_wind_dir,clusters_wind.kmeans_labels)
dfclvv

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,C15
2005-01-01 00:10:00,,1.898,,,,,,,,,,,,,
2005-01-01 00:20:00,,1.759,,,,,,,,,,,,,
2005-01-01 00:30:00,0.893,,,,,,,,,,,,,,
2005-01-01 00:40:00,1.016,,,,,,,,,,,,,,
2005-01-01 00:50:00,1.092,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2005-12-31 23:10:00,,1.535,,,,,,,,,,,,,
2005-12-31 23:20:00,,,,2.085,,,,,,,,,,,
2005-12-31 23:30:00,1.072,,,,,,,,,,,,,,
2005-12-31 23:40:00,1.254,,,,,,,,,,,,,,


In [29]:
#combinando y eliminado clusters
col_to_join = [['C1','C2','C3','C4','C5','C6',],['C8','C9'],['C10','C11'],['C12','C13'],['C14','C15']]
col_to_del = [['C7']]
dfclvv_joined = myfunc.join_clusters(dfclvv.copy(),col_to_join,col_to_del)
dfclvv_joined

Unnamed: 0,C1,C8,C10,C12,C14
2005-01-01 00:10:00,1.898,,,,
2005-01-01 00:20:00,1.759,,,,
2005-01-01 00:30:00,0.893,,,,
2005-01-01 00:40:00,1.016,,,,
2005-01-01 00:50:00,1.092,,,,
...,...,...,...,...,...
2005-12-31 23:10:00,1.535,,,,
2005-12-31 23:20:00,2.085,,,,
2005-12-31 23:30:00,1.072,,,,
2005-12-31 23:40:00,1.254,,,,


### Markov

In [30]:
#########################################################
mat_trans_prob_joined= myfunc.trans_matrix_from_df(dfclvv_joined,mat_mode='prob')

#SOLO FUNCIONA PARA FILAS PERO NO PARA COLUMNAS
plt.figure(figsize=(10,10))

ht =sns.heatmap(mat_trans_prob_joined, annot=True,fmt='.2f')
plt.show()
figure = ht.get_figure()

figure.savefig('ht_prob_joined.png', dpi=400)


<IPython.core.display.Javascript object>

In [31]:
myfunc.create_mc_plot(100,mat_trans_prob_joined.copy(), filename='graph_mc_joined.jpg', as_pct=False,
                      replace_zero_to_nan=False);


<IPython.core.display.Javascript object>

## Componentes de veocidad

### Clusters unidos manualmente

In [32]:

# col_to_join =['C1','C2','C3','C4','C5','C6',]
# df_comp_vel_joined = join_clusters(clusters_wind.comp_vel,col_to_join)
# col_to_join =['C8','C9']
# df_comp_vel_joined = join_clusters(df_comp_vel_joined,col_to_join)
# col_to_join =['C10','C11']
# df_comp_vel_joined =  join_clusters(df_comp_vel_joined,col_to_join)
# col_to_join =['C12','C13']
# df_comp_vel_joined = join_clusters(df_comp_vel_joined,col_to_join)
# col_to_join =['C14','C15']
# df_comp_vel_joined = join_clusters(df_comp_vel_joined,col_to_join)
# df_comp_vel_joined.drop('C7',axis=1,level=0,inplace=True)
# #si no hago esto no se quita el C7 del indice y ocurre un error al querer plotear C7 que no existe
# df_comp_vel_joined.columns =df_comp_vel_joined.columns.remove_unused_levels()
col_to_join = [['C1','C2','C3','C4','C5','C6',],['C8','C9'],['C10','C11'],['C12','C13'],['C14','C15']]
col_to_del = [['C7']]
dfclvv_joined = myfunc.join_clusters(dfclvv.copy(),col_to_join,col_to_del)

df_comp_vel_joined = myfunc.join_clusters(clusters_wind.comp_vel.copy(),col_to_join,col_to_del)




In [33]:
dfclvv_joined

Unnamed: 0,C1,C8,C10,C12,C14
2005-01-01 00:10:00,1.898,,,,
2005-01-01 00:20:00,1.759,,,,
2005-01-01 00:30:00,0.893,,,,
2005-01-01 00:40:00,1.016,,,,
2005-01-01 00:50:00,1.092,,,,
...,...,...,...,...,...
2005-12-31 23:10:00,1.535,,,,
2005-12-31 23:20:00,2.085,,,,
2005-12-31 23:30:00,1.072,,,,
2005-12-31 23:40:00,1.254,,,,


In [34]:
clusters_wind.comp_vel

Unnamed: 0_level_0,C1,C1,C2,C2,C3,C3,C4,C4,C5,C5,...,C11,C11,C12,C12,C13,C13,C14,C14,C15,C15
Unnamed: 0_level_1,vx,vy,vx,vy,vx,vy,vx,vy,vx,vy,...,vx,vy,vx,vy,vx,vy,vx,vy,vx,vy
2005-01-01 00:10:00,,,-1.635036,0.963879,,,,,,,...,,,,,,,,,,
2005-01-01 00:20:00,,,-1.523646,0.878968,,,,,,,...,,,,,,,,,,
2005-01-01 00:30:00,-0.773516,0.446230,,,,,,,,,...,,,,,,,,,,
2005-01-01 00:40:00,-0.944979,0.373190,,,,,,,,,...,,,,,,,,,,
2005-01-01 00:50:00,1.091120,0.043824,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2005-12-31 23:10:00,,,-1.273172,0.857472,,,,,,,...,,,,,,,,,,
2005-12-31 23:20:00,,,,,,,1.168932,-1.726506,,,...,,,,,,,,,,
2005-12-31 23:30:00,0.733834,-0.781454,,,,,,,,,...,,,,,,,,,,
2005-12-31 23:40:00,0.435057,-1.176113,,,,,,,,,...,,,,,,,,,,


In [35]:

#calcular los nuevos centroides
cent_teoricos=[]
#si uso numpy o solo get level values ordena los clusters
for cl in pd.unique(df_comp_vel_joined.columns.get_level_values(0)):
    cent_teoricos.append(df_comp_vel_joined[cl].mean().values)
#buscar centroides en los puntos existentes en los datos
idx = []
#agrupa los datos vx y vy y los suma, lo que resulta en solo dos columnas vx y vy
vx_vy_group = df_comp_vel_joined.groupby(axis=1,level=1).sum()
for cent in cent_teoricos:
    idx.append( np.nanargmin(nan_euclidean_distances(vx_vy_group,[cent])))


# buscar los valores vx,vy y timeStamp de los centroides en puntos reales
lcent = []
lcent_cl = []
for i in idx:
    lcent.append(vx_vy_group.iloc[i])
    #añadir el cluster al que pertenece el centroide
    lcent_cl.append(df_comp_vel_joined.iloc[i].dropna().index[0][0])
#crear dataframe idx_centroids
idx_centroids = pd.DataFrame(lcent)
idx_centroids['ord_nat'] = lcent_cl
idx_centroids.index.name = 'PCTimeStamp'
idx_centroids
#ordenar clusters por magnitud de viento
mag_vv = []
for row in lcent:
    mag_vv.append(np.sqrt(row['vx']**2 + row['vy']**2))
#son solo numeros de los indices
orden_menmay=np.argsort(mag_vv)
#son los nombres de los clusters obtenidos del dataframe
#nuevo_orden=  list(df_comp_vel_joined.columns.levels[0])[orden_menmay]
#actualizando orden de las columnas
#df_comp_vel_joined.columns =pd.MultiIndex.from_product([nuevo_orden,['vx','vy']])

In [36]:
#calcular los nuevos centroides
#cent_teoricos=[]
idx = []
#si uso numpy o solo get level values ordena los clusters
for cl in pd.unique(df_comp_vel_joined.columns.get_level_values(0)):
    cent_teorico = df_comp_vel_joined[cl].mean().values
    idx.append( np.nanargmin(nan_euclidean_distances(
        df_comp_vel_joined[cl],[cent_teorico])))

#buscar centroides en los puntos existentes en los datos

#agrupa los datos vx y vy y los suma, lo que resulta en solo dos columnas vx y vy
# vx_vy_group = df_comp_vel_joined.groupby(axis=1,level=1).sum()
# for cent in cent_teoricos:
#     idx.append( np.nanargmin(nan_euclidean_distances(vx_vy_group,[cent])))


# buscar los valores vx,vy y timeStamp de los centroides en puntos reales
lcent = []
lcent_cl = []
for i in idx:
    lcent.append(vx_vy_group.iloc[i])
    #añadir el cluster al que pertenece el centroide
    lcent_cl.append(df_comp_vel_joined.iloc[i].dropna().index[0][0])
#crear dataframe idx_centroids
idx_centroids = pd.DataFrame(lcent)
idx_centroids['ord_nat'] = lcent_cl
idx_centroids.index.name = 'PCTimeStamp'
idx_centroids
#ordenar clusters por magnitud de viento
mag_vv = []
for row in lcent:
    mag_vv.append(np.sqrt(row['vx']**2 + row['vy']**2))
#son solo numeros de los indices
orden_menmay=np.argsort(mag_vv)
#son los nombres de los clusters obtenidos del dataframe
#nuevo_orden=  list(df_comp_vel_joined.columns.levels[0])[orden_menmay]
#actualizando orden de las columnas
#df_comp_vel_joined.columns =pd.MultiIndex.from_product([nuevo_orden,['vx','vy']])

In [37]:
#######################################
clsclord=('wind',None)
ploti = myfunc.PlotSubClusterInt()
ploti.create_plot(
    df_comp_vel_joined,
    figsize=(20, 10),
    idx_centroids=clusters_wind.idx_centroids.iloc[[0,7,9,11,13]],
    fign='Figure '+ str(plt.gcf().number+1),
    save_folder='figures_tests/',
    filename = 'plot_clusters_joined',
    showlBetz=False,
    showCent= True,
    showOpt= 'Numero',
    dfMfgCurve=df_mf_curve
)

HBox(children=(VBox(children=(Button(description='Sel. todo', style=ButtonStyle()), Button(description='Des. t…

VBox(children=(Text(value='', placeholder='Log'),))

<IPython.core.display.Javascript object>

### 5 clusters automaticos

In [38]:
n_clusters = 5
clusters_wind_5 = myfunc.KMData()
clusters_wind_5.dataframe_to_cluster(df_comp_vel, n_clusters=n_clusters, clusters_data='wind')


In [39]:

clsclord=('wind',None)
ploti_5c = myfunc.PlotSubClusterInt()
ploti_5c.create_plot(
    clusters_wind_5.comp_vel,
    figsize=(20, 10),
    idx_centroids=clusters_wind_5.idx_centroids,
    fign='Figure '+ str(plt.gcf().number+1),
    save_folder = 'figures_tests/',
    filename = 'plot5clusters',
    showlBetz=False,
    showCent= True,
    showOpt= 'Numero',
    dfMfgCurve=df_mf_curve
)

HBox(children=(VBox(children=(Button(description='Sel. todo', style=ButtonStyle()), Button(description='Des. t…

VBox(children=(Text(value='', placeholder='Log'),))

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Tiempo total que pasa en estados de interes

In [40]:
#direcciones promedio
for col in dfclvv_sin_ord.columns:
    print(col+ ' - ' +str(df_wind_dir.wdir[dfclvv_sin_ord[col] !=0 ].mean()))

C1 - 101.07347715900211
C2 - 101.07347715900211
C3 - 101.07347715900211
C4 - 101.07347715900211
C5 - 101.07347715900211
C6 - 101.07347715900211
C7 - 101.07347715900211
C8 - 104.784449710703
C9 - 101.07347715900211
C10 - 101.07347715900211
C11 - 101.07347715900211
C12 - 101.07347715900211
C13 - 101.07347715900211
C14 - 101.07347715900211
C15 - 101.07347715900211


In [41]:
#obtener estados entre dos velocidades de viento
#primero obtengo las velocidades de viento y de ahi los estados
#por ejemplo seleccionar los estados que ocurren entre las velocidades 6 y 11
#y ver como brincan entre estados y cuanto duran estas transiciones antes
# de que brinquen a estados que no son de interes
betw = df_wind_dir[df_wind_dir.vwind.between(6,11)]
betw.index = pd.to_datetime(pd.to_datetime(betw.index).strftime('%d/%m/%Y %H:%M'))

In [42]:
dfclvv_sin_ord.index = pd.to_datetime(pd.to_datetime(dfclvv_sin_ord.index).strftime('%d/%m/%Y %H:%M'))


In [43]:
#buscar cuanto duran los estados
#fila por fila
#contador de tiempos de 10 minutos para tener lo hora total
t10_count = 10
#lista con todos los tiempos totales
list_total_times=[]

df = betw
for row in range(len(df)-1):
    #obtener el estado de del registro
    estado = dfclvv_sin_ord.loc[df.index[row]].to_numpy().reshape(-1).nonzero()[0][0]+1
    #comprobar que la diferencia entre una fila y la siguiente es de diez minutos
    if df.index[row+1] - df.index[row]  == pd.Timedelta(minutes=10):
        # dato de la fila
        t10_count+=10 #sumar 10 minutos
        register = df.iloc[row]
        print('%s | %0.2f m/s | %0.1f°| C%i'%
              (register.name,register.vwind,register.wdir, estado))
    else:
        register = df.iloc[row]
        print('%s | %0.2f m/s | %0.1f°| C%i'%
              (register.name,register.vwind,register.wdir,estado))
        list_total_times.append(pd.to_datetime(t10_count,unit='m').strftime('%H:%M'))
        print('Total time: %s hr(s)' % list_total_times[-1] )
        print('########################################')
        t10_count = 10


2005-01-01 14:50:00 | 6.35 m/s | 92.5°| C1
Total time: 00:10 hr(s)
########################################
2005-01-01 15:50:00 | 6.56 m/s | 91.0°| C1
2005-01-01 16:00:00 | 7.67 m/s | 96.9°| C1
2005-01-01 16:10:00 | 8.15 m/s | 101.4°| C1
2005-01-01 16:20:00 | 8.57 m/s | 102.8°| C1
2005-01-01 16:30:00 | 8.71 m/s | 97.5°| C1
2005-01-01 16:40:00 | 7.39 m/s | 97.5°| C1
2005-01-01 16:50:00 | 7.32 m/s | 93.7°| C1
2005-01-01 17:00:00 | 7.57 m/s | 95.8°| C1
2005-01-01 17:10:00 | 7.67 m/s | 102.0°| C1
2005-01-01 17:20:00 | 7.05 m/s | 103.4°| C1
Total time: 01:40 hr(s)
########################################
2005-02-01 12:30:00 | 6.50 m/s | 88.7°| C1
2005-02-01 12:40:00 | 8.06 m/s | 95.9°| C1
2005-02-01 12:50:00 | 8.83 m/s | 94.5°| C1
2005-02-01 13:00:00 | 8.96 m/s | 102.1°| C1
2005-02-01 13:10:00 | 8.88 m/s | 101.8°| C1
2005-02-01 13:20:00 | 8.14 m/s | 105.9°| C1
2005-02-01 13:30:00 | 8.98 m/s | 103.2°| C1
2005-02-01 13:40:00 | 8.98 m/s | 103.2°| C1
2005-02-01 13:50:00 | 8.21 m/s | 100.8°| C1


In [44]:
#buscar cuanto duran los estados
#fila por fila
#contador de tiempos de 10 minutos para tener lo hora total
t10_count = 10
#lista con todos los tiempos totales
list_total_times=[]

df = df_wind_dir
#indica a que grupo pertenece el dato
#con el codigo de abajo, los estados se dividen en grupos
#pero cronologicamente, para luego identificar a que grupo
#pertenece cierto registro le añado un indice
#por lo que se tienen datos agrupados por clusters de magnitudes de voltaje
#y por clusters de tiempos consecutivos de ocurrencia
grouped_states_index = 1

for row in range(len(df)):
    #obtener el estado de del registro
    #reshape porque a veces me da un array de 1xn y a veces de nx1
    estado_ini = dfclvv_sin_ord.loc[df.index[row]].to_numpy().reshape(-1).nonzero()[0][0]+1
    estado_sig = dfclvv_sin_ord.loc[df.index[row+1]].to_numpy().reshape(-1).nonzero()[0][0]+1
    #comprobar que la diferencia entre una fila y la siguiente es de diez minutos
    if estado_ini == estado_sig:
        # dato de la fila
        t10_count+=10 #sumar 10 minutos
        register = df.iloc[row]
        print('%s | %0.2f m/s | %0.1f°| C%i |gp_idx: %i |'%
              (register.name,register.vwind,register.wdir, estado_sig,grouped_states_index))
    else:
        register = df.iloc[row]
        print('%s | %0.2f m/s | %0.1f°| C%i |gp_idx: %i |'%
              (register.name,register.vwind,register.wdir,estado_ini,grouped_states_index))
        list_total_times.append(pd.to_datetime(t10_count,unit='m').strftime('%H:%M'))
        print('Total time: %s hr(s)' % list_total_times[-1] )
        print('########################################')
        t10_count = 10
        grouped_states_index+=1 #nuevo grupo, nuevo indice



2005-01-01 00:10:00 | 1.90 m/s | 59.5°| C1 |gp_idx: 1 |
2005-01-01 00:20:00 | 1.76 m/s | 60.0°| C1 |gp_idx: 1 |
2005-01-01 00:30:00 | 0.89 m/s | 60.0°| C1 |gp_idx: 1 |
2005-01-01 00:40:00 | 1.02 m/s | 68.5°| C1 |gp_idx: 1 |
2005-01-01 00:50:00 | 1.09 m/s | 272.3°| C1 |gp_idx: 1 |
2005-01-01 01:00:00 | 1.25 m/s | 283.8°| C1 |gp_idx: 1 |
2005-01-01 01:10:00 | 1.35 m/s | 272.5°| C1 |gp_idx: 1 |
2005-01-01 01:20:00 | 0.33 m/s | 268.3°| C1 |gp_idx: 1 |
2005-01-01 01:30:00 | 0.03 m/s | 289.7°| C1 |gp_idx: 1 |
2005-01-01 01:40:00 | 0.65 m/s | 296.3°| C1 |gp_idx: 1 |
2005-01-01 01:50:00 | 1.12 m/s | 297.7°| C1 |gp_idx: 1 |
2005-01-01 02:00:00 | 1.54 m/s | 0.0°| C1 |gp_idx: 1 |
2005-01-01 02:10:00 | 1.28 m/s | 322.2°| C1 |gp_idx: 1 |
2005-01-01 02:20:00 | 2.63 m/s | 321.1°| C1 |gp_idx: 1 |
2005-01-01 02:30:00 | 2.67 m/s | 317.5°| C1 |gp_idx: 1 |
2005-01-01 02:40:00 | 1.24 m/s | 1.2°| C1 |gp_idx: 1 |
2005-01-01 02:50:00 | 0.52 m/s | 70.9°| C1 |gp_idx: 1 |
2005-01-01 03:00:00 | 1.52 m/s | 340.0°|

IndexError: index 52559 is out of bounds for axis 0 with size 52559

In [None]:
df_wind_dir_cl

In [None]:
row = 0
dfclvv_sin_ord.loc[df.index[row]].to_numpy().reshape(-1).nonzero()

In [None]:
type(dfclvv_sin_ord.loc[df.index[row]].to_numpy())

In [None]:
dfclvv_sin_ord

In [None]:
#este dataframe contendrá en orden cronologíco los datos con una fila extra indicando
#el cluster al que pertenece el dato. Se puede hacer manual o pandificado
#aqui lo voy a hacer pandificado
df_data_with_states = df_wind_dir.copy()
dfclvv_sin_ord.loc['2005-01-01 00:50:00'].to_numpy().nonzero()[0][0]+1

## Buscar transiciones

In [None]:
vel_prom_estados = [8.2,10.7, 12.4]

In [None]:
# 1.- encontrar el primer dato no NAN del estado (df ordenado cronologicamente)
# 2.- encontrar el siguiente dato NAN. El estado se encuentra entre estos dos no NAN y NAN
#

dfclvv_sin_ord.C1.first_valid_index()

In [None]:
dfclvv_sin_ord.C1.isnull()

In [None]:
# #va a buscar renglon por renglon si el valor es nan y
# #va a agrupar los valores que no son nan junto con su timestamp
# num_estado = 'C17'
# dfclvv_sin_ord[num_estado].first_valid_index() #primer valor que no es nan
# lista=[]
# C=[]
# flag= True
# for i in dfclvv_sin_ord[num_estado].itertuples():
#     if not np.isnan(i[1]):
#         C.append([i[0],i.vViento,i.Pw])
#         flag = False
#
#
#     if not flag:
#         lista.append(C)
#         C.clear()
#         flag_nonan=True

In [None]:
# c17agrupado = group_state_values(17,dfclvv_sin_ord)

In [None]:
# listaTiempos=[] #cuenta el numero de elementos que tiene cada lista
# #es decir si la lista contiene 4 elementos, quiere decir que el estado
# #estuvo sin cambios 40 minutos
# for i in c17agrupado:
#     listaTiempos.append(len(i))


In [None]:
# #moda de los tiempos por cada unidad son 10 minutos
# stats.mode( listaTiempos)

In [None]:
# #promedio de los tiempos de estadia, 1.95 es aprox 2 o 20 min
# np.mean(listaTiempos)

In [None]:
# tempdf= dfclvv_sin_ord.C17.copy()
# tempdf.dropna(inplace=True)
# tempdf.reset_index(inplace=True)
# tempdf.rename(columns={'index':'timeStamp'},inplace=True)

In [None]:
# #solo conservar las horas
# l=tempdf['timeStamp'].dt.time.values
# #calcular la moda, es decir, a que hora es más probable el estado
# stats.mode(l)

# Markov chain

In [None]:
#falta ordenar los nombres de columnas
col_to_join =['C1','C2','C3','C4','C5','C6',]
b = clusters_wind.comp_vel

c = b[col_to_join].groupby(level=1,axis=1).sum(min_count=1)
c.columns = pd.MultiIndex.from_product([['C_'],['vx','vy']])
b = b.join(c)
b.drop(col_to_join,level=0,axis=1,inplace=True)
b.rename({'C_':col_to_join[0]},axis=1,inplace=True)
