In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import skfda

In [None]:
import warnings

# NOTE(review): this silences every warning for the rest of the session,
# including pandas' chained-assignment warnings; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Load the raw readings; REF_DATE is parsed as a datetime column.
big_data = pd.read_csv('data_csvs/initdata.csv', sep=';', parse_dates=['REF_DATE'])

# Replace the AP identifiers by their integer category codes.
big_data["AP"] = big_data["AP"].astype("category").cat.codes

# First / last observation date and covered time span of every AP.
ref_dates_by_ap = big_data.groupby('AP')['REF_DATE']
big_data['max'] = ref_dates_by_ap.transform('max')
big_data['min'] = ref_dates_by_ap.transform('min')
big_data['range'] = big_data['max'] - big_data['min']

# Keep only the APs whose span equals the longest span in the data.
# .copy() makes `non_zero` an independent frame, so the column assignments
# below never write into a slice of `big_data`.
non_zero = big_data[big_data['range'] == big_data['range'].max()].copy()
non_zero['max'] = pd.to_datetime('2021-09-30')  # the last two months are with zero obs for every customer
non_zero['range'] = non_zero['max'] - non_zero['min']
non_zero['ENERGY_MW'] = non_zero['ENERGY'] / 1000

# Index by date so the frame can be resampled in time.
non_zero = non_zero.set_index('REF_DATE')

In [None]:
# Weekly energy totals per AP.
# The per-AP frames are collected in a list and concatenated once: growing a
# DataFrame with pd.concat inside the loop copies all previous rows on every
# iteration. groupby(sort=False) visits the APs in order of first appearance
# (the same order as `non_zero['AP'].unique()`) without re-filtering the whole
# frame for every AP.
weekly_frames = []
for ap, ap_rows in tqdm(non_zero.groupby('AP', sort=False)):
    weekly = ap_rows.resample('W')['ENERGY_MW'].sum().to_frame()
    weekly['AP'] = ap
    weekly_frames.append(weekly)

if weekly_frames:
    df_big = pd.concat(weekly_frames)
else:
    # No APs at all: keep an empty frame with the expected columns.
    df_big = pd.DataFrame(columns=['ENERGY_MW', 'AP'])

# Keep the week timestamps as a regular column as well.
df_big['datetime'] = df_big.index

In [None]:
# Mean weekly energy of every AP, broadcast to each of its rows.
df_big['meanvalue'] = df_big.groupby('AP')['ENERGY_MW'].transform('mean')

# APs whose mean is exactly zero, as a unique index of AP codes.
by_customer = df_big.set_index('AP')
zero_mean_rows = by_customer['meanvalue'] == 0
every_where_zero = by_customer[zero_mean_rows].index.unique()

Drop customers whose mean value is 0, i.e. their consumption is zero everywhere.

In [None]:
# Remove the all-zero APs (dropped by index label), then restore 'AP' as a
# column and index the frame by the week timestamp.
df_big = (
    df_big
    .set_index('AP')
    .drop(every_where_zero)
    .assign(
        AP=lambda frame: frame.index,
        datetime=lambda frame: pd.to_datetime(frame['datetime']),
    )
    .set_index('datetime')
)

function for scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

def scale_data(df, column):
    """Min-max scale ``column`` separately for every customer in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``"AP"`` column (customer id) and the column named
        by ``column``.
    column : str
        Name of the numeric column to scale.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` plus a ``"scaled"`` column, sorted by index.
        Rows sharing an index value keep the order in which their customers
        first appear in ``df``. ``df`` itself is not modified. An empty
        frame is returned when ``df`` contains no customers.
    """
    scaled_parts = []
    for customer in tqdm(df["AP"].unique()):
        # .copy() so that adding the column never writes into a slice of ``df``.
        customer_df = df[df["AP"] == customer].copy()
        # A fresh scaler per customer: every series is scaled by its own min/max.
        scaler = MinMaxScaler()
        customer_df["scaled"] = scaler.fit_transform(customer_df[[column]])
        scaled_parts.append(customer_df)

    if not scaled_parts:
        return pd.DataFrame()

    # Concatenate and sort once instead of on every iteration; the stable
    # sort makes the order of rows with equal index values deterministic.
    return pd.concat(scaled_parts, axis=0).sort_index(kind="mergesort")

functions for clustering

In [None]:
def moving_average(x, w):
    """Return the simple moving average of ``x`` over windows of ``w`` samples.

    Only fully overlapping windows are used ('valid' convolution).
    """
    window = np.ones(w)
    window_sums = np.convolve(x, window, 'valid')
    return window_sums / w

def sliding_moving_average(x, w, s):
    """Moving averages of ``x`` computed inside consecutive segments.

    ``x`` is cut into back-to-back segments of ``s`` samples
    (``x[0:s]``, ``x[s:2*s]``, ...) and ``moving_average`` with window ``w``
    is applied to each segment.

    Parameters
    ----------
    x : sequence or numpy.ndarray
        One-dimensional input series.
    w : int
        Moving-average window length.
    s : int
        Segment length.

    Returns
    -------
    list of numpy.ndarray
        One array of averages per segment holding at least ``w`` samples.
        A trailing segment shorter than ``w`` is skipped, and an empty list
        is returned when ``s < 1``.
    """
    avg_array = []
    if s < 1:
        return avg_array

    for point in range(0, len(x), s):
        # The segment spans [point, point + s); slicing up to a fixed ``s``
        # would yield an empty slice for every segment after the first.
        segment = x[point:point + s]
        if len(segment) < w:
            # Not enough samples left for a single full window.
            break
        avg_array.append(moving_average(x=segment, w=w))
    return avg_array

In [None]:
# Add a per-customer min-max scaled version of ENERGY_MW (column "scaled")
# to the weekly data.
scaled_df = scale_data(df_big, 'ENERGY_MW')

create features for clustering (based on sliding avgs)

In [None]:
def _moving_average_features(frame, window, prefix, id_column):
    """Build one row of moving-average features per customer.

    Parameters
    ----------
    frame : pandas.DataFrame
        Needs an ``'AP'`` column (customer id) and a ``'scaled'`` column.
    window : int
        Moving-average window passed to ``moving_average``.
    prefix : str
        Prefix of the feature column names (``f'{prefix}1'``, ...).
    id_column : str
        Name of the trailing column that stores the customer id.

    Returns
    -------
    pandas.DataFrame
        One row per customer, in the order of ``frame['AP'].unique()``.
        The feature columns come first and the customer id is the LAST
        column, so every column name labels the data it actually holds.

    Raises
    ------
    ValueError
        If ``frame`` has no customers or the customers' series differ in
        length (the features could not form a rectangular table).
    """
    customer_ids = frame['AP'].unique()
    # One pass over the frame instead of one boolean filter per customer.
    series_by_customer = {
        ap: scaled.values
        for ap, scaled in frame.groupby('AP', sort=False)['scaled']
    }
    rows = [
        moving_average(series_by_customer[ap], window)
        for ap in tqdm(customer_ids)
    ]
    if not rows:
        raise ValueError('no customers to build features for')
    if len({len(row) for row in rows}) != 1:
        raise ValueError('all customers must have series of the same length')

    features = pd.DataFrame(np.vstack(rows))
    # Width is derived from the data (series length - window + 1).
    features.columns = [f'{prefix}{i}' for i in range(1, features.shape[1] + 1)]
    features[id_column] = customer_ids
    return features


# Window of 3 weeks. Uses its own 'tiny_' prefix so the names do not collide
# with the 'small_' columns of the 5-week window.
avg_df_tiny = _moving_average_features(scaled_df, 3, 'tiny_', 'AP_tiny')
avg_df_tiny_columns = list(avg_df_tiny.columns)

# Window of 5 weeks.
avg_df_small = _moving_average_features(scaled_df, 5, 'small_', 'AP_small')
avg_df_small_columns = list(avg_df_small.columns)

# Window of 8 weeks.
avg_df_medium = _moving_average_features(scaled_df, 8, 'medium', 'AP_medium')
avg_df_medium_columns = list(avg_df_medium.columns)

# Window of 10 weeks.
avg_df_big = _moving_average_features(scaled_df, 10, 'big_', 'AP')
avg_df_big_columns = list(avg_df_big.columns)

In [None]:
# Give all four feature frames the same 1-based row labels. The upper bound
# is derived from each frame's length instead of a hard-coded customer count.
for _features in (avg_df_big, avg_df_medium, avg_df_small, avg_df_tiny):
    _features.index = pd.RangeIndex(1, len(_features) + 1)

concat

In [None]:
# Put the four feature frames side by side (aligned on the row labels) and
# drop the id columns of the small / medium / tiny frames.
feature_frames = [avg_df_big, avg_df_small, avg_df_medium, avg_df_tiny]
avg_df = pd.concat(feature_frames, axis=1)
avg_df = avg_df.drop(columns=['AP_small', 'AP_medium', 'AP_tiny'])

selecting columns

In [None]:
# Feature columns used for clustering: every moving-average column in
# small / big / medium / tiny order, without the customer-id columns.
# Filtering by membership (instead of list.remove) cannot fail on a name
# that is absent, and dict.fromkeys drops repeated names while keeping order.
id_columns = {'AP', 'AP_small', 'AP_medium', 'AP_tiny'}
all_feature_columns = (
    list(avg_df_small_columns)
    + list(avg_df_big_columns)
    + list(avg_df_medium_columns)
    + list(avg_df_tiny_columns)
)
cols = [name for name in dict.fromkeys(all_feature_columns) if name not in id_columns]

clustering

In [None]:
# Elbow search: fit fuzzy c-means for k = 2..24 and record the inertia of
# each fit.
wcss = []
mat_grid = skfda.FDataGrid(avg_df[cols])
for n_clusters in tqdm(range(2, 25)):
    # Fixed seed (the same one as the final model) so the curve is
    # reproducible across kernel restarts.
    candidate = skfda.ml.clustering.FuzzyCMeans(
        n_clusters=n_clusters,
        metric=skfda.misc.metrics.angular_distance,
        random_state=42,
    )
    candidate.fit(mat_grid)
    wcss.append(candidate.inertia_)

wcss-k plot

In [None]:
import matplotlib.pyplot as plt
# Inertia against the number of clusters tried (2..24).
fig, ax = plt.subplots()
ax.plot(range(2, 25), wcss, 'bx-')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('WCSS')
# plt.savefig('./figures/WCSSfuzzy.png')
plt.show()

select (k=10)

In [None]:
# Final model: 10 clusters, angular distance, fixed seed.
final_model_params = dict(
    n_clusters=10,
    metric=skfda.misc.metrics.angular_distance,
    random_state=42,
)
fkm = skfda.ml.clustering.FuzzyCMeans(**final_model_params)
fkm.fit(mat_grid)

add cluster info to df

In [None]:
# Pair every customer id with its cluster label, matched by position.
customer_ids = pd.DataFrame(scaled_df['AP'].unique())
cluster_ids = pd.DataFrame(fkm.labels_)
labels = pd.concat([customer_ids, cluster_ids], axis=1)
labels.columns = ['AP', 'cluster']

some viz

In [None]:
# Plot the weekly series of one randomly chosen customer from cluster 5 and
# save the figure.
# NOTE(review): the y label says kW while the plotted column is ENERGY_MW;
# confirm the intended unit.
cluster_members = labels[labels['cluster'] == 5]
customer_id = cluster_members.sample(n=1)['AP'].item()
consumption = df_big[df_big['AP'] == customer_id]['ENERGY_MW']

fig, ax = plt.subplots()
ax.plot(consumption.index, consumption)
ax.set_xlabel('Datetime')
ax.set_ylabel('Consumption (kW)')
fig.savefig(f'./figures/bad-{customer_id}.png')
plt.show()

In [None]:
# AP -> cluster lookup, then attach the cluster to every row of scaled_df.
labels_by_ap = labels.set_index('AP')
dic = labels_by_ap['cluster']
scaled_df = scaled_df.assign(cluster=scaled_df['AP'].map(dic))

In [None]:
# Membership degrees of every customer in each of the 10 clusters, followed
# by the customer id and its assigned cluster.
degree_names = [f'degree_{k}' for k in range(0, 10)]
md = pd.DataFrame(fkm.membership_degree_, columns=degree_names)
md = pd.concat([md, labels], axis=1)

In [None]:
# Row-wise summaries of the membership degrees.
degree_cols = [f'degree_{k}' for k in range(0, 10)]
memberships = md[degree_cols]
# 'max' is the largest degree among all clusters except cluster 5.
md['max'] = memberships.drop('degree_5', axis=1).max(axis=1)
md['min'] = memberships.min(axis=1)
md['sum'] = memberships.sum(axis=1)

some viz

In [None]:
# Plot one random customer among the 100 cluster-5 members with the highest
# degree_5.
# NOTE(review): the y label says kW while the plotted column is ENERGY_MW;
# confirm the intended unit.
cluster_rows = md[md['cluster'] == 5]
strongest_members = cluster_rows.sort_values('degree_5', ascending=True).tail(100)
customer_id = strongest_members.sample(n=1)['AP'].item()
consumption = df_big[df_big['AP'] == customer_id]['ENERGY_MW']

fig, ax = plt.subplots()
ax.plot(consumption.index, consumption)
ax.set_xlabel('Datetime')
ax.set_ylabel('Consumption (kW)')
plt.show()

In [None]:
# Rows of cluster 5, plus their AP ids with the original row labels kept in
# an 'index' column.
bad_cluster = md.loc[md['cluster'].eq(5)]
AP_values = bad_cluster.loc[:, ['AP']].reset_index()

Pairwise Jensen–Shannon distance between the membership-degree vectors inside the cluster

In [None]:
from scipy.spatial.distance import jensenshannon

# Membership-degree vectors of the cluster members as a plain 2-D array
# (one row per customer, one column per cluster).
bad_cluster_distances = bad_cluster[[f'degree_{i}' for i in range(0,10)]]
bad_cluster_distances = bad_cluster_distances.to_numpy()

# Pairwise Jensen-Shannon distances between all members.
# The matrix is allocated once and filled in place. Since the distance is
# symmetric, only the upper triangle (including the diagonal) is computed
# and each value is mirrored.
n_members = len(bad_cluster_distances)
js_matrix = np.empty((n_members, n_members))
for row in tqdm(range(n_members)):
    for row_2 in range(row, n_members):
        js = jensenshannon(bad_cluster_distances[row], bad_cluster_distances[row_2])
        js_matrix[row, row_2] = js
        js_matrix[row_2, row] = js

In [None]:
# One row per cluster member, one 'distance_j' column per other member.
df_js = pd.DataFrame(js_matrix)
df_js.columns = [f'distance_{i}' for i in range(js_matrix.shape[1])]

# Mean distance of every member to all members of the cluster. It is computed
# here, before the AP column is attached, so only distances enter the mean.
# The visualisation cell sorts by this column.
df_js['mean_js'] = df_js.mean(axis=1)

# Attach the AP ids (matched by position) and discard the stale row labels.
df_js = pd.concat((df_js, AP_values), axis=1)
df_js = df_js.drop(columns='index')

some viz

In [None]:
# Plot one random customer among the 100 with the largest mean_js.
# NOTE(review): the y label says kW while the plotted column is ENERGY_MW;
# confirm the intended unit.
most_distant = df_js.sort_values('mean_js', ascending=True).tail(100)
customer_id = most_distant.sample(n=1)['AP'].item()
consumption = df_big[df_big['AP'] == customer_id]['ENERGY_MW']

fig, ax = plt.subplots()
ax.plot(consumption.index, consumption)
ax.set_xlabel('Datetime')
ax.set_ylabel('Consumption (kW)')
plt.show()