In [1]:
# Load IPython's autoreload extension and select mode 2, which reloads all
# imported modules before each cell execution, so edits to the local `src`
# package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): re-loading the extension immediately after loading it looks
# redundant — confirm before removing.
%reload_ext autoreload

In [2]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import math as mt
import warnings

# Korean text output: select a font that provides Hangul glyphs for plot
# labels (AppleGothic is a macOS font — presumably unavailable on other
# platforms; verify) and draw the minus sign as an ASCII hyphen.
matplotlib.rc('font', family='AppleGothic')
plt.rcParams['axes.unicode_minus'] = False
# Suppress every warning for the rest of the session.
warnings.filterwarnings(action='ignore') 

from src import crs, PublicPredictor, TimeDivisionKMeans
from src.dbc import utils
from IPython.display import clear_output

In [3]:
# Step 1 - read the raw workbook: no header row, first two rows skipped.
data_path = "data/apt_1.xlsx"
xlsx = pd.read_excel(data_path, header=None, skiprows=2, engine="openpyxl")

# Step 2 - project preprocessing returns two objects; the second one (`m`)
# is re-indexed by its "month" column so a single month can be selected.
p, m = crs.utils.data_preprocessing(xlsx)
m.set_index("month", inplace=True)

# Step 3 - build the per-month table and the objects derived from it.
_month = 1

# Select the chosen month, move the index back into a column, then label
# the two resulting columns.
month_df = pd.DataFrame(m.loc[_month]).reset_index()
month_df.columns = ['name', 'usage (kWh)']

PUBLIC_PERCENTAGE = 30
APT = crs.utils.get_APT(month_df, PUBLIC_PERCENTAGE)

calc = crs.models.ManagementOffice(
    month=_month, households=month_df, APT=APT, contract="단일계약"
)
apt = calc.apart

In [4]:
# Preprocess the raw sheet with the `src.dbc` helpers. The `m_15` / `m_60`
# names suggest 15- and 60-minute resolutions — TODO confirm.
m_15 = utils.data_preprocessing(xlsx)

# `df` is the reduced frame; `m_60` is an independent copy of it.
df = utils.dimension_reduction(m_15)
m_60 = df.copy()

# Keep only the rows whose index timestamp falls in month 1 (January).
is_january_15 = m_15.index.month == 1
is_january_60 = m_60.index.month == 1

m_15_1 = m_15.loc[is_january_15].copy()
m_60_1 = m_60.loc[is_january_60].copy()

## TimeDivisionKMeans vs. KMeans: label consistency across repeated runs

In [5]:
from sklearn.metrics import euclidean_distances as euc
from src.KMeans import KMeans

# Number of independent clustering runs whose labels are compared later.
N_RUNS = 500

# One flattened label vector per run is collected in plain lists and the 2-D
# matrices are built once after the loop. Calling np.append + reshape on every
# iteration re-copies the whole accumulated array each time, which is
# quadratic in the number of runs.
kmeans_runs = []
tdkmeans_runs = []

for case in range(N_RUNS):
    # Baseline: KMeans on the transposed value matrix, followed by its
    # own `sorting()` step.
    kmeans = KMeans(datas=m_60_1.T.values, ver=1)
    kmeans.fit()
    kmeans.sorting()

    # Candidate: TimeDivisionKMeans fitted on the untransposed frame.
    tdkmeans = TimeDivisionKMeans(datas=m_60_1)
    tdkmeans.fit()

    kmeans_labels = kmeans.labels_
    tdkmeans_labels = tdkmeans.groups_

    kmeans_runs.append(np.ravel(kmeans_labels))
    tdkmeans_runs.append(np.ravel(tdkmeans_labels))

    # Presumably the fit calls print progress lines (see the cell output);
    # keep only the most recent ones instead of flooding the notebook.
    clear_output(wait=True)

# Resulting shape is (N_RUNS, labels per run). The leading empty float array
# reproduces the dtype promotion of appending onto `np.array([])`.
kmeans_sort_info = np.concatenate(
    [np.array([]), *kmeans_runs]
).reshape(-1, kmeans_labels.size)
tdkmeans_sort_info = np.concatenate(
    [np.array([]), *tdkmeans_runs]
).reshape(-1, tdkmeans_labels.size)

ECV : 51 %
1/248 - ECV:85%
11/248 - ECV:79%
21/248 - ECV:85%
31/248 - ECV:87%
41/248 - ECV:87%
51/248 - ECV:83%
61/248 - ECV:87%
71/248 - ECV:82%
81/248 - ECV:87%
91/248 - ECV:83%
101/248 - ECV:82%
111/248 - ECV:79%
121/248 - ECV:87%
131/248 - ECV:79%
141/248 - ECV:84%
151/248 - ECV:83%
161/248 - ECV:86%
171/248 - ECV:76%
181/248 - ECV:81%
191/248 - ECV:83%
201/248 - ECV:88%
211/248 - ECV:80%
221/248 - ECV:83%
231/248 - ECV:80%
241/248 - ECV:88%
248/248 - ECV:84%


In [6]:
# For every ordered pair of runs (i, j), count the label positions at which
# run j differs from run i. Counts are laid out run-by-run: first all
# comparisons against run 0, then against run 1, and so on.
#
# Each reference run is compared against the whole matrix in one vectorised
# step and the per-run results are joined once at the end. Appending a single
# count at a time with np.append copies the growing result array on every
# one of the R*R appends.
kmeans_mismatch_rows = []

for sort_info in kmeans_sort_info:
    # Boolean matrix: True where a run's label equals the reference run's.
    chk = kmeans_sort_info == sort_info
    # Number of disagreeing positions for each run (one value per row).
    kmeans_mismatch_rows.append(np.count_nonzero(~chk, axis=1))

# The leading empty float array keeps the float dtype that appending onto
# `np.array([])` produced, and yields an empty result when there are no runs.
kmeans_chk = np.concatenate([np.array([]), *kmeans_mismatch_rows])

In [7]:
# Mean number of differing label positions over every ordered pair of KMeans
# runs (each run is also compared with itself, contributing a count of 0).
kmeans_chk.mean()

52.148648

In [8]:
# For every ordered pair of runs (i, j), count the label positions at which
# run j differs from run i. Counts are laid out run-by-run: first all
# comparisons against run 0, then against run 1, and so on.
#
# Each reference run is compared against the whole matrix in one vectorised
# step and the per-run results are joined once at the end. Appending a single
# count at a time with np.append copies the growing result array on every
# one of the R*R appends.
tdkmeans_mismatch_rows = []

for sort_info in tdkmeans_sort_info:
    # Boolean matrix: True where a run's label equals the reference run's.
    chk = tdkmeans_sort_info == sort_info
    # Number of disagreeing positions for each run (one value per row).
    tdkmeans_mismatch_rows.append(np.count_nonzero(~chk, axis=1))

# The leading empty float array keeps the float dtype that appending onto
# `np.array([])` produced, and yields an empty result when there are no runs.
tdkmeans_chk = np.concatenate([np.array([]), *tdkmeans_mismatch_rows])

In [9]:
# Mean number of differing label positions over every ordered pair of
# TimeDivisionKMeans runs (each run is also compared with itself,
# contributing a count of 0).
tdkmeans_chk.mean()

3.664736

In [10]:
from sklearn.metrics import euclidean_distances as euc
from src.KMeans import KMeans

# Number of independent clustering runs whose labels are collected.
N_RUNS = 500

# One flattened label vector per run is collected in plain lists and the 2-D
# matrices are built once after the loop. Calling np.append + reshape on every
# iteration re-copies the whole accumulated array each time, which is
# quadratic in the number of runs.
kmeans_runs = []
kmeans_2_runs = []

for case in range(N_RUNS):
    # Variant A: KMeans constructed with an explicit `ver=1`.
    kmeans = KMeans(datas=m_60_1.T.values, ver=1)
    kmeans.fit()
    kmeans.sorting()

    # Variant B: same data, but `ver` is left at the constructor's default.
    kmeans_2 = KMeans(datas=m_60_1.T.values)
    kmeans_2.fit()
    kmeans_2.sorting()

    kmeans_labels = kmeans.labels_
    kmeans_2_labels = kmeans_2.labels_

    kmeans_runs.append(np.ravel(kmeans_labels))
    kmeans_2_runs.append(np.ravel(kmeans_2_labels))

    # Presumably the fit calls print progress lines (see the cell output);
    # keep only the most recent ones instead of flooding the notebook.
    clear_output(wait=True)

# NOTE(review): this rebinds `kmeans_sort_info`, which an earlier cell also
# assigns; cells that read it give different results depending on which of
# the two cells ran last.
# Resulting shape is (N_RUNS, labels per run). The leading empty float array
# reproduces the dtype promotion of appending onto `np.array([])`.
kmeans_sort_info = np.concatenate(
    [np.array([]), *kmeans_runs]
).reshape(-1, kmeans_labels.size)
kmeans_2_sort_info = np.concatenate(
    [np.array([]), *kmeans_2_runs]
).reshape(-1, kmeans_2_labels.size)

ECV : 46 %
ECV : 51 %
