# Import thư viện

In [34]:
# Pandas for reading data
import pandas as pd

# Matplotlib for plotting graphs
import matplotlib.pyplot as plt

# Train test split
from sklearn.model_selection import train_test_split

# Multilayer Perceptron Classifier
from sklearn.neural_network import MLPClassifier

# Default (width, height) applied to every matplotlib figure created after this
# line; matplotlib documents the unit as inches. Wide and short.
plt.rcParams['figure.figsize'] = (20, 4)

# Đọc dữ liệu

In [35]:
# Power readings: a timestamp column followed by one column per measured circuit.
# header=0 together with names= replaces the header row stored in the file
# with the column names listed here.
p_df = pd.read_csv(
    './data/W.csv',
    header=0,
    names=['time', 'light', 'socket', 'heater', 'aircond1', 'aircond2', 'aircond3', 'indcooker'],
)

# Voltage readings: a timestamp column and a single measurement column `u`.
u_df = pd.read_csv(
    './data/V.csv',
    header=0,
    names=['time', 'u'],
)

# Current readings: same column layout as the power table.
i_df = pd.read_csv(
    './data/A.csv',
    header=0,
    names=['time', 'light', 'socket', 'heater', 'aircond1', 'aircond2', 'aircond3', 'indcooker'],
)

light    : chiếu sáng  
socket   : ổ cắm phòng khách + bếp + ngủ  
heater   : bình nóng lạnh  
aircond1 : điều hoà 1 & 2  
aircond2 : điều hoà 3  
aircond3 : điều hoà phòng khách  
indcooker: bếp từ  

### Bảng đo công suất tiêu thụ

In [36]:
# Display the power table as the cell's rich output (pandas truncates long frames).
p_df

Unnamed: 0,time,light,socket,heater,aircond1,aircond2,aircond3,indcooker
0,2020-07-15 00:00:00,0.0,216.0,0,244.0,0.0,631.0,1.0
1,2020-07-15 00:01:00,0.0,210.0,0,286.0,0.0,631.0,3.5
2,2020-07-15 00:02:00,0.0,206.0,0,314.0,0.0,633.0,6.0
3,2020-07-15 00:03:00,0.0,206.0,0,254.0,0.0,632.0,3.5
4,2020-07-15 00:04:00,0.0,160.0,0,274.0,0.0,629.0,1.0
...,...,...,...,...,...,...,...,...
31675,2020-08-05 23:55:00,0.0,236.0,0,183.0,0.0,0.0,1.0
31676,2020-08-05 23:56:00,0.0,238.0,0,174.0,0.0,0.0,1.0
31677,2020-08-05 23:57:00,0.0,243.0,0,199.0,0.0,0.0,1.0
31678,2020-08-05 23:58:00,0.0,241.0,0,229.0,0.0,0.0,1.0


### Bảng đo điện áp tiêu thụ

In [37]:
# Display the voltage table as the cell's rich output (pandas truncates long frames).
u_df

Unnamed: 0,time,u
0,2020-07-15 00:00:00,231
1,2020-07-15 00:01:00,230
2,2020-07-15 00:02:00,230
3,2020-07-15 00:03:00,230
4,2020-07-15 00:04:00,230
...,...,...
31675,2020-08-05 23:55:00,230
31676,2020-08-05 23:56:00,230
31677,2020-08-05 23:57:00,231
31678,2020-08-05 23:58:00,231


### Bảng đo dòng điện tiêu thụ

In [38]:
# Display the current table as the cell's rich output (pandas truncates long frames).
i_df

Unnamed: 0,time,light,socket,heater,aircond1,aircond2,aircond3,indcooker
0,2020-07-15 00:00:00,0.0,0.965,0.0,1.52,0.0,2.86,0.25
1,2020-07-15 00:01:00,0.0,0.955,0.0,1.53,0.0,2.86,0.25
2,2020-07-15 00:02:00,0.0,0.950,0.0,1.53,0.0,2.86,0.25
3,2020-07-15 00:03:00,0.0,0.950,0.0,1.53,0.0,2.87,0.25
4,2020-07-15 00:04:00,0.0,0.790,0.0,1.53,0.0,2.86,0.25
...,...,...,...,...,...,...,...,...
31675,2020-08-05 23:55:00,0.0,1.060,0.0,1.17,0.0,0.00,0.25
31676,2020-08-05 23:56:00,0.0,1.060,0.0,1.17,0.0,0.00,0.25
31677,2020-08-05 23:57:00,0.0,1.070,0.0,1.17,0.0,0.00,0.25
31678,2020-08-05 23:58:00,0.0,1.070,0.0,1.17,0.0,0.00,0.25


# Chuẩn bị dữ liệu

### Chọn các thiết bị sẽ phân loại

In [39]:
# Appliances whose on/off state is classified. Smaller subsets are kept below
# (commented out) for quicker experiments.
# NOTE: list order matters: the labelling cell maps the appliance at position i
# to bit 2**i of the class label, and the output width is 2**len(select_device).
# select_device = ['heater', 'indcooker']
# select_device = ['heater', 'indcooker', 'aircond1']
# select_device = ['heater', 'indcooker', 'aircond1', 'aircond2', 'aircond3']
select_device = ['heater', 'indcooker', 'aircond1', 'aircond2', 'aircond3', 'socket', 'light']


### Tính tổng P

In [40]:
# Aggregate power per timestamp: row-wise total over the selected appliances only.
p_df['sum'] = p_df.loc[:, select_device].sum(axis='columns')
p_df.head()

Unnamed: 0,time,light,socket,heater,aircond1,aircond2,aircond3,indcooker,sum
0,2020-07-15 00:00:00,0.0,216.0,0,244.0,0.0,631.0,1.0,1092.0
1,2020-07-15 00:01:00,0.0,210.0,0,286.0,0.0,631.0,3.5,1130.5
2,2020-07-15 00:02:00,0.0,206.0,0,314.0,0.0,633.0,6.0,1159.0
3,2020-07-15 00:03:00,0.0,206.0,0,254.0,0.0,632.0,3.5,1095.5
4,2020-07-15 00:04:00,0.0,160.0,0,274.0,0.0,629.0,1.0,1064.0


### Tính tổng I

In [41]:
# Aggregate current per timestamp: row-wise total over the selected appliances only.
i_df['sum'] = i_df.loc[:, select_device].sum(axis='columns')
i_df.head()

Unnamed: 0,time,light,socket,heater,aircond1,aircond2,aircond3,indcooker,sum
0,2020-07-15 00:00:00,0.0,0.965,0.0,1.52,0.0,2.86,0.25,5.595
1,2020-07-15 00:01:00,0.0,0.955,0.0,1.53,0.0,2.86,0.25,5.595
2,2020-07-15 00:02:00,0.0,0.95,0.0,1.53,0.0,2.86,0.25,5.59
3,2020-07-15 00:03:00,0.0,0.95,0.0,1.53,0.0,2.87,0.25,5.6
4,2020-07-15 00:04:00,0.0,0.79,0.0,1.53,0.0,2.86,0.25,5.43


### Gán nhãn

Dựa vào bảng đo công suất tiêu thụ, tại một thời điểm t, thiết bị nào có công suất tiêu thụ lớn hơn `threshold` thì xác định là thiết bị đó đang bật. 

Có n thiết bị thì sẽ có tương ứng 2^n trường hợp, hay 2^n nhãn lớp.

Ví dụ có 5 thiết bị: A, B, C, D, E:
- A đang bật => 1
- B đang tắt => 0
- C đang bật => 1
- D đang bật => 1
- E đang tắt => 0

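Hàm `set_label` bên dưới gán thiết bị thứ i trong danh sách (đếm từ 0) vào bit 2^i, tức thiết bị đầu tiên là bit thấp nhất. Vì vậy nhãn lớp sẽ là 01101 (base 2, viết theo thứ tự E D C B A) tương ứng với 1 + 4 + 8 = 13 (base 10)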


In [42]:
# Minimum power reading for an appliance to count as switched on.
threshold = 10

def set_label(row):
    """Encode the on/off state of every selected appliance as one integer.

    The appliance at position ``i`` of ``select_device`` contributes ``2**i``
    when its reading in ``row`` is strictly greater than ``threshold``, so the
    result is a bitmask in ``range(2 ** len(select_device))``.

    Args:
        row: Object indexable by appliance name (e.g. one DataFrame row).

    Returns:
        The integer bitmask; 0 when no appliance exceeds the threshold.
    """
    # Compare against the module-level `threshold` instead of a repeated literal,
    # so that changing the constant above actually changes the labelling.
    return sum(
        2 ** bit
        for bit, name in enumerate(select_device)
        if row[name] > threshold
    )

# Label every timestamp: axis=1 hands set_label one row at a time.
p_df['label'] = p_df.apply(set_label, axis=1)

# Spot-check ten random rows; no seed is passed, so the selection differs per run.
p_df.sample(10)

Unnamed: 0,time,light,socket,heater,aircond1,aircond2,aircond3,indcooker,sum,label
25338,2020-08-01 14:18:00,22.0,237.0,0,259.0,0.0,0.0,1.0,519.0,100
10034,2020-07-21 23:14:00,0.0,250.0,0,212.0,0.0,623.0,1.0,1086.0,52
11604,2020-07-23 01:24:00,0.0,149.0,0,282.0,0.0,615.0,2.0,1048.0,52
6260,2020-07-19 08:20:00,0.0,129.0,0,0.0,0.0,0.0,0.5,129.5,32
2204,2020-07-16 12:44:00,148.0,100.0,0,467.0,1108.0,0.0,1257.0,3080.0,110
14198,2020-07-24 20:38:00,0.0,132.0,0,0.0,0.0,0.0,1.0,133.0,32
7900,2020-07-20 11:40:00,44.5,1701.0,0,0.0,0.0,2096.0,2.0,3843.5,112
30462,2020-08-05 03:42:00,0.0,211.0,0,0.0,0.0,43.5,1.0,255.5,48
9050,2020-07-21 06:50:00,0.0,179.0,0,0.0,14.5,0.0,0.5,194.0,40
9992,2020-07-21 22:32:00,0.0,304.0,0,296.0,0.0,665.0,1.0,1266.0,52


### Lấy các cột tổng ra cùng với label tương ứng

In [43]:
# Assemble the modelling table: aggregate power (P), voltage (U), aggregate
# current (I) and the integer class label.
# NOTE(review): the columns are combined by DataFrame index, not by the `time`
# column. This assumes the three CSVs list the same timestamps in the same
# order; TODO confirm.
data = pd.DataFrame()
data['P'] = p_df['sum']
data['U'] = u_df['u']
data['I'] = i_df['sum']
data['Label'] = p_df['label']

# Spot-check ten random rows (unseeded, so the selection differs per run).
data.sample(10)

Unnamed: 0,P,U,I,Label
12446,1241.0,224,5.935,40
5262,2156.0,226,9.81,48
11058,2955.5,226,13.57,97
9266,233.5,224,1.405,32
1765,1041.0,228,5.16,52
407,291.5,228,1.58,96
27666,253.5,232,1.415,32
28037,2460.0,226,11.235,98
4108,316.0,223,1.77,32
17723,260.0,222,1.43,96


### Đưa về tập X và y để đưa vào model

Tập dữ liệu X phải chuẩn hoá về ma trận 2 chiều của numpy  
Mỗi dòng tương ứng với một điểm dữ liệu

In [44]:
# Feature matrix as a 2-D NumPy array: one row per sample, columns ordered P, U, I.
X = data[['P', 'U', 'I']].to_numpy()

#### Tập nhãn lớp y đưa về dạng vector one-hot (one-hot encoding)  
Ví dụ:
- label = 0 tương ứng với vector [1, 0, 0, ..., 0] 
- label = 1 tương ứng với vector [0, 1, 0, ..., 0]
- label = 2 tương ứng với vector [0, 0, 1, ..., 0]
- ...
- label = 2^n - 1 (n = số thiết bị) tương ứng với vector [0, 0, 0, ..., 1]

Số lượng nơ-ron ở tầng đầu ra = độ dài vector output = 2 ^ n

In [45]:
# One-hot encode the integer labels: row k is all zeros except for a 1 in
# column Label[k]. The width is fixed at 2**n so every on/off combination has
# a column, including combinations that never occur in the data.
n_classes = 2 ** len(select_device)

# Extract the labels once, positionally, as plain Python ints. Looking up
# data['Label'][i] inside the loop is a slow label-based Series access and
# fails when the index is not exactly 0..len(data)-1.
labels = data['Label'].tolist()

y = [[0] * n_classes for _ in labels]
for one_hot, label in zip(y, labels):
    one_hot[label] = 1

#### Cách khác để mã hoá nhãn lớp (không làm theo bài báo):

Vì cách mã hoá one-hot-coding sẽ khiến vector output có độ dài bằng 2^n, sẽ làm tầng output của mạng nơ-ron cồng kềnh dẫn đến mất thời gian tính toán, nên có một cách khác tự nhiên hơn. (Thử nghiệm cho thấy độ hiệu quả là xấp xỉ nhau)

Giữ nhãn lớp ở dạng vector gồm 1 và 0 tương ứng với trạng thái của các thiết bị.  
Ví dụ có 5 thiết bị: A, B, C, D, E:
- A đang bật => 1
- B đang tắt => 0
- C đang bật => 1
- D đang bật => 1
- E đang tắt => 0

Thì mã hoá nhãn lớp là vector [1,0,1,1,0] luôn.

Số lượng nơ ron ở tầng đầu ra = số thiết bị cần phân loại.

In [46]:
# def tobase2(n):
#     length = len(select_device)
#     ret = [0 for i in range(length)]
#     i = length - 1
#     while n > 0:
#         ret[i] = n % 2
#         n = n // 2
#         i -= 1
#     return ret

# y = [tobase2(data['Label'][i]) for i in range(len(data))]

# Đưa dữ liệu vào mô hình

### Chia tập train, test

Tỷ lệ chia 70-30, lấy theo thứ tự, không xáo trộn (để dễ dàng so sánh kết quả với các giải thuật hoặc mô hình khác)

In [47]:
# Ordered 70/30 split: with shuffle=False the leading 70% of rows become the
# training set and the trailing 30% the test set.
# NOTE(review): random_state only seeds shuffling, so it should have no effect
# while shuffle=False. Confirm against the scikit-learn documentation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, shuffle=False)

### Train

In [48]:
# Multilayer perceptron trained on the (P, U, I) features. verbose=True prints
# the training loss after every iteration.
# NOTE(review): the features are passed in unscaled although P, U and I have very
# different magnitudes; scikit-learn's MLP guidance recommends standardising
# inputs. Consider scaling before fitting.
clf = MLPClassifier(
    # Architecture: a single hidden layer with 20 units.
    hidden_layer_sizes=20,
    # Optimiser settings.
    solver='adam',
    # NOTE(review): scikit-learn documents learning_rate as used only when
    # solver='sgd', so 'adaptive' is presumably ignored here. TODO confirm.
    learning_rate='adaptive',
    alpha=1e-5,
    # Stopping criteria.
    max_iter=1000,
    n_iter_no_change=20,
    # Reproducibility and logging.
    random_state=1,
    verbose=True,
)
clf.fit(X_train, y_train)

Iteration 1, loss = 672.20963123
Iteration 2, loss = 7.44927427
Iteration 3, loss = 4.05532580
Iteration 4, loss = 3.40712160
Iteration 5, loss = 3.19006111
Iteration 6, loss = 3.05356276
Iteration 7, loss = 2.96565737
Iteration 8, loss = 2.90972740
Iteration 9, loss = 2.84062603
Iteration 10, loss = 2.80436361
Iteration 11, loss = 2.78278993
Iteration 12, loss = 2.75681200
Iteration 13, loss = 2.73494598
Iteration 14, loss = 2.73107406
Iteration 15, loss = 2.70858660
Iteration 16, loss = 2.70388076
Iteration 17, loss = 2.70541384
Iteration 18, loss = 2.68828760
Iteration 19, loss = 2.69242550
Iteration 20, loss = 2.69063292
Iteration 21, loss = 2.68690009
Iteration 22, loss = 2.67754637
Iteration 23, loss = 2.68118362
Iteration 24, loss = 2.68123924
Iteration 25, loss = 2.67324926
Iteration 26, loss = 2.65860887
Iteration 27, loss = 2.67094295
Iteration 28, loss = 2.68790350
Iteration 29, loss = 2.68249745
Iteration 30, loss = 2.67545125
Iteration 31, loss = 2.68515887
Iteration 32, l

Iteration 254, loss = 2.38866699
Iteration 255, loss = 2.38435340
Iteration 256, loss = 2.38188364
Iteration 257, loss = 2.39710915
Iteration 258, loss = 2.38281562
Iteration 259, loss = 2.37969100
Iteration 260, loss = 2.38334144
Iteration 261, loss = 2.38420631
Iteration 262, loss = 2.37767591
Iteration 263, loss = 2.38357688
Iteration 264, loss = 2.38840041
Iteration 265, loss = 2.37989555
Iteration 266, loss = 2.37843585
Iteration 267, loss = 2.37572996
Iteration 268, loss = 2.38356978
Iteration 269, loss = 2.37562879
Iteration 270, loss = 2.37773814
Iteration 271, loss = 2.37610733
Iteration 272, loss = 2.37420366
Iteration 273, loss = 2.37317665
Iteration 274, loss = 2.37372698
Iteration 275, loss = 2.38566690
Iteration 276, loss = 2.37719889
Iteration 277, loss = 2.36675972
Iteration 278, loss = 2.36894004
Iteration 279, loss = 2.37543993
Iteration 280, loss = 2.36828412
Iteration 281, loss = 2.37164605
Iteration 282, loss = 2.36652999
Iteration 283, loss = 2.37095077
Iteration 

MLPClassifier(alpha=1e-05, hidden_layer_sizes=20, learning_rate='adaptive',
              max_iter=1000, n_iter_no_change=20, random_state=1, verbose=True)

### Test

Độ chính xác của mô hình

In [49]:
# Score the fitted model on the held-out rows.
# NOTE(review): y is a 2-D 0/1 indicator matrix, so scikit-learn presumably treats
# the task as multilabel and reports subset accuracy (a sample counts as correct
# only if every output column matches). Confirm before comparing this figure
# with plain multiclass accuracy.
clf.score(X_test, y_test)

0.18771043771043772