In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Input, Dense, LSTM, Dropout
from tensorflow.keras import Model
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from sklearn.utils import shuffle
import os
import json
from tqdm import tqdm
import tensorflow_datasets as tfds

In [2]:
# Data locations, relative to the working directory.
# NOTE(review): the 'drive/My Drive' prefix suggests a mounted Google Drive in Colab — confirm.
# label_folder is listed and parsed for the labels; data_folder is listed and read for the note texts.
label_folder = 'drive/My Drive/Colab Notebooks/data/challenge2008/training/labels'
data_folder = 'drive/My Drive/Colab Notebooks/data/challenge2008/training/notes_cuis'

In [3]:
## label_read
# Build one binary label per file in label_folder.
# A file's label comes from its lines whose whitespace-separated fields are
# ['intuitive', _, 'Obesity', _, <flag>, ...]: flag 'N' -> 0, anything else -> 1.
# If several lines match, the last one wins; if none match, the label stays 0.
# NOTE(review): labels are appended in os.listdir order, which Python documents
# as arbitrary; anything paired with `label` by position must be produced in
# the same file order — confirm.
label = []

for file in os.listdir(label_folder):
    with open(os.path.join(label_folder, file), 'r') as file_read:
        y = 0  # default when no matching line is found
        for line in file_read:
            fields = line.split()
            # Blank or short lines cannot match (the check reads up to
            # fields[4]); skip them instead of raising IndexError.
            if len(fields) < 5:
                continue
            if fields[0] == 'intuitive' and fields[2] == 'Obesity':
                y = 0 if fields[4] == 'N' else 1
    label.append(y)

In [4]:
# Read every file in data_folder as UTF-8 text, stripped of leading/trailing
# whitespace, and collect the contents in `x` (one string per file, in
# os.listdir order).
x = []

for file in os.listdir(data_folder):
  filepath = os.path.join(data_folder, file)
  # Context manager so each file handle is closed as soon as it has been read.
  with open(filepath, encoding='UTF-8') as note_file:
    x.append(note_file.read().strip())

In [5]:
# Sanity check: the number of documents should equal the number of labels;
# then show one document together with its label.
print(len(x), len(label))
print(x[1], label[1])

730 730
C0004238 C0004238 C0003578 C0003578 C1281590 C0232197 C0232197 C0232197 C0232197 C0677519 C1283839 C1283838 C0034121 C0034121 C0596002 C0596002 C0038454 C0038454 C0038454 C0038454 C0455458 C0455458 C0019825 C0005767 C0005767 C0039985 C0403447 C0559499 C0013182 C1140621 C0011860 C1281592 C1281594 C0036658 C0006277 C0700501 C0028754 C0028754 C0028754 C0028754 C0586177 C0243032 C0028756 C0005821 C0020538 C0020538 C0042591 C0042591 C0042591 C0042591 C0333548 C0007226 C0024485 C0024485 C0024485 C0024485 C0024485 C0024485 C1272641 C1272641 C0024109 C0012569 C0012569 C0012569 C0012569 C0012569 C0012569 C0012569 C0035253 C1140618 C0543467 C0232483 C0558145 C0005823 C0005823 C0475371 C0475371 C0262926 C0262926 C0262926 C0262926 C0262926 C0262926 C0262926 C0262926 C0262926 C0021853 C1623258 C1285009 C0197554 C1280538 C1271104 C1271104 C1280999 C0013604 C1883552 C1883552 C1883552 C0202194 C1278896 C1269612 C1306645 C0200005 C0018681 C0027530 C0260877 C1269611 C1281570 C1281570 C1281570 C1

In [6]:
def tokenize(lang):
  """Fit a tokenizer on `lang` and encode the same texts as padded id sequences.

  Args:
    lang: iterable of text strings; used both to fit the tokenizer and as the
      texts to encode.

  Returns:
    (tensor, lang_tokenizer): the encoded texts after `pad_sequences` with
    padding='post', and the fitted Keras `Tokenizer`.
  """
  preprocessing = tf.keras.preprocessing

  # filters='' turns off the tokenizer's character filtering.
  lang_tokenizer = preprocessing.text.Tokenizer(filters='')
  lang_tokenizer.fit_on_texts(lang)

  encoded = lang_tokenizer.texts_to_sequences(lang)
  tensor = preprocessing.sequence.pad_sequences(encoded, padding='post')
  return tensor, lang_tokenizer

In [7]:
# Encode and pad all documents; the fitted tokenizer is kept alongside the array.
x_tensor, inp_lang_tokenizer = tokenize(x)


In [8]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the integer labels (rebinds the module-level name `y`).
y = to_categorical(label)

# Size of the tokenizer's word index plus one.
# NOTE(review): the +1 presumably reserves id 0 for padding — confirm.
vocab_size = len(inp_lang_tokenizer.word_index)+1

In [12]:
# Shapes of the padded input array and the one-hot label array.
print(x_tensor.shape, y.shape)

(730, 1026) (730, 2)


In [25]:
# from sklearn.model_selection import train_test_split

# xtr, xte, ytr, yte = train_test_split(x_tensor, y, test_size=0.2)

# print(len(xtr), len(ytr), len(xte), len(yte))

584 584 146 146


In [23]:
from tensorflow.keras import layers

# Document classifier: token embedding -> two stacked bidirectional LSTMs ->
# dense head with dropout -> 2-way softmax.
model = tf.keras.Sequential([
        # mask_zero=True: the inputs are zero-padded at the end by
        # pad_sequences(padding='post'), so id 0 is padding. Masking makes the
        # recurrent layers skip those steps instead of running over the padded
        # tail of every document. Parameter count is unchanged.
        layers.Embedding(vocab_size, 64, mask_zero=True),
        # First BiLSTM returns the full sequence so the second one can consume it.
        layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
        # Second BiLSTM returns only its final output (one vector per document).
        layers.Bidirectional(layers.LSTM(32)),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(2, activation='softmax')
])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 64)          318336    
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 128)         66048     
_________________________________________________________________
bidirectional_3 (Bidirection (None, 64)                41216     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 130       
Total params: 429,890
Trainable params: 429,890
Non-trainable params: 0
________________________________________________

In [24]:
# Adam optimizer with learning rate 1e-4; accuracy is the only tracked metric.
# NOTE(review): the model ends in a 2-unit softmax and the targets are one-hot
# (to_categorical), so CategoricalCrossentropy would be the conventional loss —
# confirm BinaryCrossentropy is intended.
model.compile(optimizer=tf.keras.optimizers.Adam(1e-4), loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])

In [28]:
# results = model.fit(xtr, ytr, epochs=10, batch_size=32, validation_split = 0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [25]:
# Federated-averaging settings.
# C is referenced only in the commented-out loop at the end (m = max(C*K, 1));
# presumably the fraction of clients sampled per round — TODO confirm.
C = 0.2
# E and B are the epochs and batch size passed to model.fit in ClientUpdate.
E = 10
B = 16
# Snapshot of the model's current weights; passed to ClientUpdate as the starting point.
w = model.get_weights()

In [26]:
# K is referenced only in commented-out code (m = max(C*K, 1));
# presumably the number of clients — TODO confirm.
K = 10
# NOTE(review): this rebinds `round`, shadowing the Python builtin of the same
# name for the rest of the session.
round = 10

In [27]:
def ClientUpdate(data, label, w):
    """Train the shared model on one client's data, starting from weights `w`.

    Loads `w` into the module-level `model`, fits it on (`data`, `label`)
    using the module-level batch size `B` and epoch count `E` with
    validation_split=0.2, and returns the model's weights afterwards.

    Args:
        data: the client's input samples, passed to `model.fit` as x.
        label: the client's targets, passed to `model.fit` as y.
        w: weights to load into `model` before training.

    Returns:
        The value of `model.get_weights()` after fitting.
    """
    fit_options = dict(batch_size=B, epochs=E, validation_split=0.2)

    model.set_weights(w)
    model.fit(data, label, **fit_options)
    updated_weights = model.get_weights()
    return updated_weights

In [28]:
# Split the data into K equally sized, contiguous client silos:
# silo k holds samples [k*silo_size, (k+1)*silo_size).
# Sizes are derived from the arrays and K rather than hard-coded, and the
# silos keep the dtypes of x_tensor / y (no pre-filled integer buffers, so the
# one-hot labels are not cast to int).
silo_size = len(x_tensor) // K
n_used = silo_size * K  # drop any remainder so every silo has the same size

# .copy() keeps the silos independent of x_tensor / y.
x_silos = x_tensor[:n_used].reshape(K, silo_size, -1).copy()
y_silos = y[:n_used].reshape(K, silo_size, -1).copy()

In [29]:
# Shape of a single client's input silo.
print(x_silos[1].shape)

(73, 1026)


In [30]:
# Federated averaging: in each round every client trains from the current
# global weights `w`, then `w` is replaced by the average of the clients'
# resulting weights.
# The silos are not shuffled per round: every client participates and the mean
# is order-independent, and rebinding `label` here would clobber the label
# list built earlier in the notebook.
# NOTE(review): this runs 2 rounds, not the `round` setting defined earlier —
# confirm intended.
for i in range(2):
    print("###round{}###".format(i+1))
    weight = []

    for j in range(len(x_silos)):
        print("###client{}###".format(j+1))
        weight.append(ClientUpdate(x_silos[j], y_silos[j], w))

    # Average layer by layer. Each client returns a list of arrays with
    # different shapes, so the list of lists is ragged and cannot be passed to
    # np.mean as a whole (NumPy >= 1.24 raises ValueError). zip(*weight) groups
    # the same layer across clients; the result is a plain list of arrays, as
    # model.set_weights expects.
    w = [np.mean(layer_weights, axis=0) for layer_weights in zip(*weight)]

###round1###
###client1###
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
###client2###
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
###client3###
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
###client4###
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
###client5###
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
###client6###
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
###client7###
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
###client8###
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10

In [None]:
# for i in range(round):
#     m = max(C*K, 1)
#     m = int(m)
#     data,label = shuffle(x_silos, y_silos)
#     S_data = data[:m]
#     S_label = label[:m]
#     weight = []

#     for j in range(m):
#         weight.append(ClientUpdate(S_data[j], S_label[j], w))
#     w = np.mean(weight, axis=0)
