# Подключение библиотек

In [1]:
from google.colab import files 
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install openpyxl==3.0.9 -q

In [3]:
import openpyxl
openpyxl.__version__

'3.0.9'

In [4]:
import tensorflow as tf
import numpy as np
import unicodedata
import re
import time
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import tensorflow.keras as keras
from tensorflow.keras.layers import Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy, CategoricalCrossentropy
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.optimizers.schedules import LearningRateSchedule
import matplotlib.pyplot as plt

In [5]:
!pip install transformers
!pip install sentencepiece



In [6]:
import torch
import json 
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [7]:
directory= '/content/drive/My Drive/Базы/мед.статьи_23 урок /1/'

# Чтение данных

In [8]:
def readText():
  artical_set = [] 
  for filename in os.listdir(directory): 
    path=directory+filename
    artical_read = pd.read_excel(path)
    artical_set.append(artical_read)
    text=[]
    abstract=[]
    for i in range(0,len(artical_set)):
      cols_total= set(artical_set[i].columns)
      if len(cols_total) > 2:
        diff = cols_total - set(artical_set[i].columns[[0, 1]])
        artical_set[i].drop(diff, axis=1, inplace=True)
      elif len(cols_total) < 2:
        continue
      artical=artical_set[i].set_axis(['Часть статьи', 'Текст статьи'], axis=1, inplace=False)
      only_abstract=artical.iloc[0]['Текст статьи']
      only_introduction=artical.iloc[1]['Текст статьи']
      only_metods=artical.iloc[2]['Текст статьи']
      only_conclusion=artical.iloc[4]['Текст статьи']
      full_text=str(only_introduction) + str(only_metods) + str(only_conclusion)
      abstract.append(only_abstract)
      text.append(full_text)
  return abstract, text

In [9]:
def clean_words(sentence):
  text=[]
  if sentence != '':
    delSymbols = ['\n', "\t", "\ufeff", ".", "_", "-", ",", "!", "?", "–", "(", ")", "«", "»", "№", ";",'•','%',"]","["]
    for dS in delSymbols: 
      sentence = sentence.replace(dS, " ") 
      sentence = re.sub("[.]", " ", sentence) 
      sentence = re.sub(":", " ", sentence)
      sentence = re.sub("<", " < ", sentence)
      sentence = re.sub(">", ">  ", sentence)
      sentence = re.sub(r"≥", "более ", sentence)
      sentence = re.sub(r"≤", "менее ", sentence)
      sentence = ' '.join(sentence.split()) 
      sentence = sentence.lower() 
      sentence = sentence.strip()
    text.append(sentence)
  return text

# Подготовка дата-сета

In [10]:
# перенести вниз! 1) читаем и чистим исходные данные
abstract, text=readText()
summaries=[]
longreview = []
for i in range(0,len(abstract)):
  clean_text = clean_words(text[i])
  longreview.append(clean_text[0])
  if type(abstract[i]) == str:
    clean_abstract=clean_words(abstract[i])
  else:
    clean_abstract="nan"
  for j in range(0,len(clean_abstract)):
    start_token, end_token = '<start>' , '<end>'
    teg_abstract = start_token + ' ' + clean_abstract[j] + ' ' + end_token
  summaries.append(teg_abstract)

In [11]:
filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
oov_token = '<unk>'
document_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token=oov_token)
summary_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters=filters, oov_token=oov_token)

# Токенизация и формирование выборки

In [12]:
document_tokenizer.fit_on_texts(longreview)
summary_tokenizer.fit_on_texts(summaries)

In [13]:
inputs = document_tokenizer.texts_to_sequences(longreview)
targets = summary_tokenizer.texts_to_sequences(summaries)

In [14]:
summary_tokenizer.texts_to_sequences(["заболевания предстательной железы"])

[[70, 371, 187]]

In [15]:
summary_tokenizer.sequences_to_texts([[70, 371, 187]])

['заболевания предстательной железы']

In [16]:
encoder_vocab_size = len(document_tokenizer.word_index) + 1
decoder_vocab_size = len(summary_tokenizer.word_index) + 1
print(encoder_vocab_size, decoder_vocab_size)

126609 37693


In [17]:
document_lengths = pd.Series([len(x) for x in longreview])
summary_lengths = pd.Series([len(x) for x in summaries])

In [18]:
#document_lengths.describe()

In [19]:
#summary_lengths.describe()

In [20]:
encoder_maxlen = 400 #12000 400
decoder_maxlen = 75 #2000 75

In [21]:
inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs, maxlen=encoder_maxlen, padding='post', truncating='post')
targets = tf.keras.preprocessing.sequence.pad_sequences(targets, maxlen=decoder_maxlen, padding='post', truncating='post')

In [22]:
inputs = tf.cast(inputs, dtype=tf.int32) #(2004, 400) => (2004, 12000)
targets = tf.cast(targets, dtype=tf.int32) #(2004, 75) => (2004,2000)

In [23]:
BUFFER_SIZE = 20000 #1000 20000
BATCH_SIZE = 64 #6 64

In [24]:
dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE) #2004/BATCH_SIZE

# Модель

In [25]:
def get_angles(position, i, d_model):
  angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
  return position * angle_rates

def positional_encoding(position, d_model):
  angle_rads = get_angles(np.arange(position)[:, np.newaxis], np.arange(d_model)[np.newaxis, :],d_model)
  angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
  angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
  pos_encoding = angle_rads[np.newaxis, ...]
  return tf.cast(pos_encoding, dtype=tf.float32)

In [26]:
def create_padding_mask(seq):
  seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
  return seq[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(size):
  mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return mask

In [27]:
def scaled_dot_product_attention(q, k, v, mask):
  matmul_qk = tf.matmul(q, k, transpose_b=True)
  dk = tf.cast(tf.shape(k)[-1], tf.float32)
  scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
  if mask is not None:
    scaled_attention_logits += (mask * -1e9)  
  attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
  output = tf.matmul(attention_weights, v)
  return output, attention_weights

In [28]:
class MultiHeadAttention(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads):
    super(MultiHeadAttention, self).__init__()
    self.num_heads = num_heads
    self.d_model = d_model
    assert d_model % self.num_heads == 0

    self.depth = d_model // self.num_heads
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)
    self.dense = tf.keras.layers.Dense(d_model)
        
  def split_heads(self, x, batch_size):
    x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(x, perm=[0, 2, 1, 3])
    
  def call(self, v, k, q, mask):
    batch_size = tf.shape(q)[0]
    q = self.wq(q)
    k = self.wk(k)
    v = self.wv(v)
    q = self.split_heads(q, batch_size)
    k = self.split_heads(k, batch_size)
    v = self.split_heads(v, batch_size)

    scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)
    scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
    concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))
    output = self.dense(concat_attention)       
    return output, attention_weights

In [29]:
def point_wise_feed_forward_network(d_model, dff):
  return tf.keras.Sequential([tf.keras.layers.Dense(dff, activation='relu'),tf.keras.layers.Dense(d_model)])

In [30]:
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(EncoderLayer, self).__init__()
    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)
    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    
  def call(self, x, training, mask):
    attn_output, _ = self.mha(x, x, x, mask)
    attn_output = self.dropout1(attn_output, training=training)
    out1 = self.layernorm1(x + attn_output)
    ffn_output = self.ffn(out1)
    ffn_output = self.dropout2(ffn_output, training=training)
    out2 = self.layernorm2(out1 + ffn_output)
    return out2

In [31]:
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(DecoderLayer, self).__init__()
    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)
    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)
    
    
  def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
    attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
    attn1 = self.dropout1(attn1, training=training)
    out1 = self.layernorm1(attn1 + x)
    attn2, attn_weights_block2 = self.mha2(enc_output, enc_output, out1, padding_mask)
    attn2 = self.dropout2(attn2, training=training)
    out2 = self.layernorm2(attn2 + out1)
    ffn_output = self.ffn(out2)
    ffn_output = self.dropout3(ffn_output, training=training)
    out3 = self.layernorm3(ffn_output + out2)
    return out3, attn_weights_block1, attn_weights_block2

In [32]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding, rate=0.1):
    super(Encoder, self).__init__()
    self.d_model = d_model
    self.num_layers = num_layers
    self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)
    self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(rate)
        
  def call(self, x, training, mask):
    seq_len = tf.shape(x)[1]
    x = self.embedding(x)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x += self.pos_encoding[:, :seq_len, :]
    x = self.dropout(x, training=training)
    for i in range(self.num_layers):
      x = self.enc_layers[i](x, training, mask)
    return x

In [33]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, maximum_position_encoding, rate=0.1):
    super(Decoder, self).__init__()
    self.d_model = d_model
    self.num_layers = num_layers
    self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)
    self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(rate)
    
  def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
    seq_len = tf.shape(x)[1]
    attention_weights = {}
    x = self.embedding(x)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x += self.pos_encoding[:, :seq_len, :]
    x = self.dropout(x, training=training)
    for i in range(self.num_layers):
      x, block1, block2 = self.dec_layers[i](x, enc_output, training, look_ahead_mask, padding_mask)
      attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
      attention_weights['decoder_layer{}_block2'.format(i+1)] = block2
    return x, attention_weights

In [34]:
class Transformer(tf.keras.Model):
  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.1):
    super(Transformer, self).__init__()
    self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, pe_input, rate)
    self.decoder = Decoder(num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, rate)
    self.final_layer = tf.keras.layers.Dense(target_vocab_size)
    
  def call(self, inp, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
    enc_output = self.encoder(inp, training, enc_padding_mask)
    dec_output, attention_weights = self.decoder(tar, enc_output, training, look_ahead_mask, dec_padding_mask)
    final_output = self.final_layer(dec_output)
    return final_output, attention_weights

# Обучение

In [35]:
num_layers = 4
d_model = 128
dff = 512
num_heads = 8
EPOCHS = 20

In [36]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()
    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)
    self.warmup_steps = warmup_steps
    
  def __call__(self, step):
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)
    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [37]:
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

In [38]:
def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)
  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask
  return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

In [39]:
train_loss = tf.keras.metrics.Mean(name='train_loss')

In [40]:
transformer = Transformer(num_layers, d_model, num_heads, dff,encoder_vocab_size, 
    decoder_vocab_size, pe_input=encoder_vocab_size, pe_target=decoder_vocab_size,)

In [41]:
def create_masks(inp, tar):
  enc_padding_mask = create_padding_mask(inp)
  dec_padding_mask = create_padding_mask(inp)
  look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
  dec_target_padding_mask = create_padding_mask(tar)
  combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
  return enc_padding_mask, combined_mask, dec_padding_mask

In [42]:
checkpoint_path = "checkpoints"
ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print ('Latest checkpoint restored!!')

In [43]:
@tf.function
def train_step(inp, tar):
  tar_inp = tar[:, :-1]
  tar_real = tar[:, 1:]
  enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
  with tf.GradientTape() as tape:
    predictions, _ = transformer(inp, tar_inp, True, enc_padding_mask, combined_mask, dec_padding_mask) 
    loss = loss_function(tar_real, predictions)

  gradients = tape.gradient(loss, transformer.trainable_variables)    
  optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
  train_loss(loss)

In [44]:
for epoch in range(EPOCHS):
  start = time.time()
  train_loss.reset_states()
  for (batch, (inp, tar)) in enumerate(dataset): # batch 31, dataset 400, 75
    train_step(inp, tar)
    print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, train_loss.result()))
    if batch % 31 == 0:
      print ("все")

Epoch 1 Batch 0 Loss 10.5414
все
Epoch 1 Batch 1 Loss 10.5406
Epoch 1 Batch 2 Loss 10.5402
Epoch 1 Batch 3 Loss 10.5408
Epoch 1 Batch 4 Loss 10.5404
Epoch 1 Batch 5 Loss 10.5399
Epoch 1 Batch 6 Loss 10.5402
Epoch 1 Batch 7 Loss 10.5402
Epoch 1 Batch 8 Loss 10.5399
Epoch 1 Batch 9 Loss 10.5400
Epoch 1 Batch 10 Loss 10.5397
Epoch 1 Batch 11 Loss 10.5394
Epoch 1 Batch 12 Loss 10.5390
Epoch 1 Batch 13 Loss 10.5385
Epoch 1 Batch 14 Loss 10.5381
Epoch 1 Batch 15 Loss 10.5378
Epoch 1 Batch 16 Loss 10.5373
Epoch 1 Batch 17 Loss 10.5368
Epoch 1 Batch 18 Loss 10.5365
Epoch 1 Batch 19 Loss 10.5361
Epoch 1 Batch 20 Loss 10.5358
Epoch 1 Batch 21 Loss 10.5353
Epoch 1 Batch 22 Loss 10.5348
Epoch 1 Batch 23 Loss 10.5343
Epoch 1 Batch 24 Loss 10.5338
Epoch 1 Batch 25 Loss 10.5334
Epoch 1 Batch 26 Loss 10.5328
Epoch 1 Batch 27 Loss 10.5323
Epoch 1 Batch 28 Loss 10.5317
Epoch 1 Batch 29 Loss 10.5311
Epoch 1 Batch 30 Loss 10.5306
Epoch 1 Batch 31 Loss 10.5300
все
Epoch 2 Batch 0 Loss 10.5067
все
Epoch 2 B

In [45]:
def evaluate(input_document):
  input_document = document_tokenizer.texts_to_sequences([input_document])
  input_document = tf.keras.preprocessing.sequence.pad_sequences(input_document, maxlen=encoder_maxlen, padding='post', truncating='post')
  encoder_input = tf.expand_dims(input_document[0], 0)
  decoder_input = [summary_tokenizer.word_index["<start>"]]
  output = tf.expand_dims(decoder_input, 0)
  for i in range(decoder_maxlen):
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(encoder_input, output)
    predictions, attention_weights = transformer(encoder_input, output,False, enc_padding_mask,combined_mask, dec_padding_mask)
    predictions = predictions[: ,-1:, :]
    predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
    if predicted_id == summary_tokenizer.word_index["<end>"]:
      return tf.squeeze(output, axis=0), attention_weights
    output = tf.concat([output, predicted_id], axis=-1)
  return tf.squeeze(output, axis=0), attention_weights

In [46]:
def summarize(input_document):
  summarized = evaluate(input_document=input_document)[0].numpy()
  summarized = np.expand_dims(summarized[1:], 0)  
  return summary_tokenizer.sequences_to_texts(summarized)[0]  

# Проверка результатов

In [47]:
# вывод сгенерированного саммари статьи (результат НС)
summarize(longreview[1])

'в в статье и методы в статье и методы в статье и методы в статье и методы в статье и методы в статье и методы в статье и методы в статье и методы в статье и методы в статье и методы в статье и методы в статье и методы в статье и методы в статье и методы в статье и методы в статье и методы в статье и методы в статье и методы в статье'

In [57]:
# вывод полного текста статьи
longreview[1]

'в настоящее время около 37 млн человек в мире живут с вич инфекцией 1 общее число вич инфицированных в российской федерации составило 1272403 человека по состоянию на 1 е полугодие 2018 г наиболее высокая частота инфицированности вич наблюдается среди лиц 30 44 лет среди больных преобладают мужчины 63 из них 3 3 в возрасте 35 39 лет многие планируют стать отцами1 антиретровирусная терапия арвт позволяет этим пациентам безопасно реализовать репродуктивную функцию арвт проводят по трехкомпонентной схеме цель терапии подавление репликации вируса и снижение количества рнк вич до неопределяемых значений как в крови так и в семенной плазме 2 3 исследователи не выявили на поверхности сперматозоидов рецепторы необходимые для проникновения вич в клетку а именно cd4 cxcr4 и ccr5 4 тем не менее остается много вопросов о влиянии вич инфекции на фертильность мужчин одни исследователи опубликовали данные о том что у вичинфицированных пациентов уменьшается объем эякулята общее количество концентраци

In [58]:
# вывод изначального саммари статьи 
summaries[1]

'<start> цель исследования оценить показатели спермограммы мужчин инфицированных вич и принимающих антиретровирусные препараты материалы и методы в проспективное исследование случай контроль вошли 115 мужчин в основную группу 51 пациент с вич инфекцией в контрольную группу 64 пациента с отрицательным результатом серологического исследования на наличие вич и нормальными показателями спермограммы проведена оценка 97 и 111 образцов спермы статус пациентов с вичинфекцией оценивали на основании данных о стадии и фазе заболевания уровне вирусной нагрузки уровне лимфоцитов cd3+ cd4+ cd8+ и длительности антиретровирусной терапии арвт результаты стаж вич инфекции варьировал от 2 до 9 лет медиана 5 лет антиретровирусные препараты принимали все пациенты медиана продолжительности приема препаратов 1 5 года у 48 4 вич инфицированных пациентов выявлена нормозооспермия в структуре патозооспермии преобладала тератозооспермия 40 при этом доля патологических форм сперматозоидов у мужчин с вич инфекцией 