In [1]:

# Install TensorFlow and also our package via PyPI.
# The versions are pinned to the ones this notebook was last run with, so a
# fresh runtime resolves the same packages. %pip (rather than !pip) installs
# into the environment of the running kernel.
%pip install tensorflow-gpu==2.4.0
%pip install headliner==1.0.2

Collecting tensorflow-gpu
[?25l  Downloading https://files.pythonhosted.org/packages/f1/aa/ae64be5acaac9055329289e6bfd54c1efa28bfe792f9021cea495fe2b89d/tensorflow_gpu-2.4.0-cp36-cp36m-manylinux2010_x86_64.whl (394.7MB)
[K     |████████████████████████████████| 394.7MB 44kB/s 
Installing collected packages: tensorflow-gpu
Successfully installed tensorflow-gpu-2.4.0
Collecting headliner
[?25l  Downloading https://files.pythonhosted.org/packages/74/e4/386e9f58b8464261d4e220abaebe66da2426d55b6ea4186ec2cb828195ef/headliner-1.0.2-py3-none-any.whl (65kB)
[K     |████████████████████████████████| 71kB 9.1MB/s 
[?25hCollecting transformers>=2.2.2
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 14.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13

In [2]:
# Download and unpack the English-Russian sentence pairs, then peek at the file.
# -nc skips the download when rus-eng.zip already exists, and -o lets unzip
# overwrite earlier extractions without prompting, so the cell can be re-run.
!wget -nc http://www.manythings.org/anki/rus-eng.zip
!unzip -o rus-eng.zip
!head rus.txt

--2020-12-26 21:20:06--  http://www.manythings.org/anki/rus-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.24.108.196, 172.67.173.198, 104.24.109.196, ...
Connecting to www.manythings.org (www.manythings.org)|104.24.108.196|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13811083 (13M) [application/zip]
Saving to: ‘rus-eng.zip’


2020-12-26 21:20:07 (27.2 MB/s) - ‘rus-eng.zip’ saved [13811083/13811083]

Archive:  rus-eng.zip
  inflating: rus.txt                 
  inflating: _about.txt              
Go.	Марш!	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1159202 (shanghainese)
Go.	Иди.	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #5898247 (marafon)
Go.	Идите.	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #5898250 (marafon)
Hi.	Здравствуйте.	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #402127 (odexed)
Hi.	Привет!	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #466968 (katjka

In [3]:
# Create the dataset but only take a subset for faster training
import io

def create_dataset(path, num_examples):
    """Read a tab-separated text file and return its columns.

    Args:
        path: Path to a UTF-8 encoded file with one example per line and
            tab-separated fields.
        num_examples: Number of leading lines to keep. It is used as a slice
            bound, so ``None`` keeps every line.

    Returns:
        A ``zip`` iterator that yields one tuple per column. Because the
        rows are transposed with ``zip``, the number of columns is that of
        the shortest row; extra fields on longer rows are dropped.
    """
    # Read inside a context manager so the file handle is closed right away
    # instead of being left for the garbage collector.
    with io.open(path, encoding='UTF-8') as source:
        lines = source.read().strip().split('\n')
    word_pairs = [line.split('\t') for line in lines[:num_examples]]
    return zip(*word_pairs)

# Use only the first 30000 lines of the file. The third column is unpacked
# into `meta` but is not part of the (English, Russian) pairs kept in `data`.
eng, rus, meta = create_dataset(path='rus.txt', num_examples=30000)
data = [(eng_sentence, rus_sentence) for eng_sentence, rus_sentence in zip(eng, rus)]
data[:5]

[('Go.', 'Марш!'),
 ('Go.', 'Иди.'),
 ('Go.', 'Идите.'),
 ('Hi.', 'Здравствуйте.'),
 ('Hi.', 'Привет!')]

In [4]:
# Split the dataset into train and test
from sklearn.model_selection import train_test_split

# Hold out 100 pairs. A fixed random_state makes the shuffle repeatable, so
# the same examples land in each split after a kernel restart.
train, test = train_test_split(data, test_size=100, random_state=42)

In [5]:
# Sanity check: number of pairs in the training split returned by train_test_split.
len(train)

29900

In [6]:
# Define the model and train it
import tensorflow as tf
from headliner.trainer import Trainer
from headliner.model.attention_summarizer import AttentionSummarizer

# Model definition. max_prediction_len here and max_output_len in the trainer
# options below are both set to 10.
summarizer = AttentionSummarizer(
    lstm_size=256,
    embedding_size=64,
    max_prediction_len=10,
)

# Collect the trainer options in one place, then unpack them into Trainer.
trainer_options = dict(
    batch_size=32,
    steps_per_epoch=500,
    steps_to_log=50,
    max_vocab_size_encoder=30000,
    max_vocab_size_decoder=30000,
    max_output_len=10,
    model_save_path='/tmp/summarizer',
)
trainer = Trainer(**trainer_options)

# The held-out `test` split is passed as the validation data.
trainer.train(summarizer, train, num_epochs=10, val_data=test)

training a bare model, preprocessing data to init model...
fitting tokenizers...
vocab encoder: 3582, vocab decoder: 9829
epoch 0, batch 50, logs: {'loss': 3.4555767822265624}
epoch 0, batch 100, logs: {'loss': 3.16320556640625}
epoch 0, batch 150, logs: {'loss': 3.049910481770833}
epoch 0, batch 200, logs: {'loss': 2.982446594238281}
epoch 0, batch 250, logs: {'loss': 2.927796630859375}
epoch 0, batch 300, logs: {'loss': 2.8833502197265624}
epoch 0, batch 350, logs: {'loss': 2.8410792759486605}
epoch 0, batch 400, logs: {'loss': 2.7926870727539064}
epoch 0, batch 450, logs: {'loss': 2.752205132378472}
epoch 0, batch 500, logs: {'loss': 2.70997802734375}

(input) <start> she is kind . <end> 
(target) <start> она добрая . <end> 
(prediction) я не не . <end>


(input) <start> it may break . <end> 
(target) <start> он может сломаться . <end> 
(prediction) я не не не . <end>


(input) <start> we need a goal . <end> 
(target) <start> нам нужна цель . <end> 
(prediction) я не не не не . <end

In [7]:
# Run the trained summarizer on a single hand-written English sentence.
summarizer.predict('Do you have a plan?')

'у вас есть план ? <end>'

Построим какую-нибудь картинку

In [8]:
# Recursively pack /tmp/summarizer (the model_save_path given to the Trainer) into one archive.
!zip -r model-en-ru_v2.zip  /tmp/summarizer/

  adding: tmp/summarizer/ (stored 0%)
  adding: tmp/summarizer/decoder.index (deflated 65%)
  adding: tmp/summarizer/summarizer.pkl (deflated 64%)
  adding: tmp/summarizer/encoder.index (deflated 58%)
  adding: tmp/summarizer/decoder.data-00000-of-00001 (deflated 8%)
  adding: tmp/summarizer/encoder.data-00000-of-00001 (deflated 7%)
  adding: tmp/summarizer/checkpoint (deflated 38%)


In [9]:
# Colab-only: google.colab is available inside a Colab runtime.
# Send the model archive to the browser as a file download.
from google.colab import files
files.download("./model-en-ru_v2.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

64