In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
!pip install transformers
!pip install simpletransformers



In [3]:
import numpy as np
import pandas as pd
import os, json, gc, re, random
from tqdm.notebook import tqdm
# from tqdm import tqdm
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import logging
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [4]:
import torch, transformers, tokenizers
torch.__version__, transformers.__version__, tokenizers.__version__

('1.8.1+cu101', '4.6.1', '0.10.3')

In [5]:
cd drive/My Drive/Colab Notebooks/experiments

/content/drive/My Drive/Colab Notebooks/experiments


In [6]:
data_file = pd.read_csv("data/trofix.csv")

In [7]:
data_file = data_file.rename(columns={'sentence': 'input_text'}) #abstract - sentence
data_file['target_text'] = data_file[['arg1', 'arg2', 'verb']].agg(' '.join, axis=1) #title - metaphorical words
#data_file['target_text'] = data_file['verb'] #title - metaphorical words

In [8]:
papers = data_file[["input_text", "target_text"]]
papers.head()

Unnamed: 0,input_text,target_text
0,Triple mileage has struck another blow to the ...,mileage struck blow
1,U.S. officials said evidence suggests that a J...,terrorist attack target
2,"Some police forces , for example , have steppe...",forces stepped use
3,"Every day his troops gather under the green , ...",day pour stream
4,He says manufacturers are increasingly rolling...,manufacturers rolling products


In [9]:
%%time

from simpletransformers.seq2seq import Seq2SeqModel

eval_df = papers.sample(frac=0.1, random_state=42)
train_df = papers.drop(eval_df.index)

model_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "save_model_every_epoch": False,
    "save_eval_checkpoints": False,
    "max_seq_length": 512,
    "train_batch_size": 6,
    "num_train_epochs": 3,
}

# Create a Bart-base model
model = Seq2SeqModel(encoder_decoder_type="bart",
                    encoder_decoder_name="facebook/bart-base",
                    args=model_args)

CPU times: user 5.08 s, sys: 1.03 s, total: 6.12 s
Wall time: 8.7 s


In [10]:
%%time

# Train the model
model.train_model(train_df)

# Evaluate the model
result = model.eval_model(eval_df)
print(result)

INFO:simpletransformers.seq2seq.seq2seq_utils: Creating features from dataset file at cache_dir/


  0%|          | 0/1300 [00:00<?, ?it/s]

INFO:simpletransformers.seq2seq.seq2seq_model: Training started


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/217 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/217 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/217 [00:00<?, ?it/s]

INFO:simpletransformers.seq2seq.seq2seq_model:Saving model into outputs/
INFO:simpletransformers.seq2seq.seq2seq_model: Training of facebook/bart-base model complete. Saved to outputs/.
INFO:simpletransformers.seq2seq.seq2seq_utils: Creating features from dataset file at cache_dir/


  0%|          | 0/144 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/18 [00:00<?, ?it/s]

INFO:simpletransformers.seq2seq.seq2seq_model:{'eval_loss': 1.6503019879261653}


{'eval_loss': 1.6503019879261653}
CPU times: user 2min 53s, sys: 3.23 s, total: 2min 57s
Wall time: 2min 47s


In [11]:
for _ in range(65):

    random_idx = random.randint(0, len(eval_df)-1)

    abstract = eval_df.iloc[random_idx]['input_text']
    true_title = eval_df.iloc[random_idx]['target_text']

    # Predict with trained BART model
    predicted_title = model.predict([abstract])[0]

    print(f'True Title: {true_title}\n')
    print(f'Predicted Title: {predicted_title}\n')
    print(f'Abstract: {abstract}\n\n\n')

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: kennedy kill newspapers

Predicted Title: k kill newspapers

Abstract: Senator Kennedy , and the handful of legislators who knew about the secret provision , intended to kill two unfriendly newspapers .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: companies examining possibilities

Predicted Title: compan examining possibilities

Abstract: It said that the companies are examining the possibilities provided within the framework of securities law but that `` all these considerations have n't been in any way concluded . ''





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: that absorb sunlight

Predicted Title: that absorb sunlight

Abstract: Mr. Gallas says he began looking into melanin about five years ago when he was working on solar energy and was looking for materials that absorb sunlight .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: they kill you

Predicted Title: they kill you

Abstract: `` But we found that at the end of three weeks , they want to kill you . ''





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: intention examining records

Predicted Title: seccerns examining records

Abstract: Prompted by news reports of alleged misappropriation of funds , the SEC visited the firm 's New York headquarters last week with the intention of examining business records .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: effectiveness dissolve gallstones

Predicted Title: drug dissolve gallstones

Abstract: But some physicians questioned the effectiveness of the drug , designed to dissolve gallstones , and how widely it might be used .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: which killed 188

Predicted Title: which killed 188

Abstract: The latest truce in the week-old battle , which has killed 188 and injured 534 , was arranged by Syrian President Assad and Iranian President Khamenei .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: that escape detection

Predicted Title: thatmissions escape detection

Abstract: Most patients will not be aware of an expensive test forgone or a consultation not provided -- omissions that can readily escape detection if they are limited in scope .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: it kills parasite

Predicted Title: it kills parasite

Abstract: It kills the parasite residing in red blood cells and has few side effects , SmithKline said .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: she examines foods

Predicted Title: you examines foods

Abstract: `` You could change husbands , but not the situation , '' she thinks as she examines snack foods .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: economy absorb them

Predicted Title: econom absorb them

Abstract: But Korea 's booming economy can absorb them , economists say .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: structure fixed it

Predicted Title: that reached it

Abstract: `` I 'm quite sure there was someone within the IBM structure that could have just reached in and fixed it . ''





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: state escape base

Predicted Title: state escape reliance

Abstract: All agree that the state must diversify its industrial base to escape the recently detrimental reliance on oil , said Ron Jones , chairman of the New Orleans Business Council and president of Louisiana Coca Cola Bottling Ltd .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: poles missed message

Predicted Title: p missed message

Abstract: Few Poles missed that message in his tributes to their leader .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: delays filling orders

Predicted Title: it filling orders

Abstract: Lillian Vernon Corp . , a mail-order company , said it is experiencing delays in filling orders at its new national distribution center in Virginia Beach , Va .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: that lend to

Predicted Title: that lend to

Abstract: But bankers that lend to farmers suddenly are worrying anew about their borrower 's financial health .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: e examined horse

Predicted Title: dr examined horse

Abstract: A Dr. E `` examined a horse that had warts on its forelegs , '' begins one item in the newsletter , which withholds names of vets and patients .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: farmers planted smallest

Predicted Title: farmers planted acres

Abstract: Farmers in Kansas , the No. 1 wheat-producing state , planted 10.2 million acres , the smallest since 1971 .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: requirements pump %

Predicted Title: that pump capital

Abstract: They are irate about new capital-adequacy requirements that force securities firms to pump at least 20 % more capital into reserves .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: attempt knock amendment

Predicted Title: attempt knock amendment

Abstract: The issue is particularly hot in Florida , where a coalition of civic leaders , civil libertarians and Hispanic groups went to federal court recently in an unsuccessful attempt to knock the proposed amendment off the ballot .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: unions grab members

Predicted Title: union grab members

Abstract: The suspension likely will accelerate inter-union warfare , as the electricians step up their recruitment drive at new sites , while other unions try to grab electricians ' members in factories where they represent the minority , Mr. Metcalfe says .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: requirements pump %

Predicted Title: that pump capital

Abstract: They are irate about new capital-adequacy requirements that force securities firms to pump at least 20 % more capital into reserves .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: remark cooled heels

Predicted Title: making cooled heels

Abstract: `` That remark -LRB- of the official -RRB- definitely cooled the heels of the bulls , '' said Paul Clohesy , chief dealer at Australia and New Zealand Banking Group Ltd. in New York .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: which killed 188

Predicted Title: which killed 188

Abstract: The latest truce in the week-old battle , which has killed 188 and injured 534 , was arranged by Syrian President Assad and Iranian President Khamenei .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: reagan examine creation

Predicted Title: boardagan examine issues

Abstract: CNW Corp . , the parent of the Chicago & North Western Transportation Co . , said President Reagan has ordered the creation of a presidential emergency board to examine the issues in a labor dispute over crew size , thus temporarily averting a strike that had been scheduled for tomorrow night .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: court strikes law

Predicted Title: court strikes law

Abstract: In 1925 , Congress said that the Supreme Court has discretion to refuse to hear cases , except when a state supreme court strikes down a federal law or upholds a state law that was challenged as violating the Constitution , or when a federal appeals court strikes down a state law or a federal appeals or district court strikes down a federal law , all on constitutional grounds .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: winners fly patrol

Predicted Title: win winners fly patrol

Abstract: Three former scholarship winners now fly border patrol for the U.S. Customs Service .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: cars roll line

Predicted Title: car roll line

Abstract: Most of the machinery in Toyota Motor Corp. 's $ 800 million auto plant in Georgetown , Ky . , came from Japan , for example , and Toyota spokesmen say that when the first cars roll off the assembly line this spring , 40 % of the total value will be imported .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: cray plows %

Predicted Title: rayray plows %

Abstract: Cray plows more than 15 % of its revenues back into R & D , a very high percentage even for high-tech companies .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: dole attacking bush

Predicted Title: effle attacking congress

Abstract: But Mr. Dole has so far spent much of his campaign attacking Vice President Bush for , among other things , having the effrontery to attack Congress .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: intention examining records

Predicted Title: seccerns examining records

Abstract: Prompted by news reports of alleged misappropriation of funds , the SEC visited the firm 's New York headquarters last week with the intention of examining business records .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: scientists examined members

Predicted Title: members examined members

Abstract: In Kindred 1002 alone , the scientists have examined 350 members .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: drexel poured resources

Predicted Title: rexrexel poured resources

Abstract: In recent years , Drexel has poured far more resources into areas such as high-yield , `` junk '' bonds , merger and acquisition advice and mortgage-backed securities .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: parties strike arrangements

Predicted Title: part strike arrangements

Abstract: He said that parties looking to strike joint venture arrangements with Texaco `` want to know who they 're going to be dealing with in the future . ''





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: you plant it

Predicted Title: you plant it

Abstract: You have to plant it when it 's small , and wait for it to grow. '





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: parties strike arrangements

Predicted Title: part strike arrangements

Abstract: He said that parties looking to strike joint venture arrangements with Texaco `` want to know who they 're going to be dealing with in the future . ''





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: parties strike arrangements

Predicted Title: part strike arrangements

Abstract: He said that parties looking to strike joint venture arrangements with Texaco `` want to know who they 're going to be dealing with in the future . ''





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: that flooded offices

Predicted Title: that flooded offices

Abstract: But Cameloot was just another loser among the entries that flooded the offices of Circus Circus Enterprises Inc. during a contest to name its planned $ 290 million castle-theme resort .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: houses destroyed homes

Predicted Title: thatChris destroyed houses

Abstract: Tropical storm Chris spawned tornadoes that damaged houses , destroyed mobile homes and caused at least one death in the Carolinas before gradually weakening .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: it eats soul

Predicted Title: it eats soul

Abstract: It just eats your soul . ''





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: opening grabbed gold

Predicted Title: that grabbed gold

Abstract: `` A strong opening in platinum coupled with a weak dollar just grabbed gold by the horns and took it on up , '' he said .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: effectiveness dissolve gallstones

Predicted Title: drug dissolve gallstones

Abstract: But some physicians questioned the effectiveness of the drug , designed to dissolve gallstones , and how widely it might be used .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: holmes target acquisitions

Predicted Title: court target acquisitions

Abstract: Mr. Story thinks Mr. Holmes a Court will soon target acquisitions .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: skaters pump revenue

Predicted Title: kers pump revenue

Abstract: Besides if Mikhail Gorbachev wants to spend all those resources so the East bloc 's skaters and lugers can pump revenue into ABC , it 's fine with us .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: poles missed message

Predicted Title: p missed message

Abstract: Few Poles missed that message in his tributes to their leader .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: pittenger fill post

Predicted Title: itittenger fill post

Abstract: Baaron Pittenger , a long-time USOC aide who had served as acting executive director after Mr. Miller left , was named to fill the vacant post until year end , when Mr. Schiller 's contract expires .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: advertiser targeting california

Predicted Title: ad targeting California

Abstract: Olds is n't the only advertiser targeting Southern California with such campaigns .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: federal absorb thrifts

Predicted Title: f absorb thrifts

Abstract: First Federal was then to absorb several other troubled thrifts in federally assisted mergers , as part of the Federal Savings and Loan Insurance Corp. 's so-called Southwest Plan to rescue the many ailing savings and loans based in the region .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: lebanese drink coffee

Predicted Title: people drink tea

Abstract: A Middle Eastern analyst says Lebanese usually drink coffee at such occasions ; Palestinians drink tea .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: poles missed message

Predicted Title: p missed message

Abstract: Few Poles missed that message in his tributes to their leader .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: reagan examine creation

Predicted Title: boardagan examine issues

Abstract: CNW Corp . , the parent of the Chicago & North Western Transportation Co . , said President Reagan has ordered the creation of a presidential emergency board to examine the issues in a labor dispute over crew size , thus temporarily averting a strike that had been scheduled for tomorrow night .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: it targeted center

Predicted Title: it targeted center

Abstract: Before midnight , it had targeted the National Aeronautics and Space Administration Ames Research Center in California 's Silicon Valley , as well as the University of Pittsburgh and the Los Alamos National Laboratory in New Mexico .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: which lend themselves

Predicted Title: ityo press for projects

Abstract: Tokyo is using its overvalued yen to press for big state projects -- which lend themselves to large business contracts .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: stern played harp

Predicted Title: bull played harp

Abstract: Karen Stern has played the harp surrounded by ice sculptures at a restaurant and as part of the display for a three-foot-high cake at a pastry competition .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: reservationists strike corp.

Predicted Title: workers strike Corp.

Abstract: The feisty , 62-year-old director of the airline division of the Teamsters union wants 4 , 300 Pan Am reservationists , ticket agents , and office and other workers to strike Pan Am Corp. after a cooling-off period that ends Sunday .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: it kills parasite

Predicted Title: it kills parasite

Abstract: It kills the parasite residing in red blood cells and has few side effects , SmithKline said .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: time eat lot

Predicted Title: he eat lot

Abstract: Indiana Rep. Andrew Jacobs , a Democrat who 's neutral in the presidential-nomination contest , says Mr. Gephardt reminds him of a time in his life when he had to eat a lot of dinners at a drive-in restaurant .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: report lends credence

Predicted Title: it ravage cells

Abstract: But the report , published in today 's edition of Science , lends credence to a body of opinion that HIV-2 in some cases may not ravage T-4 cells -- immune sentries of the bloodstream -- as much as it harms cells of the brain or bowel , said Dr. Levy in an interview .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: dole attacking bush

Predicted Title: effle attacking congress

Abstract: But Mr. Dole has so far spent much of his campaign attacking Vice President Bush for , among other things , having the effrontery to attack Congress .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: industry stepped efforts

Predicted Title: government stepped efforts

Abstract: Both the government and private industry have stepped up efforts to recruit back to Korea scientists who were trained and then stayed abroad .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: romania kicking ambassador

Predicted Title: re kicking ambassador

Abstract: While Mr. Gorbachev preached togetherness yesterday , Romania was kicking Hungary 's ambassador out of the country .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: efforts cool economy

Predicted Title: eff cool economy

Abstract: The Bank of England added that it is `` too early to be sure '' whether the British government 's recent efforts to cool down Britain 's overheated economy `` will prove sufficient to restore the economy to a sustainable path . ''





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: it targeted center

Predicted Title: it targeted center

Abstract: Before midnight , it had targeted the National Aeronautics and Space Administration Ames Research Center in California 's Silicon Valley , as well as the University of Pittsburgh and the Los Alamos National Laboratory in New Mexico .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: italy dragging feet

Predicted Title: iItaly dragging feet

Abstract: An Aer Lingus spokesman claimed Italy is dragging its feet because Alitalia plans to start flights between Milan and Manchester this month .





Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

True Title: economy absorb them

Predicted Title: econom absorb them

Abstract: But Korea 's booming economy can absorb them , economists say .



