# Setup

In [None]:
!pip install pandas pyarrow
!pip install transformers
!pip install SentencePiece
!pip install transformers
import os
import sys
import pandas as pd
from transformers import pipeline
from transformers import pipeline, set_seed
from transformers import T5Tokenizer, TFT5Model, TFT5ForConditionalGeneration
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel
import numpy as np
from sklearn.model_selection import train_test_split

!pip install --upgrade simplet5
from simplet5 import SimpleT5

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


INFO:pytorch_lightning.utilities.seed:Global seed set to 42


In [None]:
# Load the abstracts table from Google Drive (this cell only works inside Colab).
from google.colab import drive

# Location of the gzip-compressed parquet file on the mounted drive.
ABSTRACTS_PARQUET = "/content/gdrive/MyDrive/w266_final_julia/df_abstract.parquet.gzip"

# Mount the drive first so the path above becomes readable.
drive.mount('/content/gdrive')

abstracts = pd.read_parquet(ABSTRACTS_PARQUET)

# Last expression of the cell: render the frame as the cell output.
abstracts

Mounted at /content/gdrive


Unnamed: 0,file_key,abstract
0,0/links.,
1,a9012998,Excavations by Dr. Alan Simmons and his collea...
2,a9012376,This study will investigate organic matter deg...
3,a9012258,Professor Hull is engaged in an investigation ...
4,a9012685,"Because of recent scientific developments, phi..."
...,...,...
134611,3580.txt,"BCS-0123580 \tPI: Clifton, C. \tA three-day ..."
134612,a0123450,This proposal was submitted in response to the...
134613,a0123044,The R-Cubed program is providing scholarships ...
134614,a0123460,This proposal was submitted in response to the...


# Baseline 1: 
### using pre-trained T5-base Model

In [None]:
# Use the abstract stored under index label 3 as the running example.
abs_sample = abstracts.loc[3, 'abstract']

# For this particular abstract the first 205 characters are its opening sentence.
first_sentence = abs_sample[:205]

# Display the opening sentence as the cell output.
first_sentence

'Professor Hull is engaged in an investigation of two separateprojects:  the first concerns criteria for selection of taxonomicprinciples; the second concerns the processes by which scienceitself develops. '

In [None]:
# Display the full text of the sample abstract for comparison with its opening sentence.
abs_sample

'Professor Hull is engaged in an investigation of two separateprojects:  the first concerns criteria for selection of taxonomicprinciples; the second concerns the processes by which scienceitself develops.  While seemingly unrelated, Professor Hull isusing the first study as a case for exemplifying his argumentsconcerning the processes of scientific development.Although the distance between alternative principles of biologicalclassification and anything that might be considered "evidence" forthem is great, empirical considerations can be brought to bear onthe decision to choose one set of principles of classification overothers.  Professor Hull argues that the taxonomic principles thatresult in classifications that facilitate the discovery of naturalregularities are preferable to those that do not facilitate suchdiscoveries.  Dr Hull is examining the recent literature inevolutionary biology to see what sorts of classification aid eitherphylogenetic reconstruction or the discovery of re

In [None]:
# Checkpoint name shared by the tokenizer and the model so the two always match.
model_name = 't5-base'
# Tokenizer that converts text to and from T5 token ids.
t5_tokenizer = T5Tokenizer.from_pretrained(model_name)
# TensorFlow T5 model with the text-generation head, loaded from the same checkpoint.
t5_model = TFT5ForConditionalGeneration.from_pretrained(model_name)
# To try the smaller checkpoint instead, set model_name = 't5-small' above.
# Uncomment to inspect the loaded model:
# t5_model.summary()

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [None]:
# NOTE(review): ARTICLE is not read by any visible cell; kept in case a later cell uses it.
ARTICLE = first_sentence

# "summarize: " is the task prefix that asks T5 for a summary of the text that follows.
t5_input_text = "summarize: " + abs_sample

# Truncate explicitly: the tokenizer warns that it must not be relied on to cut
# inputs down to the 512-token limit of t5-base automatically.
t5_inputs = t5_tokenizer([t5_input_text],
                         truncation=True,
                         max_length=512,
                         return_tensors='tf')

In [None]:
# Beam-search settings for the baseline summary.
generation_kwargs = dict(
    num_beams=7,             # number of beams explored in parallel
    no_repeat_ngram_size=2,  # never repeat the same bigram
    min_length=100,          # lower bound on generated tokens
    max_length=300,          # upper bound on generated tokens
)

t5_summary_ids = t5_model.generate(t5_inputs['input_ids'], **generation_kwargs)

# Turn every generated id sequence back into text, dropping special tokens and
# leaving tokenization spacing untouched.
decoded_summaries = t5_tokenizer.batch_decode(
    t5_summary_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(decoded_summaries)

['professor Hull is examining the processes by which science develops . he argues that taxonomic principles that facilitate discovery of naturalregularities are preferable to those that do not facilitate suchdiscoveries. the results of these investigations will inturn be used to support the contention that the use that scientists make of each others work is central to the ongoing process of scientific development. in april, the u.s. department of the environment and the university of california published its report on the']


# Baseline 2: 
### using simplet5 to train on our data

Things to think about: Source and Target are necessary for training the T5 model
- Source: 1 Sentence | Target: Abstract ~ generation task
- Source: entire essay | Target: Abstract ~ summarization task

For this task I am going to isolate the opening of each abstract (its first two period-delimited segments) as the source and attempt to predict the rest of the abstract (target)

#### Set up the data into X and Y

In [None]:
# Report how many raw rows an 80%/20% split would put on each side.
# These counts are for the unfiltered table; rows dropped later for being too
# short are still included here.
n_abstracts = len(abstracts)

# Whole-row counts: the test share is rounded up and train gets the remainder,
# so the two numbers always add up to the total.
n_test = int(np.ceil(n_abstracts * 0.2))
n_train = n_abstracts - n_test

print("number of abstracts: {:,}".format(n_abstracts))
print("train test split of 80%/20%")
print(f" {n_train} abstracts in train")
print(f" {n_test} abstracts in test")

number of abstracts: 134,616
train test split of 80%/20%
 107692.8 abstracts in train
 26923.2 abstracts in test


In [None]:
# Keep only abstracts that carry real text: skip empty strings and the
# "Not Available" placeholder (TODO: move this filter into pre-processing).
abstract_array = []
for text in abstracts['abstract']:
    if len(text) == 0 or text == "Not Available":
        continue
    abstract_array.append(text)

In [None]:
def split_source_target(abstract, n_source=2):
    """Split one abstract into a (source, target) training pair.

    The abstract is cut at every '.', the pieces are stripped and empty pieces
    are discarded. The first ``n_source`` pieces form the source prompt
    (prefixed with "generate text: ") and the remaining pieces form the target.
    Pieces are re-joined with ". " so the delimiter that the split removed is
    put back instead of gluing neighbouring words together.

    Args:
        abstract: Full abstract text.
        n_source: Number of leading pieces used for the source prompt.

    Returns:
        A ``(source, target)`` tuple of strings, or ``None`` when the abstract
        has no pieces left over for the target.
    """
    fragments = [frag for frag in map(str.strip, abstract.split('.')) if frag]
    if len(fragments) <= n_source:
        return None
    source = "generate text: " + ". ".join(fragments[:n_source]) + "."
    target = ". ".join(fragments[n_source:]) + "."
    return source, target


# NOTE(review): `first_sentence` was a single string in an earlier cell; the
# name is reused here for the list of source prompts, as in the original cell.
first_sentence = []
abstract_all = []
for abstract in abstracts['abstract']:
    pair = split_source_target(abstract)
    if pair is None:
        # Too short to provide both a source and a target.
        continue
    first_sentence.append(pair[0])
    abstract_all.append(pair[1])

all_data = pd.DataFrame({'first_sentence': first_sentence, 'abstract': abstract_all})
all_data

Unnamed: 0,first_sentence,abstract
0,generate text: Excavations by DrAlan Simmons a...,"Datedto the ninth millennium BC, this stratifi..."
1,generate text: This study will investigate org...,The field study will take place in theproduct...
2,generate text: Professor Hull is engaged in an...,Although the distance between alternative prin...
3,generate text: Because of recent scientific de...,"Under a previous NSF grant, Dr Eells wrote a b..."
4,generate text: Microzooplankton are a morpholo...,Incontrast to visual predation on microzooplan...
...,...,...
125412,"generate text: BCS-0123580 \tPI: Clifton, CA t...",The workshop is designed to encourage research...
125413,generate text: This proposal was submitted in ...,Large band offsets ofGaN/AlGaN or InGaN/AlGaN ...
125414,generate text: The R-Cubed program is providin...,"Professional presentations, meetings, and soci..."
125415,generate text: This proposal was submitted in ...,This fiber is predicted to have an unrivaledde...


In [None]:
# Hold out 20% of the (source, target) pairs for evaluation; the fixed
# random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    all_data['first_sentence'],
    all_data['abstract'],
    test_size=0.2,
    random_state=42,
)

# The training call expects frames with exactly two columns, source_text and
# target_text. drop=True renumbers the rows without adding the old shuffled
# index as a third "index" column.
train_df = pd.DataFrame({'source_text': X_train, 'target_text': y_train}).reset_index(drop=True)
test_df = pd.DataFrame({'source_text': X_test, 'target_text': y_test}).reset_index(drop=True)

print(f"len train_df {len(train_df)}")
print(f"len test df {len(test_df)}")

len train_df 100333
len test df 25084


In [None]:
# Settings for the fine-tuning run, gathered in one place.
# Optional arguments left at their defaults: early_stopping_patience_epochs, precision.
# NOTE(review): the training log reports a GPU that is available but unused;
# confirm that CPU-only training is intentional.
train_settings = dict(
    source_max_token_len=500,
    target_max_token_len=1500,
    batch_size=8,
    max_epochs=5,
    use_gpu=False,
)

# Create the wrapper and load the pre-trained checkpoint
# (per the original note, t5, mt5 and byT5 models are supported).
model = SimpleT5()
model.from_pretrained("t5", "t5-base")

# Fine-tune on the first 5,000 training rows and evaluate on the first 100 test
# rows; both frames need the columns source_text and target_text.
model.train(
    train_df=train_df.iloc[:5000],
    eval_df=test_df.iloc[:100],
    **train_settings,
)

# Load the trained T5 model from disk.
# NOTE(review): verify that "trained/base2model/" exists; presumably it is a
# checkpoint directory copied from the training outputs.
model.load_model("t5", "trained/base2model/", use_gpu=False)

INFO:pytorch_lightning.utilities.distributed:GPU available: True, used: False
INFO:pytorch_lightning.utilities.distributed:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.distributed:IPU available: False, using: 0 IPUs
  "GPU available but not used. Set the gpus flag in your trainer `Trainer(gpus=1)` or script `--gpus=1`."
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.seed:Global seed set to 42


Training: 0it [00:00, ?it/s]

In [None]:
 # predict
# Every fine-tuning source starts with "generate text: ", so the prompt must
# carry the same task prefix to match what the model was trained on.
model.predict("generate text: This is what we want to do ")

# Training a T5 Tokenizer