# Data Setup

In [None]:
!pip install pandas pyarrow
!pip install transformers
!pip install SentencePiece
!pip install transformers
!pip install simplet5

import os
import sys
import pandas as pd
from transformers import pipeline
from transformers import pipeline, set_seed
from transformers import T5Tokenizer, TFT5Model, TFT5ForConditionalGeneration
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel
import numpy as np
from sklearn.model_selection import train_test_split
from simplet5 import SimpleT5
import re

In [3]:
# Pull Data from Google Drive
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
abstracts = pd.read_parquet("/content/gdrive/MyDrive/w266_final_julia/df_abstract.parquet.gzip")
abstracts

Mounted at /content/gdrive


Unnamed: 0,file_key,abstract
0,0/links.,
1,a9012998,Excavations by Dr. Alan Simmons and his collea...
2,a9012376,This study will investigate organic matter deg...
3,a9012258,Professor Hull is engaged in an investigation ...
4,a9012685,"Because of recent scientific developments, phi..."
...,...,...
134611,3580.txt,"BCS-0123580 \tPI: Clifton, C. \tA three-day ..."
134612,a0123450,This proposal was submitted in response to the...
134613,a0123044,The R-Cubed program is providing scholarships ...
134614,a0123460,This proposal was submitted in response to the...


In [4]:
# If we were to use all of our data
print("number of abstracts: {:,}".format(len(abstracts)))
print(f"train test split of 80%/20%")
print(f" {len(abstracts) * 0.8} abstracts in train")
print(f" {len(abstracts) * 0.2} abstracts in test")

number of abstracts: 134,616
train test split of 80%/20%
 107692.8 abstracts in train
 26923.2 abstracts in test


In [5]:
def remove_special_characters(text):
  """
    function was referenced from this medium article:
    https://towardsdatascience.com/nlp-building-text-cleanup-and-preprocessing-pipeline-eba4095245a0
    
    It accepts text, and uses a defined pattern to parse the text and 
    remove special characters
  """
    # define the pattern to keep
    pat = r'[^a-zA-z0-9.,!?/:;\"\'\s]' 
    return re.sub(pat, '', text)
 
# call function
for abstract in abstracts:
  remove_special_characters(abstract)

In [6]:
# Drops empty abstracts
abstract_array = abstracts['abstract']
abstract_array = [abstract for abstract in abstract_array if len(abstract) > 0 and abstract != "Not Available"]
abstracts_new = pd.DataFrame({'abstract':[]})
abstracts_new['abstract'] = abstract_array
abstracts_new = abstracts_new.sample(frac=1, random_state=1).reset_index()
abstracts_new

Unnamed: 0,index,abstract
0,19034,The focus of the research is on the marine nat...
1,37411,Fluids that rapidly and reversibly change to s...
2,25947,"Abstract ATM-9423233 Ou, Hsien W. Columbia ..."
3,39542,9303456 Cole Newly isolated mutations that c...
4,2750,This research includes studies of: nuclear st...
...,...,...
128833,98047,It is a common intuition that one of the best ...
128834,5192,The uniqueness of this program lies in a colla...
128835,77708,9705735 Burrows This project plans to deve...
128836,98539,Mathematical Sciences Fellowship


In [7]:
def slice_data(slice_val):
  """ 
    function accepts a slice_val which is the percentage of our 
    entire dataset that we want to use for training/testing. 
    shuffled_abstracts = abstracts_new.sample(frac=1, random_state=1)
  """
  train_cap = int(len(abstracts_new) * slice_val)
  train_df=abstracts_new[:train_cap]
  test_df=abstracts_new[train_cap: (train_cap + int((train_cap*0.2)))]

  print(f"Length of all data: {len(abstracts_new)}")
  print(f"Length of training set: {len(train_df)}")
  print(f"Length of test set: {len(test_df)}")
  return train_df, test_df

In [8]:
def prep_data(df):
  """
    function accepts either a train or a test dataframe and splits the data
    into the first sentence of an abstract, and the rest of the abstract as
    source and target variables.
  """
  abstract_all = []
  first_sentence = []

  for abstract in df['abstract']:
    split = [x for x in map(str.strip, abstract.split('.')) if x]
    if len(split) > 2:
      # Take the first two sentences of the abstract
      x = "summarize: " + split[0] + split[1]
      # Take remaining sentences as the target output
      y = " ".join(split[2:len(split)])
      first_sentence.append(x)
      abstract_all.append(y)
  all_data = pd.DataFrame({'first_sentence': first_sentence, 'abstract': abstract_all})
  return all_data

In [9]:
# Split dataset (using 10% of our total data)
train_df, test_df = slice_data(0.1)
train_df_prepped, test_df_prepped = prep_data(train_df), prep_data(test_df)

Length of all data: 128838
Length of training set: 12883
Length of test set: 2576


In [10]:
# Calculate the average length of first few sentences, and abstract
first = []
abs = []
for i in range(len(train_df_prepped)):
  first.append(len(train_df_prepped['first_sentence'][i]))
  abs.append(len(train_df_prepped['abstract'][i]))

print("------------------------------------------")
print(f"Average Train First Sentence Length {np.mean(first)}")
print(f"Average Abstract Length {np.mean(abs)}")

------------------------------------------
Average Train First Sentence Length 341.6133748801534
Average Abstract Length 1175.8358900607222


# Baseline: 
Here we will be using the pre-trained T5-base Model to generate an abstract when given the first few sentences of the abstract as the source text. We will use the "summarize: " prompt here, adjusting the max and min lengths of sequences to match our desired output.

In [None]:
model_name = 't5-base'
t5_tokenizer = T5Tokenizer.from_pretrained(model_name)
t5_model = TFT5ForConditionalGeneration.from_pretrained(model_name)
t5_model.summary()

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/851M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


Model: "tft5_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 shared (TFSharedEmbeddings)  multiple                 24674304  
                                                                 
 encoder (TFT5MainLayer)     multiple                  84954240  
                                                                 
 decoder (TFT5MainLayer)     multiple                  113275008 
                                                                 
Total params: 222,903,552
Trainable params: 222,903,552
Non-trainable params: 0
_________________________________________________________________


In [None]:
def get_baseline_predictions(batch_data):
  inputs_to_tokenize = np.array(batch_data['first_sentence'])
  predictions = []
  index = 0
  for first_sentence in inputs_to_tokenize:
    index += 1
    print(f"predicting sentence: {index}")
    t5_inputs = t5_tokenizer(first_sentence, 
                            max_length=325, 
                            truncation=True, 
                            return_tensors="tf")
    summary_ids = t5_model.generate(t5_inputs['input_ids'],
                                    num_beams=7, 
                                    no_repeat_ngram_size=2, 
                                    min_length=325, 
                                    max_length=900)
    result = t5_tokenizer.batch_decode(summary_ids, 
                                      skip_special_tokens=True, 
                                      clean_up_tokenization_spaces=False)[0]
    predictions.append(result)
  return predictions

### Run Baseline Model using Batching

In [None]:
# Point to where we want to store our predictions
cd /content/gdrive/MyDrive/w266_final_julia/results_data

/content/gdrive/MyDrive/w266_final_julia/results_data


In [None]:
# 200 test samples
num_batches = 40
start = 0
end = 5
predictions = []
for i in range(4, num_batches):
  batch = test_df_prepped[start:end]
  preds = get_baseline_predictions(batch)
  predictions += preds
  start += 5
  end += 5
  preds_chunk = pd.DataFrame({'preds': preds})
  file_name = f'preds_{i}.parquet.gzip'
  preds_chunk.to_parquet(file_name, compression='gzip')  

In [None]:
# Push the actuals to Drive so we can pull it into the Results notebook 
# to calculate Rouge
test_df_prepped[0:200].to_parquet('test_df.parquet.gzip', compression='gzip')  

Lets peak into the results

In [None]:
print(f"source: {np.array(test_df_prepped['first_sentence'])[0]}")
print("---------------------------------------------------------")
print(f"target: {np.array(test_df_prepped['abstract'])[0]}")
print("---------------------------------------------------------")
print(f"predicted_target: {predictions[0]}")

source: summarize: This award is to provide travel fellowships to young Americanscientists to participate in an international symposium on avianendocrinology to be held in Edinburgh, Scotland on September14-17th, 1992State-of-the-art lectures will be held on a broadspectrum of topics from molecular to behavioral endocrinology
---------------------------------------------------------
target: Theemphasis of the symposium is on the integration of molecularapproaches with the organismal level of analysis In addition, theforum is organized to encourage and promote the interchange ofideas between the young scientists and well-established, seniorinvestigators from all over the world Indeed a goal is to bringtogether the researchers that are developing molecular probes withthose that can provide insights as to potential experimentalapplications of these tools Some very important initialdiscoveries have been made using avian model systems because it isin these species that molecular, cellular, 

# Simple T5 with Training

Source and Target are necessary for training the T5 model
- Source: First few Sentences || Target: Abstract ~ generation task

#### Set up the data into X and Y

In [11]:
training_cap = 1000
train_df_subset = train_df_prepped[0:training_cap]
train_df = train_df_subset[0:int(len(train_df_subset)*0.98)]
# Take 2% of the training data for validation in the model
val_df = train_df_subset[int(len(train_df_subset)*0.98): int(len(train_df_subset))]
test_df = test_df_prepped[0:200]
print(f"Length of train df: {len(train_df)}")
print(f"Length of val df: {len(val_df)}")
print(f"Length of test df: {len(test_df)}")
train_df = train_df.rename(columns={"first_sentence": "source_text", "abstract": "target_text"})
val_df = val_df.rename(columns={"first_sentence": "source_text", "abstract": "target_text"})
test_df = test_df.rename(columns={"first_sentence": "source_text", "abstract": "target_text"})

Length of train df: 980
Length of val df: 20
Length of test df: 200


## Load and Run Model

In [None]:
"""
Definitions for the hyperparameters
"""
# source_max_token_len (int, optional): max token length of source text. Defaults to 512.
# target_max_token_len (int, optional): max token length of target text. Defaults to 512.
# batch_size (int, optional): batch size. Defaults to 8.
# max_epochs (int, optional): max number of epochs. Defaults to 5.
# early_stopping_patience_epochs (int, optional): monitors val_loss on epoch end and stops training, if val_loss does not improve after the specied number of epochs. set 0 to disable early stopping. Defaults to 0 (disabled)
# precision (int, optional): sets precision training - Double precision (64), full precision (32) or half precision (16). Defaults to 32.

In [None]:
!cd /content/gdrive/MyDrive/w266_final_julia/

In [None]:
# instantiate
model = SimpleT5()
# # load (supports t5, mt5, byT5 models)
model.from_pretrained("t5","t5-base")
# train
model_num = 7
model_folder = f"/content/gdrive/MyDrive/w266_final_julia/T5_Shuffled_{model_num}/"
model.train(train_df=train_df, # pandas dataframe with 2 columns: source_text & target_text
            eval_df=val_df, # pandas dataframe with 2 columns: source_text & target_text
            source_max_token_len = 300, 
            target_max_token_len = 900,
            batch_size = 3,
            max_epochs = 10,
            use_gpu = True,
            outputdir = model_folder,
            precision = 32
            )

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.distributed:GPU available: True, used: True
INFO:pytorch_lightning.utilities.distributed:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.distributed:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(
INFO:pytorch_lightning.utilities.seed:Global seed set to 42
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [None]:
!ls /content/gdrive/MyDrive/w266_final_julia/T5_Shuffled_7/

simplet5-epoch-0-train-loss-4.4668-val-loss-3.9443
simplet5-epoch-1-train-loss-4.0206-val-loss-3.8505
simplet5-epoch-2-train-loss-3.8432-val-loss-3.8074
simplet5-epoch-3-train-loss-3.6938-val-loss-3.7924
simplet5-epoch-4-train-loss-3.5705-val-loss-3.7831
simplet5-epoch-5-train-loss-3.4646-val-loss-3.7784
simplet5-epoch-6-train-loss-3.3557-val-loss-3.7892
simplet5-epoch-7-train-loss-3.252-val-loss-3.8024
simplet5-epoch-8-train-loss-3.1593-val-loss-3.8245
simplet5-epoch-9-train-loss-3.0623-val-loss-3.8484


In [None]:
best_epoch = "simplet5-epoch-5-train-loss-3.4646-val-loss-3.7784"
model_path = f"/content/gdrive/MyDrive/w266_final_julia/T5_Shuffled_7/{best_epoch}"
model.load_model("t5", model_path, use_gpu=True)

In [None]:
inputs = np.array(test_df['source_text'])
predictions = []
i = 0
for first_sentence in inputs:
  print(f"predicting sentence: {i}")
  i+=1
  # TODO Read on details of how this works
  prediction = model.predict(first_sentence)
  predictions.append(prediction)

preds_df = pd.DataFrame({'preds': predictions})
results_path = f"/content/gdrive/MyDrive/w266_final_julia/T5_Shuffled_Models/preds_simple_t5_{model_num}.parquet.gzip"
preds_df.to_parquet(results_path, compression='gzip') 

In [16]:
test_df.to_parquet('/content/gdrive/MyDrive/w266_final_julia/T5_Shuffled_Models/test_df.parquet.gzip', compression='gzip')  