### Imports

In [17]:
# !pip install transformers
# !pip install SentencePiece
# !pip install transformers
import os
import sys
import pandas as pd
from transformers import pipeline
from transformers import pipeline, set_seed
from transformers import T5Tokenizer, TFT5Model, TFT5ForConditionalGeneration
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel

## Load Data

In [2]:
my_root = "/home/jbobro/final_project/W266_Final_Project"

In [11]:
def pull_data(df_name):
    return pd.read_parquet(f"{my_root}/Processed_Data/{df_name}.parquet.gzip")

In [13]:
df_abstract = pull_data("df_abstract")
df_author = pull_data("df_author")
df_title = pull_data("df_title")
df_fid = pull_data("df_fid")
df_bow = pull_data("df_bow")

## GPT-2 Text Generation Model - Decoder Only
Can we generate a meaningful abstract given a sentence or two? 

1. Identify evaluation task (GLUE) 
2. Determine pre-training approaches
3. Determine pre-training data sets 
4. Determine fine-tuning approach

In [None]:
from transformers import pipeline, set_seed
generator = pipeline('text-generation', model='gpt2')
set_seed(42)
generator("Hello, I'm a language model,", max_length=30, num_return_sequences=5)

In [None]:
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel

gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_model = TFGPT2LMHeadModel.from_pretrained('gpt2')

In [None]:
abs_sample = df_abstract['abstract'][1]
first_sentence = abs_sample[20:397]

In [None]:
text_start = 'This award is for one year of support to continue work onsamples '
input_ids = gpt2_tokenizer.encode(text_start, return_tensors='tf')
input_ids

In [None]:
generated_text_outputs = gpt2_model.generate(
    input_ids, 
    max_length=300,
    num_return_sequences=1,
    no_repeat_ngram_size=3,
    repetition_penalty=1.5,
    top_p=0.92,
    temperature=.85,
    do_sample=True,
    top_k=125,
    early_stopping=True
)

#Clearly the raw GPT-2 model is not doing too well at predicting Scientific abstracts
for i, beam in enumerate(generated_text_outputs):
  print()
  print("{}: {}".format(i, gpt2_tokenizer.decode(beam, skip_special_tokens=True)))

## GPT-3

In [None]:
generator = pipeline('text-generation', model='EleutherAI/gpt-neo-1.3B')

In [None]:
abs_sample = df_abstract['abstract'][3]
first_sentence = abs_sample[20:397]

In [None]:
text = 'Who is Elon Musk'
abs_sample = df_abstract['abstract'][1
result = generator(text, max_length=100, do_sample=True, temperature=0.9)
result

## T5 model for text generation
*Transformer: type of neural network architectu*

How does T5 work? 
- T5 is trained on C4 -- Colossal Clean Crawled Corpus (obtained by scraping web pages and ignoring the markup from the HTML, hence causing lots of gibberish text as we see bellow, to flow through to the model 


**Reference Abstract: (Not fed into the model)**

"Recent advances in our understanding of minor biologicalfractionation of Ge/Si by oceanic diatoms suggest that Ge/Si)opal  variationsmeasured in late Pleistocene piston cores and Cenozoic  drill cores arerecording whole ocean (Ge/Si)seawater variations  driven by rapid and largeglacial to interglacial changes in  continental weathering intensity and riverfluxes to the sea. If  so, then it is clearly important to produce highresolution records  of (Ge/Si)opal in cores with well-established  18 O and 14Cstratigraphies across transitions to establish the shape and timing  of theoceanic Ge/Si response.." 

**Input Sentence:**

" Recent advances in our understanding of minor biologicalfractionation of Ge/Si by oceanic diatoms suggest that Ge/Si)opal  variationsmeasured in late Pleistocene piston cores and Cenozoic  drill cores arerecording whole ocean (Ge/Si)seawater variations  driven by rapid and largeglacial to interglacial changes in  continental weathering intensity and riverfluxes to the sea."

**Pre-Trained T5 model summarization**

"this project aims to produce high resolution records of (Ge/Si)opal in cores with well-established 18 O and 14Cstratigraphies across transitions . it is also necessary to carefully evaluate and remove the effects of ge/si fractionation in local records, says dr. j. floelich. this is aimed at accomplishingtwo major goals: (1) Produce highresolution records incores that contain abundant diatoms and forams in collaboration with others' measurements a -"

Notice the previously unseen output that matches between the input sentence, and the predicted output: 
- Original: If  so, then it is clearly important to produce highresolution records  of (Ge/Si)opal ..
- Predicted: Produce highresolution records incores that contain abundant diatoms and forams in collaboration with others' measurements



In [None]:
t5_model = TFT5ForConditionalGeneration.from_pretrained('t5-large')
t5_tokenizer = T5Tokenizer.from_pretrained('t5-large')
t5_model.summary()

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

2022-11-16 04:46:59.853424: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-11-16 04:46:59.853529: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-11-16 04:46:59.853561: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (instance-3): /proc/driver/nvidia/version does not exist
2022-11-16 04:46:59.955307: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-16 04:47:00.433167: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, b

In [None]:
abs_sample = df_abstract['abstract'][3]
first_sentence = abs_sample[20:397]

In [None]:
ARTICLE = first_sentence
t5_input_text = "summarize: " + abs_sample
t5_inputs = t5_tokenizer([t5_input_text], return_tensors='tf')

In [None]:
t5_summary_ids = t5_model.generate(t5_inputs['input_ids'],
                                   num_beams=7, 
                                   no_repeat_ngram_size=2, 
                                   min_length=400, 
                                   max_length=600)

print([t5_tokenizer.decode(g, skip_special_tokens=True,
                           clean_up_tokenization_spaces=False)
       for g in t5_summary_ids])