## Imports

In [1]:
import pandas as pd
import numpy as np
from google.colab import drive

!pip install -q rouge_score
!pip install -q evaluate

import evaluate
import rouge_score
# Load the ROUGE metric once; the evaluation functions in this notebook call
# rouge.compute(...) on this shared object.
rouge = evaluate.load('rouge')

  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 72 kB 769 kB/s 
[K     |████████████████████████████████| 451 kB 9.2 MB/s 
[K     |████████████████████████████████| 132 kB 6.5 MB/s 
[K     |████████████████████████████████| 182 kB 72.2 MB/s 
[K     |████████████████████████████████| 212 kB 90.1 MB/s 
[K     |████████████████████████████████| 127 kB 65.0 MB/s 
[?25h

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

### Functions

In [12]:
def evaluate_baseline(results_df, name, type_abstract, *, num_beams=7,
                      no_repeat_ngram_size=2, min_length=325, max_length=900,
                      metric=None):
  """Score baseline predictions against one reference column with ROUGE.

  Args:
    results_df: DataFrame holding a 'predictions' column and the reference
      column named by `type_abstract`.
    name: Label written to the 'model_name' column of the result.
    type_abstract: Name of the column in `results_df` used as references.
    num_beams, no_repeat_ngram_size, min_length, max_length: Generation
      settings recorded next to the scores. They are only written to the
      output row and do not influence scoring; the defaults are the values
      this function previously hard-coded.
    metric: Optional object exposing `compute(predictions=..., references=...)`
      that returns a mapping with the keys 'rouge1', 'rouge2', 'rougeL' and
      'rougeLsum'. When omitted, the notebook-level `rouge` metric is used.

  Returns:
    A one-row DataFrame with the label, the generation settings and the four
    ROUGE scores.

  Raises:
    KeyError: if 'predictions' or `type_abstract` is not a column of
      `results_df`.
  """
  # Resolve the global lazily so callers (and tests) can inject another scorer.
  scorer = rouge if metric is None else metric

  # Plain Python lists of strings are the documented input format of compute().
  scores = scorer.compute(predictions=results_df['predictions'].tolist(),
                          references=results_df[type_abstract].tolist())

  return pd.DataFrame({'model_name': [name],
                       'num_beams': [num_beams],
                       'no_repeat_ngram_size': [no_repeat_ngram_size],
                       'min_length': [min_length],
                       'max_length': [max_length],
                       'rouge_1': [scores['rouge1']],
                       'rouge_2': [scores['rouge2']],
                       'rouge_L': [scores['rougeL']],
                       'rouge_L_sum': [scores['rougeLsum']]
                       })

In [3]:
def evaluate_model_T5(results_df, type_abstract, batch_size, max_epochs, precision,
                      *, model_name="T5_Simple_Model", source_max_token_len=325,
                      target_max_token_len=900, metric=None):
  """Score one fine-tuned T5 run against a reference column with ROUGE.

  Args:
    results_df: DataFrame holding a 'predictions' column and the reference
      column named by `type_abstract`.
    type_abstract: Name of the column in `results_df` used as references.
    batch_size, max_epochs, precision: Training settings of the run. They are
      only recorded in the output row and do not influence scoring.
    model_name, source_max_token_len, target_max_token_len: Further labels
      recorded in the output row; the defaults are the values this function
      previously hard-coded.
    metric: Optional object exposing `compute(predictions=..., references=...)`
      that returns a mapping with the keys 'rouge1', 'rouge2', 'rougeL' and
      'rougeLsum'. When omitted, the notebook-level `rouge` metric is used.

  Returns:
    A one-row DataFrame with the run settings and the four ROUGE scores.

  Raises:
    KeyError: if 'predictions' or `type_abstract` is not a column of
      `results_df`.
  """
  # Resolve the global lazily so callers (and tests) can inject another scorer.
  scorer = rouge if metric is None else metric

  # Plain Python lists of strings are the documented input format of compute().
  scores = scorer.compute(predictions=results_df['predictions'].tolist(),
                          references=results_df[type_abstract].tolist())

  return pd.DataFrame({'model_name': [model_name],
                       'source_max_token_len': [source_max_token_len],
                       'target_max_token_len': [target_max_token_len],
                       'batch_size': [batch_size],
                       'max_epochs': [max_epochs],
                       'precision': [precision],
                       'rouge_1': [scores['rouge1']],
                       'rouge_2': [scores['rouge2']],
                       'rouge_L': [scores['rougeL']],
                       'rouge_L_sum': [scores['rougeLsum']]
                       })

In [4]:
def create_compo_colomn(df, prefix="summarize:"):
  """Add a 'composite_abstract' column: prompt text (task prefix removed) + abstract.

  The models are scored on the whole abstract, not only the part that follows
  the prompt sentence(s), so the reference is rebuilt by joining both pieces.

  Args:
    df: DataFrame with string columns 'first_sentence' and 'abstract'. It is
      modified in place (the new column is added) and also returned.
    prefix: Task prefix to drop from the start of 'first_sentence'. It is only
      removed when actually present, so rows without it keep their leading
      characters (the previous version always cut the first 10 characters).

  Returns:
    The same DataFrame object, with 'composite_abstract' added.
  """
  def _without_prefix(text):
    # Strip exactly the prefix; whatever follows it (e.g. a space) is kept.
    return text[len(prefix):] if text.startswith(prefix) else text

  # The two pieces are joined with no separator, exactly as before, so that
  # reference strings (and therefore scores) for prefixed rows do not change.
  df['composite_abstract'] = [
      _without_prefix(first) + rest
      for first, rest in zip(df['first_sentence'], df['abstract'])
  ]
  return df

## Baseline Results

In [5]:
# Pull Data from Google Drive
drive.mount('/content/gdrive')
# NOTE(review): `!cd` runs in a throwaway subshell, so it does not change the
# notebook's working directory (`%cd` would). The baseline loading cell uses
# absolute paths, so nothing depends on this line.
!cd /content/gdrive/MyDrive/w266_final_julia/results_data

Mounted at /content/gdrive


In [6]:
# Build the baseline frame: the test split plus the predictions that were
# saved in `num_batches` separate parquet files, then add the composite reference.
num_batches = 40
batch_paths = [
    f"/content/gdrive/MyDrive/w266_final_julia/results_data/preds_{batch_idx}.parquet.gzip"
    for batch_idx in range(num_batches)
]

# Concatenate the per-batch predictions in batch order.
baseline_results = []
for batch_path in batch_paths:
  baseline_results.extend(pd.read_parquet(batch_path)['preds'].tolist())

baseline_results_df = pd.read_parquet('/content/gdrive/MyDrive/w266_final_julia/results_data/test_df.parquet.gzip')
baseline_results_df['predictions'] = baseline_results
baseline_results_df = create_compo_colomn(baseline_results_df)

In [7]:
# Preview the assembled baseline frame (prompt, abstract, predictions, composite reference).
baseline_results_df

Unnamed: 0,first_sentence,abstract,predictions,composite_abstract
0,summarize: This award is to provide travel fel...,Theemphasis of the symposium is on the integra...,a symposium on avianendocrinology to be held i...,This award is to provide travel fellowships t...
1,"summarize: In this project, the PI's team will...",These will then feed in to studies of the ocea...,"in this project, the PI's team will study the ...","In this project, the PI's team will study the..."
2,summarize: This project seeks to fully analyze...,The PI's previous work indicatesthat substanti...,this project seeks to fully analyze and interp...,This project seeks to fully analyze and inter...
3,summarize: The carbon-isotopic compositions of...,""" In cold, high latitude waters, both suspend...",the carbon-isotopic compositions of long-chain...,The carbon-isotopic compositions of long-chai...
4,summarize: This workshop will bring together e...,The workshop will provide input to emerging HP...,this workshop will bring together experts from...,This workshop will bring together experts fro...
...,...,...,...,...
195,summarize: A fundamental problem in eucaryotic...,The functions of fivecharacterized regulatory ...,the long term goal of this project is to appro...,A fundamental problem in eucaryotic biology i...
196,summarize: This award is made in the Special P...,Vectorial energy and electron transferreaction...,this award is made in the Special Projects Off...,This award is made in the Special Projects Of...
197,summarize: This research is concerned with the...,The research supported concerns the representa...,this research is concerned with the representa...,This research is concerned with the represent...
198,summarize: The workshop investigates applicati...,The main objective is to bring together resear...,the workshop investigates application and rese...,The workshop investigates application and res...


### Example

In [15]:
index = 1
# Prompt
baseline_results_df['first_sentence'][index]

"summarize: In this project, the PI's team will study the circulation of theNorth Pacific, from the sea surface to the bottom, and relate itto results from an earlier study of the South PacificTheseresults will be compared with circulation patterns determined bythe PI in the South and North Atlantic, using similar techniques"

In [16]:
# Prediction
baseline_results_df['predictions'][index]

"in this project, the PI's team will study the circulation of theNorth Pacific, from the sea surface to the bottom . results will be compared with results from an earlier study ofthe South and North Atlantic, using similar techniques. the project is expected to be completed by the end of january, with a final report expected in mid-october, but not before november, according to cnn.com/pi-projects/aspx/sci/2005// s- gragra \xad\xadn\xadr\xads\xad e\xadl\xadw\xad-\xadly\xad_\xadre\xad[\xadh\xadi\xad\xad/\xad*\xad–\xad...\xad—\xad---\xad de\xad&\xad,\xad»-__[_*-[ _ ,--]\xado\xadp\xadc\xad] -»\xad?\xad.\xade-/__-*_n_d\xady\xadt\xad(\xadm\xad’\xadd_“\xad”\xad«\xad“[[*»[--(_([t_]_=\xadâ\xad%\xadv_s_&_––-=-–=–_/-—–——-’-'-,==_—= ''—_'_,_results [._e_f_a_o_u_p_en_________y_i_and_in_the_."

In [18]:
# Reference: composite abstract (prompt text + abstract)
baseline_results_df['composite_abstract'][index]

" In this project, the PI's team will study the circulation of theNorth Pacific, from the sea surface to the bottom, and relate itto results from an earlier study of the South PacificTheseresults will be compared with circulation patterns determined bythe PI in the South and North Atlantic, using similar techniquesThese will then feed in to studies of the ocean/ocean relationsand determination of inter-basin exchange processes"

### Results

In [14]:
# Score the baseline against both reference variants (abstract only, and
# prompt + abstract), then stack the two one-row results into a single table.
baseline_just_abs = evaluate_baseline(baseline_results_df, "baseline_abstract", "abstract")
baseline_composite = evaluate_baseline(baseline_results_df, "baseline_composite_abstract", "composite_abstract")
results_baseline = pd.concat([baseline_just_abs, baseline_composite], ignore_index=True)
results_baseline

Unnamed: 0,model_name,num_beams,no_repeat_ngram_size,min_length,max_length,rouge_1,rouge_2,rouge_L,rouge_L_sum
0,baseline_abstract,7,2,325,900,0.186895,0.028193,0.111203,0.110839
1,baseline_composite_abstract,7,2,325,900,0.34063,0.223717,0.284259,0.284261


# Simple T5 Results

In [None]:
# Pull Data from Google Drive
drive.mount('/content/gdrive')
# NOTE(review): `!cd` runs in a throwaway subshell, so it does not change the
# notebook's working directory (`%cd` would). The T5 loading cells use
# absolute paths, so nothing depends on this line.
!cd /content/gdrive/MyDrive/w266_final_julia/T5_Shuffled_Models/

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Construct the T5 results dataframe: load the T5 test split, rename its
# columns to the names used for the baseline, and add the composite reference.
T5_results_df = (
    pd.read_parquet('/content/gdrive/MyDrive/w266_final_julia/T5_Shuffled_Models/test_df.parquet.gzip')
    .rename(columns={"source_text": "first_sentence", "target_text": "abstract"})
    .pipe(create_compo_colomn)
)
T5_results_df

Unnamed: 0,first_sentence,abstract,composite_abstract
0,summarize: Intellectual merit: As an explanati...,Soil fungal pathogens are likely causal factor...,Intellectual merit: As an explanation for the...
1,summarize: This grant provides funding for bui...,"Moreover, these algorithms and thenecessary in...",This grant provides funding for building high...
2,summarize: ABSTRACTSThis project is designed t...,Threeseries of studies are presented: Series I...,ABSTRACTSThis project is designed to answer i...
3,summarize: This project provides the Computer ...,The cornerstone of this set of laboratory cou...,This project provides the Computer Science De...
4,summarize: CTS-9501842 Joseph McGuire Orego...,Simple kinetic models previously used to analy...,CTS-9501842 Joseph McGuire Oregon State Un...
...,...,...,...
195,summarize: The proposed research focuses on co...,", and develop a new collaboration with Lucent...",The proposed research focuses on computationa...
196,summarize: In this project in the Experimental...,Sando of the University ofIowa will continue...,In this project in the Experimental Physical ...
197,summarize: This award supports the development...,"g women's colleges, historically blackcolleges...",This award supports the development and imple...
198,summarize: McKeown 9311618 This doctoral d...,The first partof the study examines the develo...,McKeown 9311618 This doctoral dissertatio...


In [None]:
# Predictions
# model_num selects which preds_simple_t5_<model_num> prediction file is
# loaded and attached to T5_results_df at the end of this cell.
model_num = 7
def get_t5_results(model_number):
  """Read the saved predictions parquet file for the given Simple-T5 model number."""
  return pd.read_parquet(
      f"/content/gdrive/MyDrive/w266_final_julia/T5_Shuffled_Models/preds_simple_t5_{model_number}.parquet.gzip"
  )

def format_T5_preds(pred_df):
  """Return a list with the first element of every entry in the 'preds' column."""
  return [candidates[0] for candidates in pred_df['preds']]

# Load the selected model's predictions, flatten them to one string per row,
# and attach them to the T5 frame.
predictions = format_T5_preds(get_t5_results(model_num))
T5_results_df['predictions'] = predictions

### Sample

In [None]:
index = 1
# Prompt
T5_results_df['first_sentence'][index]

'summarize: This grant provides funding for building highly scalable distributed algorithmsfor a new class of adaptive manufacturing enterprises using distributed agentarchitecture over the InternetThese algorithms will be used for reconfiguringwork-in-process inventory levels and production schedules to adapt to changingmarket demands and supply-chain conditions'

In [None]:
# Prediction
T5_results_df['predictions'][index]

'The algorithms will be used to reconfigure inventory levels and production schedules to adapt to changing supply-chain conditions This project will provide funding for the development of a new class of adaptive manufacturing enterprises using distributed agentarchitecture over the Internet'

In [None]:
# Reference: composite abstract (prompt text + abstract)
T5_results_df['composite_abstract'][index]

' This grant provides funding for building highly scalable distributed algorithmsfor a new class of adaptive manufacturing enterprises using distributed agentarchitecture over the InternetThese algorithms will be used for reconfiguringwork-in-process inventory levels and production schedules to adapt to changingmarket demands and supply-chain conditionsMoreover, these algorithms and thenecessary information will be embedded in agents geographically distributedthroughout the enterprise The goal is to maintain responsiveness andeffectiveness to enable the next generation of scalable enterprises Scalability will be achieved through (1) distributed algorithms that arepredictable and computationally efficient; (2) distributed agent architecturesthat support growth in size and capability; and (3) distributed clusters thatprovide rapid access to information using cost effective technologies Analytical models will be developed to predict the emergent behavior of suchsystems and stability and c

### T5 Results

In [None]:
# Every run must be scored on ITS OWN predictions. Previously all six calls
# evaluated the same T5_results_df['predictions'] column (the one loaded for
# `model_num`), so a fresh Run All produced six identical ROUGE rows that only
# differed in their hyper-parameter labels.
# NOTE(review): the model number of each run is taken from the eval_T5_<n>
# variable names and the preds_simple_t5_<n> file pattern; confirm that this
# mapping matches the training runs.
def _score_t5_run(model_number, batch_size, max_epochs, precision):
  """Load one run's predictions and score them against the composite abstract."""
  # .assign returns a new frame, so T5_results_df itself is left untouched.
  run_df = T5_results_df.assign(
      predictions=format_T5_preds(get_t5_results(model_number)))
  return evaluate_model_T5(run_df, 'composite_abstract', batch_size, max_epochs, precision)

eval_T5_1 = _score_t5_run(1, 5, 2, 16)
eval_T5_2 = _score_t5_run(2, 5, 2, 32)
eval_T5_4 = _score_t5_run(4, 2, 2, 32)
eval_T5_5 = _score_t5_run(5, 10, 2, 32)
eval_T5_6 = _score_t5_run(6, 6, 2, 32)
eval_T5_7 = _score_t5_run(7, 3, 10, 32)
results_T5 = pd.concat([eval_T5_1, eval_T5_2, eval_T5_4, eval_T5_5, eval_T5_6, eval_T5_7],
                       ignore_index=True)
results_T5

In [None]:
# Redisplay the combined T5 results table built in the previous cell.
results_T5

Unnamed: 0,model_name,source_max_token_len,target_max_token_len,batch_size,max_epochs,precision,rouge_1,rouge_2,rouge_L,rouge_L_sum
0,T5_Simple_Model,325,900,5,2,16,0.270529,0.115546,0.178785,0.178976
1,T5_Simple_Model,325,900,5,2,32,0.272564,0.124517,0.185725,0.185727
2,T5_Simple_Model,325,900,2,2,32,0.224578,0.114472,0.164401,0.164905
3,T5_Simple_Model,325,900,10,2,32,0.30824,0.13289,0.197229,0.197588
4,T5_Simple_Model,325,900,6,2,32,0.295845,0.14355,0.202287,0.202286
5,T5_Simple_Model,325,900,3,10,32,0.24301,0.106074,0.170603,0.170661


# GPT-2 Results