
# Work 2: Extractive Summarization Attempt 1

In this workbook, we mitigate the long length of the patents by applying extractive summarization (SumBasic) to the full patent text, so that the most important ideas are captured in a shorter input.

## Setup

In [None]:
#install libraries (Colab shell commands; -q keeps pip output short)
!pip install -q datasets
# transformers / accelerate are not needed in this notebook, so their installs stay disabled
# !pip install -q transformers
# !pip install --quiet --upgrade accelerate
!pip install -q sentencepiece
!pip install -q evaluate
!pip install -q rouge_score

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [None]:
# Import Packages

#standard data science libraries
import pandas as pd
import numpy as np
import random
import string

#visualization
import matplotlib.pyplot as plt
from pprint import pprint
from IPython.display import display, HTML

#datasets
import datasets
from datasets import load_dataset, load_metric
#transformers
#from transformers import AutoTokenizer
# from transformers import PegasusForConditionalGeneration, PegasusTokenizer
# from datasets import load_from_disk
from datasets import load_dataset_builder

# PyTorch
import torch
from torch.utils.data import DataLoader

#rouge
import evaluate

#NLTK for extractive summarization
import nltk
import nltk.corpus
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import sent_tokenize, regexp_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.probability import FreqDist

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# This cell will authenticate you and mount your Drive in the Colab.
##### ensure you mount to the folder that you want.

from google.colab import drive
# force_remount=True remounts even if the Drive is already mounted in this session
drive.mount('/content/drive', force_remount=True)
# Folder on the mounted Drive where the extracted dataset is saved later in the notebook
save_dir="/content/drive/MyDrive/W266/HUPD"

Mounted at /content/drive


## Load Data

In [None]:
#Load Dataset

#Options forwarded to the HUPD loading script
hupd_options = dict(
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    ipcr_label='C07',
    #selecting only a small portion of the training data from Jan 2016
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-31',
    #for the moment we won't work with validation data (extract 0)
    val_filing_start_date='2016-01-06',
    val_filing_end_date='2016-01-06',
)

dataset_dict = load_dataset('HUPD/hupd', **hupd_options)

print('Loading is done!')

Downloading builder script:   0%|          | 0.00/14.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.9k [00:00<?, ?B/s]

Loading dataset with config: PatentsConfig(name='sample', version=0.0.0, data_dir='sample', data_files={'train': ['https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather']}, description='Patent data from January 2016, for debugging')


Downloading data:   0%|          | 0.00/6.67M [00:00<?, ?B/s]

Using metadata file: /root/.cache/huggingface/datasets/downloads/bac34b767c2799633010fa78ecd401d2eeffd62eff58abdb4db75829f8932710


Downloading data:   0%|          | 0.00/388M [00:00<?, ?B/s]

From HF: If you do not want to extract the entire dataset at once but rather extract the years that you are interested in, please make sure to set the force_extract parameter to be True. When this parameter is True, you download the files that appear only in the years of our interest; hence, you might save a lot of disk space.

## Explore HUPD Dataset

In [None]:
# Print info about the sizes of the train and validation sets
train_shape = dataset_dict["train"].shape
val_shape = dataset_dict["validation"].shape
print(f'Train dataset size: {train_shape}')
print(f'Validation dataset size: {val_shape}')

In [None]:
#prepare training and validation sets (short aliases for the two splits)
train_set, val_set = (dataset_dict[split] for split in ('train', 'validation'))

Prepare HUPD for Extractive Summarization -- We want to summarize the entire patent, not just one section.

In [None]:
#Concatenate the four text sections into a single 'full_patent' field and drop the originals.
#The sections are joined with a space so that the last sentence of one section is not fused
#to the first sentence of the next one before sentence tokenization.
patent_sections = ['claims', 'background', 'summary', 'description']
train_set = train_set.map(lambda obs: {'full_patent': ' '.join(obs[section] for section in patent_sections)},
                          remove_columns=patent_sections)

Map:   0%|          | 0/720 [00:00<?, ? examples/s]

## Extractive Summarization through SumBasic



Here goes nothing.

In [None]:
#score the sentences and select the highest scoring sentence
#keep repeating (with word score recalculation) until length is reached

def sumbasic(lem_sentences, lem_words, len_summary = 45):
    """Build an extractive summary with a simplified SumBasic procedure.

    Each round scores every not-yet-selected sentence by the average probability
    of its words, picks the best one, and then squares the probability of the
    words it contains so later picks favour content not covered yet.

    Args:
        lem_sentences: dict mapping each original sentence string to the list of
            lemmatized tokens it contains.
        lem_words: flat list of lemmatized tokens for the whole document; word
            probabilities are the relative frequencies in this list.
        len_summary: maximum number of sentences to select.

    Returns:
        The selected sentences in their original (dict) order, each followed by
        a single space. An empty string when there are no sentences.
    """
    from collections import Counter   #plain Counter gives the same counts FreqDist did

    freq = Counter(lem_words)
    total = sum(freq.values())
    probs = {word: count / total for word, count in freq.items()}

    selected = set()

    #never ask for more sentences than the document has
    n_picks = min(len_summary, len(lem_sentences))

    for _ in range(n_picks):

        importance = {}
        for sentence, words in lem_sentences.items():          #recalculate the sentence scores
            if sentence in selected:                           #a sentence is used at most once
                continue
            if words:
                importance[sentence] = sum(probs.get(word, 0.0) for word in words) / len(words)
            else:
                importance[sentence] = 0                       #no content words left after filtering

        #rank by the average word probability (not by the raw list of probabilities)
        most_important_sentence = max(importance, key=importance.get)
        selected.add(most_important_sentence)

        #recalculate word scores: square each distinct word once, even if it repeats in the sentence
        for word in set(lem_sentences[most_important_sentence]):
            if word in probs:
                probs[word] = probs[word] * probs[word]

    return ''.join(sentence + ' ' for sentence in lem_sentences if sentence in selected)


In [None]:
extractive_summaries = []

#Loop-invariant helpers: build the stopword set and the lemmatizer once instead of per token / per patent.
#A set gives constant-time membership tests; calling stopwords.words() inside the loop rebuilt the list each time.
english_stopwords = set(stopwords.words('english'))
lem = WordNetLemmatizer()

for obs in train_set['full_patent']:
    #break obs first into sentences using NLTK's sent_tokenize
    all_sentences = sent_tokenize(obs)

    #Let's walk through each of these sentences so we can divide into tokens (e.g. words)
    word_tokens = []
    sentence_tokens = {}

    for one_sentence in all_sentences:
        #divide the lower-cased sentence into runs of word characters (raw string avoids an invalid '\w' escape)
        tokens = [token for token in regexp_tokenize(one_sentence.lower(), r'\w+')
                  if token not in string.punctuation        #only a lone '_' can match here; \w+ drops other punctuation
                  and token not in english_stopwords]       #ignore stopwords
        word_tokens.extend(tokens)                          #every occurrence counts towards word frequency
        #a sentence repeated verbatim shares one dict key; keep its tokens once instead of appending them again
        if one_sentence not in sentence_tokens:
            sentence_tokens[one_sentence] = tokens

    #A lemmatizer takes conjugated verbs and returns their infinitive form (e.g. conjugating -> conjugate)
    #It does the same thing with nouns taking the plural form and returning the singular form.
    #We're doing this because we want to count up occurrences of word roots to get a tighter distribution
    lem_words = [lem.lemmatize(word) for word in word_tokens]
    lem_sentences = {sentence: [lem.lemmatize(word) for word in tokens]
                     for sentence, tokens in sentence_tokens.items()}

    #Now we have a list of lemmatized words and a dict mapping each sentence to its lemmatized words
    #we pass them to the sumbasic function along with the number of sentences to keep
    summary = sumbasic(lem_sentences, lem_words, len_summary = 30)
    extractive_summaries.append(summary)

In [None]:
#add extractive summaries to the train set (one summary string per row, in row order)
train_set = train_set.add_column('extractive_summaries', extractive_summaries)

#remove the full patent to save storage space
#NOTE(review): the recorded output of the next cell lists no 'full_patent' column, which suggests
#this line was active when that output was produced -- confirm whether it should be re-enabled
# train_set = train_set.remove_columns('full_patent')

In [None]:
#check that it did what we asked: the feature list should now include 'extractive_summaries'
train_set

Dataset({
    features: ['patent_number', 'decision', 'title', 'abstract', 'cpc_label', 'ipc_label', 'filing_date', 'patent_issue_date', 'date_published', 'examiner_id', 'extractive_summaries'],
    num_rows: 720
})

In [None]:
#persist the extracted data on Drive so that the other Colab notebook can load it
extracted_path = f'{save_dir}/extracted_train_sample'
train_set.save_to_disk(extracted_path)

Saving the dataset (0/1 shards):   0%|          | 0/720 [00:00<?, ? examples/s]

In [None]:
#spot check: the abstract of the row at index 3, for comparison with its extractive summary
pprint(train_set['abstract'][3])

('New designed ankyrin repeat proteins with binding specificity for HGF are '
 'described, as well as nucleic acids encoding such HGF binding proteins, '
 'pharmaceutical compositions comprising such proteins and the use of such '
 'proteins in the treatment of diseases.')


In [None]:
#rough length of the same row's summary: number of pieces when splitting on single spaces
len(train_set['extractive_summaries'][3].split(' '))

676

In [None]:
#spot check: the extractive summary of the row at index 3
pprint(train_set['extractive_summaries'][3])

('20. 4. The IC50 values were determined to be 72 and 116 nM for DARPin #43 '
 'and 51, respectively. Residue positions 2, 5, 7-13, and 16-33 correspond to '
 'positions which typically contain framework residues. Preferably, said '
 'polymer moiety is connected by a polypeptide linker to a binding domain. '
 'cit. The target may be a whole cell or a tissue sample, or it may be any '
 'non-natural molecule or moiety. In the particular application of the present '
 'invention, the target is HGF. In designed repeat proteins, there are at '
 'least 2, usually about 2 to 6, in particular at least about 6, frequently 20 '
 'or more repeat units (or modules). N-terminal capping repeats) are SEQ ID '
 'NO:1 to 3 and examples of ankyrin C-terminal capping modules (i.e. The term '
 '“PBS” means a phosphate buffered water solution containing 137 mM NaCl, 10 '
 'mM phosphate and 2.7 mM KCl and having a pH of 7.4. A binding protein or a '
 'binding domain of the invention possesses a defined secon

## Observations from Extracted Data:

*   In some extractions, we see strange phenomena like long chains of DNA sequences instead of sentences.
*   I tried to set the length to 25 sentences based on average sentence length, but most extractive summaries are shorter than 1024 tokens now. We can probably afford to extend the summaries and then have Pegasus chop off what it doesn't need.


