# Work 4: Extractive Summarization of Large Dataset


In this workbook, we mitigate the long length of the patents by using extractive summarization (SumBasic) to capture the most important ideas from the claims and description sections.

Except there's not enough memory and it fails miserably every time. :D

## Setup

In [None]:
#install libraries
!pip install -q datasets
!pip install -q sentencepiece
!pip install -q evaluate
!pip install -q rouge_score

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [None]:
# Import packages

#standard data science libraries
import pandas as pd
import numpy as np
import random
import string

#visualization
import matplotlib.pyplot as plt
from pprint import pprint

#datasets
import datasets
from datasets import load_dataset, load_from_disk

# PyTorch
# import torch

#rouge
# import evaluate

#NLTK for extractive summarization
import nltk
import nltk.corpus
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import sent_tokenize, regexp_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.probability import FreqDist

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Authenticate with Google and attach Drive to this Colab runtime.
# Point save_dir at the Drive folder you want to read from and write to.
from google.colab import drive

save_dir = "/content/drive/MyDrive/W266/HUPD"
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


## Load Data

In [None]:
#read the previously saved HUPD_C07 dataset back from the Drive folder
dataset_dict = load_from_disk(f'{save_dir}/HUPD_C07')

## Explore HUPD Dataset

In [None]:
#display the loaded DatasetDict to inspect its splits, features and row counts
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['patent_number', 'decision', 'title', 'abstract', 'claims', 'background', 'summary', 'description', 'cpc_label', 'ipc_label', 'filing_date', 'patent_issue_date', 'date_published', 'examiner_id'],
        num_rows: 30915
    })
    validation: Dataset({
        features: ['patent_number', 'decision', 'title', 'abstract', 'claims', 'background', 'summary', 'description', 'cpc_label', 'ipc_label', 'filing_date', 'patent_issue_date', 'date_published', 'examiner_id'],
        num_rows: 7311
    })
})

In [None]:
#pull the training and validation splits out of the loaded DatasetDict
train_set, val_set = dataset_dict['train'], dataset_dict['validation']

In [None]:
#subsample both splits so the run finishes before Google Colab times out;
#the fixed seed keeps the shuffled subset reproducible
train_set = train_set.shuffle(seed=42)
train_set = train_set.select(range(15000))

val_set = val_set.shuffle(seed=42)
val_set = val_set.select(range(3750))

Prepare HUPD for Extractive Summarization -- We want to summarize more than one section of the patent, so we merge the claims and description into a single `claims_desc` field.

In [None]:
def _join_claims_desc(obs):
    """Return the combined claims + description text for one patent record.

    The single space between the two sections keeps the last claim sentence
    from being glued onto the first description sentence, which would
    otherwise hide that sentence boundary from the sentence tokenizer.
    """
    return {'claims_desc': obs['claims'] + ' ' + obs['description']}


#add the merged text column and drop the long source columns that are no longer needed
train_set = train_set.map(_join_claims_desc,
                          remove_columns=['claims', 'background', 'summary', 'description'])
val_set = val_set.map(_join_claims_desc,
                      remove_columns=['claims', 'background', 'summary', 'description'])

## Extractive Summarization through SumBasic



Here goes nothing.

In [None]:
#SumBasic: repeatedly pick the sentence with the highest average word probability,
#then discount the words it used (word score recalculation) until the length is reached

def sumbasic(lem_sentences, lem_words, len_summary = 45): #here we increase the number of sentences based on our findings from the first test
    """Build an extractive summary with a SumBasic-style greedy selection.

    Args:
        lem_sentences: dict mapping each original sentence (str) to the list
            of lemmatized content words found in it. Dict order is used as
            document order for the output.
        lem_words: flat list of every lemmatized content word in the
            document; used to estimate the word probabilities.
        len_summary: maximum number of sentences to select.

    Returns:
        The selected sentences in document order, each followed by a single
        space (so a non-empty result ends with a trailing space). Returns ''
        when there are no sentences or len_summary is not positive.

    Raises:
        KeyError: if a sentence contains a word that is missing from lem_words.
    """
    #stdlib Counter yields the same counts as nltk's FreqDist, which subclasses it
    from collections import Counter

    freq = Counter(lem_words)
    total = sum(freq.values())
    probs = {word: count / total for word, count in freq.items()}

    remaining = dict(lem_sentences)   #candidates that have not been selected yet
    chosen = set()

    for _ in range(len_summary):
        if not remaining:             #fewer sentences than requested: stop early
            break

        #recalculate the sentence scores: mean probability of the sentence's words
        importance = {}
        for sentence, words in remaining.items():
            if words:
                importance[sentence] = sum(probs[word] for word in words) / len(words)
            else:
                importance[sentence] = 0   #no content words, lowest priority

        #rank by the averaged score; ties go to the earliest sentence
        best = max(importance, key=importance.get)
        chosen.add(best)
        del remaining[best]           #a sentence can only be selected once

        #recalculate word scores: every occurrence in the chosen sentence squares
        #the word's probability, so repeated words are discounted more strongly
        for word in lem_sentences[best]:
            probs[word] = probs[word] * probs[word]

    #emit the chosen sentences in their original document order
    return ''.join(sentence + ' ' for sentence in lem_sentences if sentence in chosen)


In [None]:
extractive_summaries_train = []

#Build these once, outside the loops: stopwords.words('english') rebuilds its whole
#list on every call and a list lookup is linear, so calling it per token is very slow.
stop_words = set(stopwords.words('english'))

#A lemmatizer takes conjugated verbs and returns their infinitive form (e.g. conjugating -> conjugate)
#It does the same thing with nouns taking the plural form and returning the singular form.
#We're doing this because we want to count up occurrences of word roots to get a tighter distribution
lem = WordNetLemmatizer()

for obs in train_set['claims_desc']:
    lem_words = []       #every lemmatized content word in the document
    lem_sentences = {}   #original sentence -> its lemmatized content words

    #break obs first into sentences using NLTK's sent_tokenize
    for one_sentence in sent_tokenize(obs):
        #split the lowercased sentence into runs of word characters; the punctuation
        #check only drops a lone '_' (the one punctuation mark \w matches)
        tokens = [
            lem.lemmatize(token)
            for token in regexp_tokenize(one_sentence.lower(), r'\w+')
            if token not in string.punctuation and token not in stop_words
        ]
        lem_words.extend(tokens)
        #assign rather than append so a sentence that occurs twice in the text
        #does not end up with a doubled token list
        lem_sentences[one_sentence] = tokens

    #Now we have a list of lemmatized words and a dict of sentences with their lemmatized words
    #we pass them to the sumbasic function along with a fixed summary size (number of sentences)
    summary = sumbasic(lem_sentences, lem_words, len_summary = 30)
    extractive_summaries_train.append(summary)

KeyboardInterrupt: ignored

In [None]:
#number of training summaries produced (fewer than the training rows if the loop above was interrupted)
len(extractive_summaries_train)

6612

In [None]:
#keep only the leading rows that actually received a summary; deriving the count
#from the list keeps this correct however far the extraction loop got
extracted_train_set = train_set.select(range(len(extractive_summaries_train)))

In [None]:
#display the trimmed training set to check its features and row count
extracted_train_set

Dataset({
    features: ['patent_number', 'decision', 'title', 'abstract', 'cpc_label', 'ipc_label', 'filing_date', 'patent_issue_date', 'date_published', 'examiner_id', 'claims_desc'],
    num_rows: 6612
})

In [None]:
#attach the generated summaries to the trimmed training set as a new column
extracted_train_set = extracted_train_set.add_column(
    name='extractive_summaries',
    column=extractive_summaries_train,
)

In [None]:
#persist the summarized training data to Drive so the other Colab notebook can load it
extracted_train_set.save_to_disk(f'{save_dir}/extracted_dataset_large_train')

In [None]:
extractive_summaries_val = []

#Build these once, outside the loops: stopwords.words('english') rebuilds its whole
#list on every call and a list lookup is linear, so calling it per token is very slow.
stop_words = set(stopwords.words('english'))

#A lemmatizer takes conjugated verbs and returns their infinitive form (e.g. conjugating -> conjugate)
#It does the same thing with nouns taking the plural form and returning the singular form.
#We're doing this because we want to count up occurrences of word roots to get a tighter distribution
lem = WordNetLemmatizer()

for obs in val_set['claims_desc']:
    lem_words = []       #every lemmatized content word in the document
    lem_sentences = {}   #original sentence -> its lemmatized content words

    #break obs first into sentences using NLTK's sent_tokenize
    for one_sentence in sent_tokenize(obs):
        #split the lowercased sentence into runs of word characters; the punctuation
        #check only drops a lone '_' (the one punctuation mark \w matches)
        tokens = [
            lem.lemmatize(token)
            for token in regexp_tokenize(one_sentence.lower(), r'\w+')
            if token not in string.punctuation and token not in stop_words
        ]
        lem_words.extend(tokens)
        #assign rather than append so a sentence that occurs twice in the text
        #does not end up with a doubled token list
        lem_sentences[one_sentence] = tokens

    #Now we have a list of lemmatized words and a dict of sentences with their lemmatized words
    #we pass them to the sumbasic function along with a fixed summary size (number of sentences)
    summary = sumbasic(lem_sentences, lem_words, len_summary = 30)
    extractive_summaries_val.append(summary)

In [None]:
#attach the generated summaries to the validation set as a new column
val_set = val_set.add_column(
    name='extractive_summaries',
    column=extractive_summaries_val,
)

In [None]:
#persist the summarized validation data to Drive so the other Colab notebook can load it
val_set.save_to_disk(f'{save_dir}/extracted_dataset_large_val')

In [None]:
#pretty-print the abstract of the fourth training record
pprint(train_set[3]['abstract'])

In [None]:
#word count of the fourth summary; the summaries column lives on extracted_train_set
#(train_set never received it), and split() ignores the summary's trailing space
len(extracted_train_set[3]['extractive_summaries'].split())

In [None]:
#pretty-print the fourth extractive summary; the summaries column lives on
#extracted_train_set (train_set never received it)
pprint(extracted_train_set[3]['extractive_summaries'])