## Importing packages

Importing all packages needed for the full notebook. This cell only needs to be run once.

In [4]:
import gzip as gz
import json
import sys as sklearn
import spacy as sp
import pandas as pd
import numpy as np
import re
import math
from sklearn.feature_extraction.text import TfidfVectorizer

## Importing data

In [6]:
# Relative location of the gzip-compressed JSON-lines archive
path = "release/train.jsonl.gz"

# Stream the archive line by line, decode each line as one JSON record,
# and gather every record in a single list
with gz.open(path) as f:
    data = [json.loads(ln) for ln in f]

In [7]:
# Tabulate the first 100,000 loaded records as a pandas DataFrame
df = pd.DataFrame(data[:100000])

# Keep only the rows whose density_bin value is 'extractive'
df_extractive = df.loc[df["density_bin"] == 'extractive']

## Intermediate Input Representation


### Sentence splitting

In [8]:
# Load the "en_core_web_sm" spaCy pipeline with the tagger, parser, NER and
# text-categoriser components disabled.
nlp = sp.load("en_core_web_sm", disable=["tagger", "parser", "ner", "textcat"])
# Add a 'sentencizer' component to the pipeline; sentence_splitting() reads
# doc.sents, presumably supplied by this component since the parser is disabled.
# NOTE(review): create_pipe() + add_pipe(component) looks like the spaCy v2
# calling convention — confirm the installed spaCy version.
nlp.add_pipe(nlp.create_pipe('sentencizer'))

In [11]:
# SENTENCE SPLITTING FUNCTION
def sentence_splitting(doc):
    """Split the text of one document into a list of sentence strings.

    Parameters
    ----------
    doc : mapping or pandas.Series
        Record that exposes the raw article text under the ``'text'`` key.

    Returns
    -------
    list of str
        The sentences in document order, each stripped of leading and
        trailing whitespace. Segments that are empty after stripping are
        dropped, so downstream scoring never divides by a zero length.
    """
    # Use a separate name instead of overwriting the `doc` argument
    parsed = nlp(doc['text'])

    # `Span.text` replaces the legacy `Span.string` attribute; after
    # .strip() both give the same sentence string
    stripped = (sent.text.strip() for sent in parsed.sents)

    return [sentence for sentence in stripped if sentence]

### Sparse TF-ISF matrix representation

In [13]:
# PREPROCESS FUNCTION
def preprocess(text):
    """Return the lemmas of the informative tokens in ``text``.

    The text is run through the module-level ``nlp`` pipeline. A token is
    kept only when it is not flagged as a stop word and consists solely of
    alphabetic characters; the lemma of each kept token is returned, in
    original token order.
    """
    return [
        tok.lemma_
        for tok in nlp(text)
        if not tok.is_stop and tok.is_alpha
    ]

In [97]:
# TF-ISF MATRIX CREATION FUNCTION
def create_tfisf_matrix(sentences, preprocessor = preprocess):
    """Build a TF-ISF matrix for the sentences of a single document.

    Each sentence is treated as a "document" by scikit-learn's
    TfidfVectorizer, so the inverse frequency is computed over sentences
    (TF-ISF) rather than over a corpus.

    Parameters
    ----------
    sentences : list of str
        Sentences of one document.
    preprocessor : callable, optional
        Tokenizer handed to the vectorizer; it receives a string and
        returns a list of tokens. Defaults to ``preprocess``.

    Returns
    -------
    tfisf_matrix : sparse matrix
        One row per sentence and one column per unique token.
    feature_names : list of str
        Token belonging to each matrix column.

    Notes
    -----
    scikit-learn raises ``ValueError`` when no sentence yields any token
    (empty vocabulary).
    """
    # The vectorizer delegates tokenisation to the supplied callable
    vectorizer = TfidfVectorizer(tokenizer = preprocessor)
    tfisf_matrix = vectorizer.fit_transform(sentences)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement when present and fall back on older releases. Either way
    # a plain list is returned, as before.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()

    return tfisf_matrix, feature_names

### Sentence scoring

In [83]:
# SENTENCE SCORING FUNCTION
def tfisf_sentence_scoring(tfisf_matrix, feature_names, sentences):
    """Score every sentence by its summed TF-ISF weight per character.

    Parameters
    ----------
    tfisf_matrix : scipy sparse matrix or numpy.ndarray
        TF-ISF weights with one row per sentence.
    feature_names : list of str
        Column labels of ``tfisf_matrix``. Not needed for the computation;
        kept so existing callers keep working.
    sentences : list of str
        The sentences, in the same order as the matrix rows.

    Returns
    -------
    pandas.DataFrame
        Columns ``sentence_index`` and ``sentence_score``, sorted by score
        in descending order. Sentences with equal scores keep their
        document order.
    """
    # Sum each row directly on the matrix instead of first wrapping it in a
    # labelled DataFrame; asarray + ravel flattens the (n, 1) result that
    # sparse matrices return.
    row_totals = np.asarray(tfisf_matrix.sum(axis = 1), dtype = float).ravel()

    scored = []
    for i, sentence in enumerate(sentences):
        # Length is measured in characters; an empty sentence scores 0.0
        # rather than producing NaN through a division by zero.
        n_chars = len(sentence)
        score = row_totals[i] / n_chars if n_chars else 0.0
        scored.append((i, score))

    sentence_scores = pd.DataFrame(scored, columns=["sentence_index", "sentence_score"])

    # A stable sort makes the ranking of tied sentences deterministic
    return sentence_scores.sort_values(by='sentence_score', ascending=False, kind='mergesort')

### Extract summary sentences and merge

In [102]:
# SENTENCE EXTRACTION FUNCTION
def tfisf_sentence_extraction(sentences, sentence_scores, n):
    """Assemble a summary from the ``n`` best-ranked sentences.

    ``sentence_scores`` is expected to be ordered best-first, as returned
    by ``tfisf_sentence_scoring``. Its first ``n`` rows are taken, their
    ``sentence_index`` values are put back into ascending order, and the
    matching entries of ``sentences`` are joined with single spaces.
    """
    # First n rows of the ranking
    top_rows = sentence_scores[0:n]

    # Ascending indices of the selected sentences
    chosen = np.sort(top_rows.sentence_index.values)

    return ' '.join(sentences[position] for position in chosen)

# Running the Experiment

In [103]:
# Summarise the first 10 rows of df_extractive
corpus = df_extractive[0:10]
# Collects one summary string per document, in corpus order
summaries = []

for index, doc in corpus.iterrows():
    # 1. Split the document text into sentences
    sentences = sentence_splitting(doc)
    # 2. Build the per-document TF-ISF matrix (each sentence is one row)
    tfisf_matrix, feature_names = create_tfisf_matrix(sentences = sentences)
    # 3. Rank the sentences by their TF-ISF score
    sentence_scores = tfisf_sentence_scoring(tfisf_matrix, feature_names, sentences)
    # 4. Join the 3 top-ranked sentences, in document order, into the summary
    summary = tfisf_sentence_extraction(sentences, sentence_scores, 3)
    summaries.append(summary)

In [104]:
# Display the generated summaries (one string per processed document)
summaries

['If the board rejects the resignation offer, it will publicly state why. The thing has gotten so out of hand that words almost fail me. The shareholders should not tolerate it."',
 'Oh, we\'ve bee n there. I like quiet." Spooky.',
 'In a sautÃ© pan, heat the olive oil over low heat. SautÃ© the spinach and onion for 10 minutes. Mix in the cheese, salt and pepper.',
 'All day, every day, Cheryl Bernstein thanks her 16-month-old son. Bernstein is now a travel agent, and Flaumenbaum is studying desktop publishing. Yesterday, Baker held their seventh child, 11-month-old Ashley.',
 'Area is 1 square mile. Population about 60,000. Covers 6 square miles.',
 'The long arm of the tough new anti-smoking law stops at the gates of Rikers Island. Meringolo asked. he said. "',
 'Gag us with a spoon. Don\'t tell anybody. Somebody\'s got to eat, right?"',
 "It's light, summery and wonderful for sipping outside. Strain into a martini glass. Garnish with pink cotton candy all around the rim and serve.",