## Import Packages

All required packages are imported in the next cell; it only needs to be run once.

In [351]:
import gzip as gz
import json
import sys as sklearn
import spacy as sp
import pandas as pd
import numpy as np
import re
import networkx as nx
import math

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

## Load Data

In [2]:
# Read the gzipped JSON-lines file: every line is parsed as one JSON record.
path = "release/train.jsonl.gz"
data = []

with gz.open(path) as archive:
    data.extend(json.loads(raw_line) for raw_line in archive)

In [169]:
# Build a DataFrame from the loaded records and keep only the rows whose
# density_bin is 'extractive'.
df = pd.DataFrame(data)
df_extractive = df[df.density_bin == 'extractive']

# Rough length estimate: number of whitespace-separated tokens per article.
MIN_ARTICLE_WORDS = 250
article_lengths = df_extractive.text.str.split().str.len()

# Plot through pandas: `plt` is never imported in this notebook, so calling
# `plt.hist` raises a NameError on a fresh kernel.
ax = article_lengths.plot.hist(bins=100, range=(0, 2000))
ax.set(title="Article length distribution", xlabel="Words per article", ylabel="Articles")

# Keep only the articles longer than the threshold. The mask is a boolean
# Series sharing df_extractive's index, and no longer shadows the builtin `len`.
length_check = article_lengths > MIN_ARTICLE_WORDS
df_extractive = df_extractive[length_check]

In [265]:
# Column labels of the filtered frame
df_extractive.columns

Index(['url', 'archive', 'title', 'date', 'text', 'summary', 'compression',
       'coverage', 'density', 'compression_bin', 'coverage_bin',
       'density_bin'],
      dtype='object')

## Intermediate Input Representation

### Sentence splitting

In [170]:
# Load the small English model with the listed components disabled, then
# register a sentencizer so that parsed documents expose sentence spans.
nlp = sp.load("en_core_web_sm", disable=["tagger", "parser", "ner", "textcat"])
sentencizer = nlp.create_pipe('sentencizer')
nlp.add_pipe(sentencizer)

In [171]:
# SENTENCE SPLITTING FUNCTION
def sentence_splitting(doc):
    """Split one document's text into a list of stripped sentence strings.

    Parameters
    ----------
    doc : mapping-like
        Record whose ``'text'`` entry holds the raw text to split.

    Returns
    -------
    list of str
        One entry per sentence span yielded by the module-level ``nlp``
        pipeline, with leading and trailing whitespace removed.
    """
    # Bind the parsed document to its own name instead of rebinding the argument.
    parsed = nlp(doc['text'])

    # `Span.text` is used instead of `Span.string`, which spaCy 3 removed;
    # once stripped, both yield the same sentence text.
    return [sent.text.strip() for sent in parsed.sents]

### Sparse TF-ISF matrix representation

In [399]:
# PREPROCESSING FUNCTION
def preprocess(text):
    """Return the lemmas of the tokens in ``text`` that survive filtering.

    The text is passed through the module-level ``nlp`` pipeline. Tokens
    flagged ``is_stop`` or not flagged ``is_alpha`` are dropped; the
    ``lemma_`` of every remaining token is returned in document order.
    """
    parsed = nlp(text)
    return [tok.lemma_ for tok in parsed if not tok.is_stop and tok.is_alpha]

In [398]:
# TF-ISF MATRIX CREATION FUNCTION
def create_tfisf_matrix(sentences, preprocessor = preprocess):
    """Vectorise sentences into a TF-ISF matrix (TF-IDF computed over sentences).

    Parameters
    ----------
    sentences : iterable of str
        Sentences of one document; each is treated as a "document" by the
        vectorizer.
    preprocessor : callable, optional
        Passed to ``TfidfVectorizer`` as its ``tokenizer``; defaults to
        ``preprocess``.

    Returns
    -------
    tuple
        ``(tfisf_matrix, feature_names)``: the matrix returned by
        ``fit_transform`` and the list of vocabulary terms.
    """
    vectorizer = TfidfVectorizer(tokenizer = preprocessor)
    tfisf_matrix = vectorizer.fit_transform(sentences)

    # `get_feature_names()` was removed in scikit-learn 1.2. Prefer its
    # replacement when present and fall back for older releases; a plain
    # list is returned either way.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()

    return tfisf_matrix, feature_names

## Similarity matrix 

In [397]:
# SIMILARITY MATRIX FUNCTION
def create_sim_matrix(sentences, tfisf_matrix):
    """Build the pairwise cosine-similarity matrix between sentence vectors.

    Parameters
    ----------
    sentences : sequence
        The sentences; only its length is used, to size the result.
    tfisf_matrix : sparse matrix or array-like
        One row per sentence. Objects exposing ``toarray()`` are densified
        through it; anything else is converted with ``np.asarray``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(sentences), len(sentences))``. Entry ``[i, j]``
        is the cosine similarity of rows ``i`` and ``j``. The diagonal and
        every row/column belonging to a zero vector are 0.

    Raises
    ------
    IndexError
        If the matrix has fewer rows than there are sentences.
    """
    n_sentences = len(sentences)

    # Densify first: applying np.nan_to_num directly to a sparse matrix
    # leaves its values untouched, so the NaN/inf cleanup must run on the
    # dense array.
    if hasattr(tfisf_matrix, "toarray"):
        dense = tfisf_matrix.toarray()
    else:
        dense = tfisf_matrix
    dense = np.nan_to_num(np.atleast_2d(np.asarray(dense, dtype=float)))

    if dense.shape[0] < n_sentences:
        raise IndexError("tfisf_matrix has fewer rows than there are sentences")
    dense = dense[:n_sentences]

    # Rows with a zero (or non-finite) norm have no direction, so they get no
    # similarity to anything. This also avoids dividing by zero.
    norms = np.sqrt((dense * dense).sum(axis=1))
    usable = (norms > 0) & np.isfinite(norms)

    unit_rows = np.zeros_like(dense)
    unit_rows[usable] = dense[usable] / norms[usable, np.newaxis]

    # Cosine similarity of unit vectors is their dot product. Computing it
    # with numpy removes the dependency on `cosine_distance`, which this
    # notebook never imports.
    similarity_matrix = unit_rows @ unit_rows.T

    # A sentence is never compared with itself.
    np.fill_diagonal(similarity_matrix, 0.0)

    return similarity_matrix


## Graph representation and sentence scoring

In [395]:
# SENTENCE SCORING FUNCTION
def textrank_sentence_scoring(similarity_matrix, sentences):
    """Score sentences with PageRank over the similarity graph.

    The similarity matrix is turned into a networkx graph, PageRank is run
    on it with ``max_iter = 500``, and one ``(score, sentence_index)`` tuple
    per sentence is returned, sorted in descending order.
    """
    sentence_graph = nx.from_numpy_array(similarity_matrix)
    pagerank_scores = nx.pagerank(sentence_graph, max_iter = 500)

    # Pair every sentence position with its score, highest score first
    ranked = [(pagerank_scores[position], position) for position in range(len(sentences))]
    ranked.sort(reverse=True)

    return ranked

## Sentence Extraction

In [396]:
def textrank_sentence_extraction(sentences, sentence_scores, n):
    """Join the ``n`` first-ranked sentences into a summary string.

    ``sentence_scores`` is a sequence of tuples whose second element is a
    sentence index. The first ``n`` entries are taken, the selected sentences
    are put back in ascending index order, and joined with single spaces.
    """
    # Indices of the selected sentences, in ascending order
    chosen_indices = sorted(entry[1] for entry in sentence_scores[:n])

    # Concatenate the selected sentences in that order
    return ' '.join(sentences[idx] for idx in chosen_indices)
    

## Running the Model


In [400]:
# Evaluate on the first 1000 rows of the filtered frame
corpus = df_extractive[:1000]
summaries = []

# Run every pipeline stage on each document of the corpus
for row_label, doc in corpus.iterrows():
    # Split the document into sentences
    sentences = sentence_splitting(doc)

    # Build the TF-ISF representation of those sentences
    tfisf_matrix, feature_names = create_tfisf_matrix(sentences)

    # Pairwise sentence similarities
    similarity_matrix = create_sim_matrix(sentences, tfisf_matrix)

    # Rank the sentences
    sentence_scores = textrank_sentence_scoring(similarity_matrix, sentences)

    # Keep the 3 best-ranked sentences as this document's summary
    summaries.append(textrank_sentence_extraction(sentences, sentence_scores, 3))
    


In [394]:
# Display one of the generated summaries (position 18 in the list) as a spot check
summaries[18]

"Manhattan prosecutors charged yesterday that Douglas Meyer, the former vice president of marketing at Syms Advertising, a subsidiary, wove a scam that fleeced his bosses for $5.5 million. Meyer pulled the worsted wool over their eyes, prosecutors said, by saying that he needed to hire three separate vendors to design, produce and place print advertising for the clothing firm. Prosecutors said that when the scam began in January 1998, Meyer and Jelle (Jay) Eijpe, 39, also of Secaucus, shared half of the money paid to Birnbach's allegedly fake firms."