## Importing packages

Importing all packages needed for the full notebook. This cell only needs to be run once. 

In [2]:
import gzip as gz
import json
import sys as sklearn
import spacy as sp
import pandas as pd
import numpy as np
import math
import re
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from rouge import Rouge
from rouge_score import rouge_scorer
import networkx as nx
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances_argmin_min
import matplotlib.pyplot as plt

# Common functions

In [None]:
# DATA THINNING FUNCTION
def data_thinning(data, min_length=250, plot=True):
    """Keep the extractive articles that are longer than ``min_length`` words.

    Args:
        data: Records accepted by ``pd.DataFrame``; every record needs a
            ``density_bin`` and a ``text`` field.
        min_length: An article is kept only when it has strictly more than
            this many whitespace-separated words.
        plot: When true, draw a histogram of the article lengths (computed
            before the length filter is applied) with matplotlib.

    Returns:
        The filtered ``pd.DataFrame``.
    """
    # Extracting appropriate data and transforming to pandas dataframe
    df = pd.DataFrame(data)
    df_extractive = df[df.density_bin == 'extractive']

    # Rough estimate of text length: number of whitespace-separated words
    article_lengths = [len(text.split()) for text in df_extractive.text]
    if plot:
        _ = plt.hist(article_lengths, bins=100, range=(0, 2000))

    # Boolean mask, one entry per row of df_extractive
    length_check = [length > min_length for length in article_lengths]
    df_extractive = df_extractive[length_check]

    # Hand the result back to the caller; the function used to end on a bare
    # ``.head()`` expression and therefore returned None.
    return df_extractive

In [None]:
# SENTENCE SPLITTING FUNCTION
def sentence_splitting(doc):
    """Split the ``text`` field of ``doc`` into a list of stripped sentences.

    The text is run through the module-level ``nlp_sentencizer`` pipeline and
    one string is returned per sentence span it produces.
    """
    parsed = nlp_sentencizer(doc['text'])

    # ``Span.text`` replaces ``Span.string``: after ``strip()`` both give the
    # same result, but ``Span.string`` is no longer available in spaCy v3.
    sentences = [sent.text.strip() for sent in parsed.sents]

    return sentences

In [None]:
# PREPROCESSING FUNCTION
def preprocess(text):
    """Return the lemmas of the content tokens found in ``text``.

    The text is parsed with the module-level spaCy pipeline ``nlp``. Stop
    words and tokens that are not purely alphabetic are dropped; every
    remaining token contributes its lemma, in document order.
    """
    parsed = nlp(text)
    return [tok.lemma_ for tok in parsed if not tok.is_stop and tok.is_alpha]

In [None]:
# Scorers are built once and reused by every call below
rouge = Rouge()
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True)


# ROUGE SCORING FUNCTION
def rouge_blue_scoring(summary, reference):
    """Score a generated summary against its reference with two ROUGE packages.

    Despite the historical name, only ROUGE scores are computed here.

    Args:
        summary: The generated (candidate) summary text.
        reference: The reference summary text.

    Returns:
        A tuple ``(rouge_score_2, rouge_score)``: the result of the
        ``rouge_score`` package scorer followed by the result of the
        ``rouge`` package.
    """
    # rouge: get_scores(hypothesis, reference)
    rouge_score = rouge.get_scores(summary, reference)

    # rouge_score: score(target, prediction). The reference must come first,
    # otherwise precision and recall are reported swapped.
    rouge_score_2 = scorer.score(reference, summary)

    return rouge_score_2, rouge_score

# TFIDF Functions

### Sparse TF-IDF matrix representation

In [None]:
# TF-IDF MATRIX CREATION FUNCTION
def create_tfidf_matrix(corpus, preprocessor = preprocess):
    """Fit a TF-IDF vectorizer on ``corpus['text']``.

    Args:
        corpus: Mapping or dataframe whose ``'text'`` entry holds one document
            per item.
        preprocessor: Callable turning a document string into a list of
            tokens; passed to ``TfidfVectorizer`` as its tokenizer.

    Returns:
        A tuple ``(tfidf_matrix, feature_names)``: the sparse document-term
        matrix and the list of tokens naming its columns.
    """
    # Initializing ScikitLearn TF-IDF vectorizer and creating TF-IDF sparse matrix
    vectorizer = TfidfVectorizer(tokenizer=preprocessor)
    tfidf_matrix = vectorizer.fit_transform(corpus['text'])

    # Saving list of all corpus tokens. Newer scikit-learn releases only offer
    # get_feature_names_out(); older ones only get_feature_names().
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()

    # Sanity check: one matrix row per input document, one column per unique
    # corpus token. Compared against the corpus that was actually passed in
    # instead of the global df_extractive.
    print("TF-IDF matrix dimension: ", tfidf_matrix.shape, "\nAligning with no. df_extractive enteties? ", tfidf_matrix.shape[0] == len(corpus['text']))

    # Returning TF-IDF matrix
    return tfidf_matrix, feature_names

### Token and TF-IDF score pairing

In [None]:
# TOKEN TFIDF PAIRING FUNCTION
def token_tfidf_ranking(feature_names, tfidf_matrix, row_index = 0):
    """Pair the tokens of one document with their TF-IDF scores, best first.

    Args:
        feature_names: Sequence mapping a column index of ``tfidf_matrix`` to
            its token.
        tfidf_matrix: Two-dimensional sparse TF-IDF matrix (one row per
            document).
        row_index: Row (document) to inspect.

    Returns:
        A ``pd.DataFrame`` with the columns ``token`` and ``tfidf_score``
        (float), sorted by descending score. Only tokens with a non-zero
        score in the selected row are included.
    """
    # Column indices of the tokens that occur in the selected document
    token_indices = tfidf_matrix[row_index, :].nonzero()[1]

    tokens = [feature_names[index] for index in token_indices]
    scores = [float(tfidf_matrix[row_index, index]) for index in token_indices]

    # Keep the scores numeric. Stacking tokens and scores into one NumPy array
    # would turn the scores into strings, and sorting would then be
    # lexicographic (e.g. '2e-05' would rank above '0.9').
    token_tfidf = pd.DataFrame({
        'token': tokens,
        'tfidf_score': np.asarray(scores, dtype=float),
    })
    token_tfidf = token_tfidf.sort_values(by='tfidf_score', ascending=False)

    # Return the sorted (token, TF-IDF value) data frame
    return token_tfidf

### Sentence splitting

In [20]:
# Loading new model from SpaCy and adding sentencizer pipeline
nlp_sentencizer = sp.load("en_core_web_sm", disable=["tagger", "parser", "ner", "textcat"])
# Build the component from this pipeline itself, so the cell no longer
# depends on a separate global ``nlp`` object having been created first.
nlp_sentencizer.add_pipe(nlp_sentencizer.create_pipe('sentencizer'))

### Sentence level tokenization and scoring

In [129]:
# SENTENCE SCORING FUNCTION
def sentence_scoring(sentences, token_tfidf_pairs, preprocessor=None):
    """Score every sentence by the summed TF-IDF value of its tokens.

    Args:
        sentences: Sequence of sentence strings.
        token_tfidf_pairs: ``pd.DataFrame`` with the columns ``token`` and
            ``tfidf_score`` (scores may be numeric or numeric strings).
        preprocessor: Callable turning a sentence into a list of tokens.
            Defaults to the module-level ``preprocess`` function.

    Returns:
        A ``pd.DataFrame`` with the columns ``sentence_index`` and
        ``sentence_score``, sorted by descending score. A sentence's score is
        the sum of its tokens' TF-IDF values divided by the sentence's
        character length; an empty sentence scores 0.
    """
    if preprocessor is None:
        preprocessor = preprocess

    # Build the token -> score lookup once instead of scanning the data frame
    # for every token of every sentence.
    score_by_token = dict(zip(
        token_tfidf_pairs['token'],
        token_tfidf_pairs['tfidf_score'].astype(float),
    ))

    sentence_scores = []
    for i, sentence in enumerate(sentences):
        # Summation of sentence tokens' TF-IDF values; unknown tokens add 0
        total = sum(score_by_token.get(token.lower(), 0.0) for token in preprocessor(sentence))

        # Normalizing sentence score dependent on sentence length; an empty
        # sentence would otherwise raise ZeroDivisionError
        sentence_length = len(sentence)
        score = total / sentence_length if sentence_length else 0.0

        sentence_scores.append((i, score))

    # Save scores in pd dataframe, highest score first
    sentence_scores = pd.DataFrame(sentence_scores, columns=["sentence_index", "sentence_score"]).sort_values(by='sentence_score', ascending=False)

    # Return final sentence scores
    return sentence_scores

### Sentence Extraction

In [23]:
# SENTENCE EXTRACTION FUNCTION
def sentence_extraction(sentences, sentence_scores, n):
    """Build a summary from the ``n`` best-scoring sentences.

    ``sentence_scores`` is expected to be ordered best-first and to carry a
    ``sentence_index`` column. The first ``n`` rows are taken, their indices
    are put back into ascending (document) order, and the matching sentences
    are joined with single spaces.
    """
    best_rows = sentence_scores[:n]
    document_order = np.sort(best_rows['sentence_index'].values)

    return ' '.join(sentences[position] for position in document_order)

# TFISF Functions

### Sparse TF-ISF matrix representation

In [97]:
# TF-ISF MATRIX CREATION FUNCTION
def create_tfisf_matrix(sentences, preprocessor = preprocess):
    """Fit a TF-ISF matrix, i.e. TF-IDF with each sentence treated as a document.

    Args:
        sentences: Iterable of sentence strings.
        preprocessor: Callable turning a sentence into a list of tokens;
            passed to ``TfidfVectorizer`` as its tokenizer.

    Returns:
        A tuple ``(tfisf_matrix, feature_names)``: the sparse sentence-term
        matrix and the list of tokens naming its columns.
    """
    # Initializing ScikitLearn TF-IDF vectorizer and creating the sparse matrix
    vectorizer = TfidfVectorizer(tokenizer=preprocessor)
    tfisf_matrix = vectorizer.fit_transform(sentences)

    # Saving list of all corpus tokens. Newer scikit-learn releases only offer
    # get_feature_names_out(); older ones only get_feature_names().
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()

    # Returning TF-ISF matrix
    return tfisf_matrix, feature_names

### Sentence scoring

In [83]:
# SENTENCE SCORING FUNCTION
def tfisf_sentence_scoring(tfisf_matrix, feature_names, sentences):
    """Score each sentence by its summed TF-ISF weights over its length.

    Args:
        tfisf_matrix: Matrix with one row per sentence (sparse or dense).
        feature_names: Column names of ``tfisf_matrix``. Not needed for the
            computation; kept so existing callers keep working.
        sentences: Sentence strings, aligned with the rows of
            ``tfisf_matrix``.

    Returns:
        A ``pd.DataFrame`` with the columns ``sentence_index`` and
        ``sentence_score``, sorted by descending score. The score is the row
        sum divided by the sentence's character length; an empty sentence
        scores 0.0 instead of NaN/inf.
    """
    # Sum each row directly on the matrix; no need to expand it into a
    # DataFrame first. asarray/ravel flattens the (n, 1) result of a sparse sum.
    row_sums = np.asarray(tfisf_matrix.sum(axis=1)).ravel()

    sentence_scores = []
    for i, sentence in enumerate(sentences):
        length = len(sentence)
        score = float(row_sums[i]) / length if length else 0.0
        sentence_scores.append((i, score))

    sentence_scores = pd.DataFrame(sentence_scores, columns=["sentence_index", "sentence_score"]).sort_values(by='sentence_score', ascending=False)

    return sentence_scores

### Extract summary sentences and merge

In [102]:
# SENTENCE EXTRACTION FUNCTION
def tfisf_sentence_extraction(sentences, sentence_scores, n):
    """Join the ``n`` top-ranked sentences, in document order, into a summary.

    ``sentence_scores`` must already be sorted best-first and expose a
    ``sentence_index`` column; its first ``n`` rows select the sentences.
    """
    # Indices of the n top-scoring sentences, restored to ascending order
    selected = sorted(sentence_scores[:n]['sentence_index'].tolist())

    # Look up the original sentences and merge them with single spaces
    picked = [sentences[position] for position in selected]
    return ' '.join(picked)

# TextRank Functions

## Similarity matrix 

In [None]:
# SIMILARITY MATRIX FUNCTION
def create_sim_matrix(sentences, tfisf_matrix):
    """Build the pairwise cosine-similarity matrix of the sentence vectors.

    Args:
        sentences: The sentences; only their count is used.
        tfisf_matrix: Matrix with one row vector per sentence, either a
            sparse matrix (anything exposing ``toarray()``) or array-like.

    Returns:
        A ``(len(sentences), len(sentences))`` ``np.ndarray``. Entry ``[i, j]``
        is the cosine similarity of rows ``i`` and ``j``. The diagonal and
        every row/column belonging to an all-zero vector are 0.

    Raises:
        ValueError: If the matrix has fewer rows than there are sentences.
    """
    n_sentences = len(sentences)
    if n_sentences == 0:
        return np.zeros((0, 0))

    # Densify first: np.nan_to_num leaves a sparse matrix untouched, so the
    # clean-up of NaN/inf values has to happen on the dense array.
    if hasattr(tfisf_matrix, "toarray"):
        dense = tfisf_matrix.toarray()
    else:
        dense = np.asarray(tfisf_matrix)
    dense = np.nan_to_num(np.asarray(dense, dtype=float))

    if dense.shape[0] < n_sentences:
        raise ValueError("tfisf_matrix has fewer rows than there are sentences")
    vectors = dense[:n_sentences]

    # Scale every non-zero row to unit length; rows without any token stay
    # zero, which yields similarity 0 and avoids dividing by a zero norm.
    # (For non-negative TF-ISF weights a zero norm is the same condition as a
    # zero row sum.)
    norms = np.linalg.norm(vectors, axis=1)
    has_tokens = norms > 0
    unit_vectors = np.zeros_like(vectors)
    unit_vectors[has_tokens] = vectors[has_tokens] / norms[has_tokens, None]

    # Dot products of unit vectors are cosine similarities
    similarity_matrix = unit_vectors @ unit_vectors.T

    # A sentence is not compared with itself
    np.fill_diagonal(similarity_matrix, 0.0)

    # Return the final similarity matrix
    return similarity_matrix
## Similarity matrix 

## Graph representation and sentence scoring

In [None]:
# SENTENCE SCORING FUNCTION
def textrank_sentence_scoring(similarity_matrix, sentences):
    """Rank sentences with PageRank over the sentence-similarity graph.

    Args:
        similarity_matrix: Square array of pairwise sentence similarities,
            used as weighted adjacency matrix of the graph.
        sentences: The sentences; only their count is used.

    Returns:
        A list of ``(score, sentence_index)`` tuples sorted from the highest
        to the lowest score.
    """
    # Create graphical representation from similarity matrix
    graph = nx.from_numpy_array(similarity_matrix)

    # Rank all sentences according to pagerank algorithm
    sentence_scores = nx.pagerank(graph, max_iter=500)

    # Sort all sentences and sentence index according to score
    sentence_scores_sorted = sorted(((sentence_scores[i], i) for i in range(len(sentences))), reverse=True)

    # Return the sorted sentence scores + index (the previous return statement
    # referenced a misspelled name and raised NameError)
    return sentence_scores_sorted

## Sentence Extraction

In [None]:
def textrank_sentence_extraction(sentences, sentence_scores, n):
    """Assemble a summary from the ``n`` highest-ranked sentences.

    ``sentence_scores`` is a best-first sequence whose items carry the
    sentence index at position 1. The first ``n`` items are re-ordered by that
    index so the summary follows the original document order, and the
    matching sentences are joined with single spaces.
    """
    in_document_order = sorted(sentence_scores[:n], key=lambda entry: entry[1])
    chosen = [sentences[entry[1]] for entry in in_document_order]
    return ' '.join(chosen)

# KMeans Functions

In [306]:
# SENTENCE SPLITTING FUNCTION
def sentence_splitting(doc):
    """Split ``doc['text']`` into sentences and their lemmatised content tokens.

    The text is parsed with the module-level spaCy pipeline ``nlp``. For each
    sentence the lower-cased lemmas of all alphabetic, non-stop-word tokens
    are collected.

    Returns:
        A tuple ``(sentences, tokenized_sentences, all_tokens)``:
        ``sentences`` holds the stripped sentence strings,
        ``tokenized_sentences`` the token list of each of those sentences, and
        ``all_tokens`` every collected token in document order. Sentences
        without any content token are left out of both ``sentences`` and
        ``tokenized_sentences`` so that position ``i`` refers to the same
        sentence in both lists.
    """
    article = nlp(doc['text'])

    sentences = []
    tokenized_sentences = []
    all_tokens = []
    for sentence in article.sents:
        tokens = [
            token.lemma_.lower()
            for token in sentence
            if not token.is_stop and token.is_alpha
        ]

        # Skip token-less sentences in BOTH lists. Dropping them from only the
        # token list shifts every later index, so a vector index would no
        # longer point at the sentence it was computed from.
        if not tokens:
            continue

        # ``Span.text`` replaces ``Span.string`` (same result after strip();
        # ``Span.string`` is no longer available in spaCy v3).
        sentences.append(sentence.text.strip())
        tokenized_sentences.append(tokens)
        all_tokens.extend(tokens)

    return sentences, tokenized_sentences, all_tokens


## Sentence vector representation

In [307]:
# SENTENCE VECTOR REPRESENTATION FUNCTION
def vectorize_sentences(tokenized_sentences, model):
    """Represent each sentence as the mean of its tokens' word vectors.

    Args:
        tokenized_sentences: Iterable of token lists, one per sentence.
        model: Object whose ``wv`` attribute maps a token to its vector and
            supports ``token in model.wv`` (e.g. a trained gensim Word2Vec
            model).

    Returns:
        A list with one ``np.ndarray`` per sentence. Tokens unknown to the
        model are ignored; a sentence without any known token is represented
        by a zero vector.
    """
    keyed_vectors = model.wv

    # Use the model's own dimensionality; 300 (the previously hard-coded
    # value) is only the fallback when the attribute is missing.
    dimension = getattr(keyed_vectors, "vector_size", 300)

    sentence_vectors = []
    for sentence in tokenized_sentences:
        # Skipping out-of-vocabulary tokens avoids a KeyError on the lookup
        known = [keyed_vectors[token] for token in sentence if token in keyed_vectors]

        if known:
            vec = np.asarray(known, dtype=float).mean(axis=0)
        else:
            # Nothing to average: avoid dividing by zero
            vec = np.zeros(dimension)
        sentence_vectors.append(vec)

    return sentence_vectors

## Sentence clustering with Kmeans

In [308]:
def kmeans_clustering(k, sentence_vectors):
    """Cluster the sentence vectors into ``k`` groups with k-means.

    Uses k-means++ initialisation and a fixed random state (42).

    Returns:
        A tuple ``(kmeans_fit, pred)``: the fitted estimator and the cluster
        label it predicts for each of the input vectors.
    """
    fitted_model = KMeans(n_clusters=k, init='k-means++', random_state=42).fit(sentence_vectors)
    labels = fitted_model.predict(sentence_vectors)

    return fitted_model, labels

In [309]:
# PLOT PCA REDUCED VECTOR CLUSTER ASSIGNMENTS
def kmeans_pca_plot(sentence_vectors, predictions):
    """Scatter-plot the sentence vectors in 2-D PCA space, one series per cluster.

    Args:
        sentence_vectors: The vectors that were clustered.
        predictions: Cluster label of each vector, aligned with
            ``sentence_vectors``.
    """
    # An ndarray is needed so that ``predictions == label`` yields a boolean
    # mask instead of a single bool when a plain list is passed in.
    predictions = np.asarray(predictions)

    pca = PCA(n_components=2)
    # Fit on the local estimator (the previous code referenced the undefined
    # name ``sklearn_pca`` and raised NameError)
    reduced = pca.fit_transform(sentence_vectors)

    plt.figure()
    # Iterate over the labels that actually occur, in ascending order
    for label in np.unique(predictions):
        cluster_points = reduced[predictions == label]
        plt.scatter(cluster_points[:, 0], cluster_points[:, 1])

## Extract sentences closest to each cluster centroid

In [310]:
# EXTRACT REPRESENTATIVE CLUSTER SENTENCES
def extract_sentences(kmeans_fit, sentence_vectors, sentences=None):
    """Build a summary from the sentence closest to each cluster centroid.

    Args:
        kmeans_fit: Fitted clustering model exposing ``cluster_centers_``.
        sentence_vectors: One vector per sentence.
        sentences: Sentence strings aligned with ``sentence_vectors``. When
            omitted, the module-level variable ``sentences`` is used, which
            is what this function relied on implicitly before.

    Returns:
        The selected sentences in document order, joined with single spaces.
        A sentence that is closest to several centroids appears only once.

    Raises:
        NameError: If ``sentences`` is omitted and no module-level
            ``sentences`` variable exists.
    """
    if sentences is None:
        try:
            sentences = globals()["sentences"]
        except KeyError:
            raise NameError("name 'sentences' is not defined") from None

    closest_indices, _ = pairwise_distances_argmin_min(kmeans_fit.cluster_centers_, sentence_vectors)

    # np.unique sorts the indices (document order) and removes duplicates, so
    # no sentence is repeated in the summary.
    closest_indices = np.unique(closest_indices)

    return ' '.join(sentences[index] for index in closest_indices)

# Importing Data

In [132]:
# Location of the gzip-compressed JSON-lines training file
path = "release/train.jsonl.gz"

# Full set of loaded records, one parsed JSON object per line of the file
data = []

# Stream the archive line by line and parse each line as JSON
with gz.open(path) as f:
    data.extend(json.loads(ln) for ln in f)

In [198]:
# Turn the loaded records into a dataframe and keep the extractive articles
df = pd.DataFrame(data)
df_extractive = df[df['density_bin'] == 'extractive']

# Rough estimate of text length: whitespace-separated word count per article
article_lengths = [len(article.split()) for article in df_extractive['text']]
_ = plt.hist(article_lengths, bins=100, range=(0, 2000))

# Keep only the articles with more than 250 words
length_check = [word_count > 250 for word_count in article_lengths]
df_extractive = df_extractive[length_check]