## Importing packages

Importing all packages needed for the full notebook. This section only needs to be run once.

In [252]:
# Install the `rouge` package with pip, invoked through the interpreter that
# runs this kernel (sys.executable) so it lands in the notebook's environment.
import sys
!{sys.executable} -m pip install rouge



In [348]:
import gzip as gz
import json
import pandas as pd
import spacy as sp
import numpy as np
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances_argmin_min
import matplotlib.pyplot as plt
from rouge import Rouge

## Importing data

In [132]:
# Relative path to the gzipped JSON-lines training file
path = "release/train.jsonl.gz"

# Accumulator for every decoded record of the file
data = []

# Stream the archive line by line; each line holds one JSON document
with gz.open(path) as f:
    data.extend(map(json.loads, f))

In [198]:
# Build a dataframe from the loaded records and keep only the 'extractive' rows
df = pd.DataFrame(data)
df_extractive = df.loc[df["density_bin"] == 'extractive']

# Approximate each article's length by its whitespace-separated word count
article_lengths = [len(body.split()) for body in df_extractive["text"]]
_ = plt.hist(article_lengths, bins=100, range=(0, 2000))

# Keep the articles whose estimated length exceeds 250 words
length_check = [n_words > 250 for n_words in article_lengths]
df_extractive = df_extractive.loc[length_check]

## Sentence splitting and tokenization

In [221]:
# Load the small English model with tagger, parser, NER and textcat disabled,
# then add a 'sentencizer' pipe; the sentence-splitting step iterates over `.sents`.
nlp = sp.load("en_core_web_sm", disable=["tagger", "parser", "ner", "textcat"])
sentencizer = nlp.create_pipe('sentencizer')
nlp.add_pipe(sentencizer)

In [306]:
# SENTENCE SPLITTING FUNCTION
def sentence_splitting(doc):
    """Split an article into sentences and into lemmatised content tokens.

    Parameters
    ----------
    doc : mapping
        Record whose ``'text'`` entry holds the raw article text.

    Returns
    -------
    sentences : list of str
        Stripped text of every sentence that produced at least one token.
    tokenized_sentences : list of list of str
        Lower-cased lemmas of the alphabetic, non-stop-word tokens of each
        kept sentence; ``tokenized_sentences[i]`` belongs to ``sentences[i]``.
    all_tokens : list of str
        The same tokens flattened into one list, in document order.
    """
    article = nlp(doc['text'])

    sentences = []
    tokenized_sentences = []
    all_tokens = []
    for sentence in article.sents:
        tokens = [
            token.lemma_.lower()
            for token in sentence
            if token.is_alpha and not token.is_stop
        ]
        # A sentence without usable tokens is dropped from BOTH lists, so an
        # index into the tokenised sentences (or the vectors built from them)
        # always refers to the same sentence text.
        if not tokens:
            continue
        sentences.append(sentence.text.strip())
        tokenized_sentences.append(tokens)
        all_tokens.extend(tokens)

    return sentences, tokenized_sentences, all_tokens


## Sentence vector representation

In [307]:
# SENTENCE VECTOR REPRESENTATION FUNCTION
def vectorize_sentences(tokenized_sentences, model):
    """Represent each sentence as the mean of its tokens' word vectors.

    Parameters
    ----------
    tokenized_sentences : iterable of list of str
        Token lists, one per sentence.
    model : object
        Embedding model exposing ``model.wv[token]`` for vector lookup and
        ``model.wv.vector_size`` for the embedding dimension.

    Returns
    -------
    list of numpy.ndarray
        One float64 vector per input sentence, in input order. A sentence
        without tokens is mapped to a zero vector.
    """
    sentence_vectors = []
    for sentence in tokenized_sentences:
        if len(sentence) == 0:
            # Nothing to average: emit zeros rather than dividing by zero,
            # which would produce a NaN vector.
            sentence_vectors.append(np.zeros(model.wv.vector_size))
            continue
        token_vectors = [model.wv[token] for token in sentence]
        # The vector length comes from the looked-up embeddings themselves, so
        # any embedding size works; averaging is done in double precision.
        sentence_vectors.append(np.mean(token_vectors, axis=0, dtype=np.float64))

    return sentence_vectors

## Sentence clustering with Kmeans

In [308]:
def kmeans_clustering(k, sentence_vectors):
    """Fit a k-means model with ``k`` clusters on the sentence vectors.

    The estimator uses 'k-means++' initialisation and a fixed random state
    of 42.

    Returns
    -------
    tuple
        ``(fitted_estimator, labels)`` where ``labels`` is the result of
        calling ``predict`` on the same vectors the estimator was fitted on.
    """
    clusterer = KMeans(k, init='k-means++', random_state=42)
    fitted = clusterer.fit(sentence_vectors)
    labels = fitted.predict(sentence_vectors)
    return fitted, labels

In [309]:
# PLOT PCA REDUCED VECTOR CLUSTER ASSIGNMENTS
def kmeans_pca_plot(sentence_vectors, predictions):
    """Scatter-plot the sentence vectors in 2-D PCA space, one series per cluster.

    Parameters
    ----------
    sentence_vectors : array-like
        One row per sentence; reduced to two components with PCA.
    predictions : array-like
        Cluster label of each sentence, in the same order as the vectors.

    Returns
    -------
    None
        The function only draws on a new matplotlib figure.
    """
    # Accept plain lists as well as arrays: the boolean mask below needs an ndarray.
    labels = np.asarray(predictions)
    pca = PCA(n_components = 2)
    reduced = pca.fit_transform(sentence_vectors)

    plt.figure()
    # Iterate over the labels that actually occur instead of assuming they
    # are exactly 0..k-1.
    for label in np.unique(labels):
        members = reduced[labels == label]
        plt.scatter(members[:, 0], members[:, 1], label=f"cluster {label}")
    plt.xlabel("PCA component 1")
    plt.ylabel("PCA component 2")
    plt.title("Sentence vectors by k-means cluster")
    plt.legend()

## Extract sentences closest to each cluster centroid

In [310]:
# EXTRACT REPRESENTATIVE CLUSTER SENTENCES
def extract_sentences(kmeans_fit, sentence_vectors, sentences=None):
    """Build a summary from the sentence nearest to each cluster centroid.

    Parameters
    ----------
    kmeans_fit : object
        Fitted clustering model exposing ``cluster_centers_``.
    sentence_vectors : array-like
        One vector per sentence, in document order.
    sentences : sequence of str, optional
        Sentence texts indexed like ``sentence_vectors``. When omitted, the
        module-level variable ``sentences`` is used, which keeps existing
        two-argument calls working.

    Returns
    -------
    str
        The selected sentences in document order, joined by single spaces.

    Raises
    ------
    NameError
        If ``sentences`` is omitted and no module-level ``sentences`` exists.
    """
    if sentences is None:
        # Legacy behaviour: read the list the calling cell left in the
        # notebook namespace.
        try:
            sentences = globals()["sentences"]
        except KeyError:
            raise NameError("name 'sentences' is not defined") from None

    closest_indices, _ = pairwise_distances_argmin_min(kmeans_fit.cluster_centers_, sentence_vectors)
    # np.unique sorts the indices (document order) and drops repeats, so a
    # sentence that is nearest to several centroids appears only once.
    closest_indices = np.unique(closest_indices)

    return ' '.join(sentences[index] for index in closest_indices)

## Score summaries with ROUGE and BLEU

In [346]:
# Single ROUGE scorer instance shared by every scoring call
rouge = Rouge()
# SUMMARY SCORING FUNCTION (ROUGE)
def rouge_blue_scoring(summary, reference):
    """Score a generated summary against its reference summary.

    Despite the function name, only ROUGE is computed; no BLEU score is
    produced. The result is whatever ``rouge.get_scores(summary, reference)``
    returns.
    """
    return rouge.get_scores(summary, reference)
    

# Running the experiment

In [347]:
# Experiment settings, defined once instead of inside the loop
N_DOCUMENTS = 10    # number of articles to summarise
VECTOR_SIZE = 300   # dimensionality of the per-article Word2Vec embeddings
k = 3               # requested clusters per article (= max sentences per summary)

corpus = df_extractive.iloc[:N_DOCUMENTS]
summaries = []
rouge_scores = []

for index, doc in corpus.iterrows():
    # `sentences` has to stay a notebook-level name: the two-argument call to
    # extract_sentences below reads it from the global namespace.
    sentences, tokenized_sentences, all_tokens = sentence_splitting(doc)
    # A fresh Word2Vec model is trained on each article's own sentences
    model = Word2Vec(tokenized_sentences, min_count=1, size=VECTOR_SIZE)
    sentence_vectors = vectorize_sentences(tokenized_sentences, model)
    # KMeans cannot form more clusters than there are samples, so cap the
    # cluster count for articles that yield fewer than k sentences.
    n_clusters = min(k, len(sentence_vectors))
    kmeans_fit, predictions = kmeans_clustering(n_clusters, sentence_vectors)
    #kmeans_pca_plot(sentence_vectors, predictions)
    summary = extract_sentences(kmeans_fit, sentence_vectors)
    summaries.append(summary)
    rouge_score = rouge_blue_scoring(summary, doc.summary)
    rouge_scores.append(rouge_score)
