In [1]:
import numpy as np
import pandas as pd
import spacy as sp
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

nlp = sp.load("en_core_web_lg")

In [2]:
df = pd.read_csv('Newsarticles8.csv')
df = df.dropna(subset=['text']) ## drop missing values 
df = df.dropna(subset=['title'])

In [3]:
class model:
    def __init__(self, corpus):
        self.vectorizer = TfidfVectorizer(analyzer='word', stop_words='english')
        self.vectorizer.fit_transform(corpus)
        self.feature_names = np.array(self.vectorizer.get_feature_names())
        
    def generate_titles(self, text, n_titles, n_keywords, min_lenght, max_lenght):
        top_keywords = self.get_keywords(text, n_keywords)
        keywords, _ = zip(*top_keywords)
        candidates_with_sim, new_keywords = self.get_sentence_candidates_with_sim(text, top_keywords, 20)
        keywords_ext = new_keywords + list(keywords)
        return self.get_generated_titles(candidates_with_sim, keywords_ext, n_titles, min_lenght, max_lenght)
    
    def get_keywords(self, text, n=10):
        query_transformed = self.vectorizer.transform([text]).toarray()
        keyword_index = np.argsort(query_transformed[0])[::-1]
        rank = query_transformed[0][keyword_index[:n]]
        keywords = self.feature_names[keyword_index[:n]]
        return list(zip(keywords, rank))
    
    def get_generated_titles(self, candidates, keywords, n, lower_l, upper_l):
        list_of_sentences = []
        for sentence in candidates[:n]:
            doc = nlp(sentence[0])
            list_of_aux = self.get_AUX(doc, keywords)
            list_of_det = self.get_DET(doc)
            list_of_adp = self.get_ADP(doc, keywords)
            list_of_acc = self.get_accordingtos(doc)
            to_delete = list(set(list_of_aux) | set(list_of_det) | set(list_of_acc) | set(list_of_adp))
            resulting_sentence = ' '.join([tok.orth_ for tok in doc if tok.i not in to_delete]).replace(" '", "'").replace(' .', '.').replace(" - ", "-").replace(" ,.", ".").replace(",.", ".").replace(" ,", ",").replace(" :", ":").replace('. "', '"').replace(" n't", "n't")
            if (len(resulting_sentence.split()) <= upper_l and len(resulting_sentence.split()) > lower_l):
                list_of_sentences.append(resulting_sentence[0].upper() + resulting_sentence[1:])
                #print(resulting_sentence[0].upper() + resulting_sentence[1:])

        return list_of_sentences
    
    def sentence_score_with_sim(self, sentence, top_keywords, cutoff):
        keywords, keywords_score = zip(*top_keywords)
        tokenized_keywords = nlp(' '.join(keywords))
        new_keywords = []
        score = 0
        for tok in sentence:
            if not tok.is_stop:
                for keytok in tokenized_keywords:
                    if tok.orth_ == keytok.orth_:
                        score += keywords_score[keytok.i]
                    else: 
                        if tok.similarity(keytok) > cutoff:
                            new_keywords.append(tok.orth_)
                            score += tok.similarity(keytok) * keywords_score[keytok.i]
        return score, new_keywords


    def get_sentence_candidates_with_sim(self, content, top_keywords, n):
        doc = nlp(content)
        sents = list(doc.sents)
        sentence_keywords = []
        for sentence in sents:
            score, new_keywords = self.sentence_score_with_sim(sentence, top_keywords, 0.7)
            sentence_keywords.append((sentence.text, score))
        return sorted(sentence_keywords, key=lambda tup: tup[1], reverse=True)[:n], new_keywords
    
    def get_ADP(self, doc, keywords):
        to_delete = []
        for token in doc:
            if token.pos_ == 'ADP' and token.text != 'to' and token.text != 'of' and token.text not in keywords:
                tmp = []
                keyword_found = False
                for tok in token.subtree:
                    if tok.orth_ in keywords:
                        keyword_found = True
                        break
                    tmp.append(tok.i)
                if not (keyword_found):
                    to_delete = tmp
        return to_delete
                    
    def get_AUX(self, doc, keywords):
        to_delete = []
        #print("AUX: ")
        for token in doc:
            if token.pos_ == 'AUX' and token.orth_ not in keywords:
                tmp = []
                keyword_found = False
                for tok in token.subtree:
                    if tok.orth_ in keywords:
                        keyword_found = True
                        break
                    tmp.append(tok.i)
                if not (keyword_found):
                    to_delete = tmp
        return to_delete

    def get_accordingtos(self, doc):
        to_delete = []
        for token in doc:
            if token.text in ('according', 'According'):
                for tok in token.subtree:
                    to_delete.append(tok.i)

        return to_delete

    def get_DET(self, doc):
        to_delete = []
        for token in doc:
            if token.pos_ == 'DET':
                for tok in token.subtree:
                    to_delete.append(tok.i)
        return to_delete

In [4]:
atg = model(df["text"])

In [16]:
textd = '''A California man is facing murder charges after police say he intentionally rammed his vehicle into another, resulting in the death of three teenage boys.

The incident took place Sunday around 10:30 p.m. (1:30 a.m. ET) about 60 miles southeast of Los Angeles near Corona, when the man, police said, rammed into a 2002 Prius, causing the driver of the vehicle to lose control and hit a tree. There were six teenagers in the car, including an 18-year-old driver, according to CNN affiliate KTLA.
Three passengers were trapped in the car and had to be taken out using the Jaws of Life, according to CNN affiliate KCAL. One of the teens died at the scene, and the other five were taken to a hospital where two more died, a spokesperson with the California Highway Patrol told KCAL. Three of the boys had non-life threatening injuries, said California Highway Patrol Lt. David Yokley.
Anurag Chandra, 42, was taken into custody without incident and was charged with three counts of murder and three counts of attempted murder, Yokley said. It is unclear if Chandra has an attorney.
The initial investigation led authorities to believe it was a hit-and-run incident, however, witnesses came forward after following Chandra to his home and alerting police. Further investigation determined the crash was an intentional act. Authorities are now treating the incident as a homicide investigation, Yokley said.
It is also unclear if Chandra or any of the boys knew each other, Yokley said.
"We really don't know, obviously there was some sort of contact. We're looking into those exact same questions of whether or not he was known to the victims," Yokley said. "There was some sort of contact which led to this incident."
Yokley said there's no indication that drugs or alcohol played a factor in this crash.
CNN has reached out to the Riverside County District Attorney's Office and the Sheriff's Office.'''

In [17]:
atg.generate_titles(textd, 5, 5, 5, 20)

['Initial investigation led authorities to believe it was hit-and-run incident, however, witnesses came forward.',
 'It is also unclear if Chandra or knew other, Yokley said. \n',
 'California man facing murder charges after police say he intentionally rammed vehicle, resulting in death of three teenage boys. \n\n']

The model generates three titles for the above article text:
'Initial investigation led authorities to believe it was hit-and-run incident, however, witnesses came forward.'
'It is also unclear if Chandra or knew other, Yokley said.'
'California man facing murder charges after police say he intentionally rammed vehicle, resulting in death of three teenage boys.'

The last generated title is a perfect example to see the sentence compression in action. 

the original extracted sentence: 'A California man is facing murder charges after police say he intentionally rammed his vehicle into another, resulting in the death of three teenage boys.'

the model prunes redundant words like 'a' and 'is' and the phrase 'rammed his vehicle into another' was compressed to 'rammed vehicle'.

