In [33]:
import sys, glob, nltk

# Target summary size in word tokens; the summarizers stop adding sentences
# once this many tokens have been collected.
summary_length = 90
# Stored as a set so the per-token membership test in tokenize_sentence() is
# O(1) instead of a linear scan over the stopword list.
stopwords = set(nltk.corpus.stopwords.words("english"))
# Shared WordNet lemmatizer instance used by tokenize_sentence().
lemmatizer = nltk.stem.WordNetLemmatizer()

def tokenize_sentence(sentence):
    """Split text into lower-cased, stopword-filtered, lemmatized tokens.

    Args:
        sentence: Text to tokenize (a single sentence or a whole document).

    Returns:
        A list of tokens in their original order. Tokens are produced by
        ``nltk.word_tokenize``, lower-cased, dropped if they appear in the
        module-level ``stopwords`` collection, and finally passed through
        ``lemmatizer.lemmatize``.
    """
    # Lower-case BEFORE the stopword test. Filtering first let capitalised
    # stopwords (e.g. a sentence-initial "The") slip through, because the
    # comparison against the stopword collection is case-sensitive.
    lowered = (t.lower() for t in nltk.word_tokenize(sentence))
    return [lemmatizer.lemmatize(t) for t in lowered if t not in stopwords]

def compute_probs(cluster):
    """Estimate unigram probabilities over every file in ``cluster``.

    Args:
        cluster: Iterable of file paths; each file is read and decoded as
            UTF-8 before being tokenized with ``tokenize_sentence``.

    Returns:
        A dict mapping each token to its relative frequency, i.e. its
        occurrence count divided by the total number of tokens seen across
        all files. An empty cluster yields an empty dict.
    """
    counts = {}
    total = 0

    for path in cluster:
        with open(path) as handle:
            text = handle.read().decode("utf-8")
        words = tokenize_sentence(text)
        total += len(words)
        for word in words:
            # Counts are kept as floats so the final division is a true
            # division regardless of interpreter division semantics.
            counts[word] = counts.get(word, 0.) + 1.

    return dict((word, count / total) for word, count in counts.items())
    
def extract_sentences(cluster):
    """Collect the sentences of every file in ``cluster`` into one list.

    Args:
        cluster: Iterable of file paths; each file is read and decoded as
            UTF-8, then split with ``nltk.sent_tokenize``.

    Returns:
        A single list of sentences, in file order and then in the order the
        sentence tokenizer produced them within each file.
    """
    collected = []

    for path in cluster:
        with open(path) as handle:
            raw_text = handle.read().decode("utf-8")
        collected.extend(nltk.sent_tokenize(raw_text))

    return collected
        
def score_sentence(sentence, word_probs):
    """Return the average word probability of ``sentence``.

    Args:
        sentence: Sentence text; tokenized with ``tokenize_sentence``.
        word_probs: Dict mapping token -> probability.

    Returns:
        The mean of ``word_probs`` over the sentence's tokens as a float.
        Returns 0.0 when the sentence has no tokens left after tokenization
        (previously a ZeroDivisionError). Tokens missing from ``word_probs``
        contribute a probability of 0.0 (previously a KeyError).
    """
    tokens = tokenize_sentence(sentence)
    if not tokens:
        # Nothing to average; score such sentences lowest instead of crashing.
        return 0.

    # Float start value keeps the result a float under any division semantics.
    total = sum((word_probs.get(t, 0.) for t in tokens), 0.)
    return total / len(tokens)

def best_sentence(sentences, word_probs, non_redundancy):
    """Pick the highest-scoring sentence and optionally damp its words.

    Args:
        sentences: Non-empty list of candidate sentences.
        word_probs: Dict mapping token -> probability. Mutated in place when
            ``non_redundancy`` is true.
        non_redundancy: When true, the probability of every distinct token in
            the chosen sentence is squared, so words already covered by the
            summary weigh less in later selections.

    Returns:
        The sentence with the highest ``score_sentence`` value; on ties the
        earliest such sentence in ``sentences`` wins.

    Raises:
        IndexError: If ``sentences`` is empty.
    """
    if not sentences:
        raise IndexError("best_sentence() requires at least one sentence")

    # max() returns the first maximal element, matching a strict ">" scan,
    # and scores every candidate exactly once.
    chosen = max(sentences, key=lambda s: score_sentence(s, word_probs))

    if non_redundancy:
        # Square each distinct word once. Iterating the raw token list would
        # square a word repeated in the sentence several times over.
        for token in set(tokenize_sentence(chosen)):
            if token in word_probs:
                word_probs[token] = word_probs[token] ** 2

    return chosen
            
def sum_basic(cluster, non_redundancy):
    """Build an extractive summary by repeatedly taking the best sentence.

    Args:
        cluster: Glob pattern selecting the input files.
        non_redundancy: Passed through to ``best_sentence``; when true, word
            probabilities are damped after each selected sentence.

    Returns:
        The selected sentences joined by single spaces. Sentences are added
        until at least ``summary_length`` word tokens have been collected or
        no candidate sentences remain, so the result may exceed
        ``summary_length`` by part of the last sentence.
    """
    paths = glob.glob(cluster)
    word_probs = compute_probs(paths)
    sentences = extract_sentences(paths)

    summary = []
    word_count = 0
    # Also stop once the pool is exhausted: a short (or empty) cluster used to
    # crash with IndexError inside best_sentence().
    while word_count < summary_length and sentences:
        sent = best_sentence(sentences, word_probs, non_redundancy)
        summary.append(sent)
        word_count += len(nltk.word_tokenize(sent))
        sentences.remove(sent)

    return " ".join(summary)

def leading_baseline(cluster):
    """Build a baseline summary from the leading sentences of the cluster.

    Args:
        cluster: Glob pattern selecting the input files.

    Returns:
        The first sentences returned by ``extract_sentences``, joined by
        single spaces. Sentences are taken in order until at least
        ``summary_length`` word tokens have been collected or the sentences
        run out (previously an IndexError), so the result may exceed
        ``summary_length`` by part of the last sentence.
    """
    sentences = extract_sentences(glob.glob(cluster))

    summary = []
    word_count = 0
    for sent in sentences:
        if word_count >= summary_length:
            break
        summary.append(sent)
        word_count += len(nltk.word_tokenize(sent))

    return " ".join(summary)
    
# Summarizer to run: "orig" (SumBasic with the non-redundancy update),
# "simplified" (SumBasic without it) or "leading" (leading-sentence baseline).
method = "leading"
# Glob pattern selecting the documents to summarize.
cluster = "docs/doc1-*.txt"

if method == "leading":
    print(leading_baseline(cluster))
elif method in ("orig", "simplified"):
    # Only the "orig" variant enables the non-redundancy update.
    print(sum_basic(cluster, method == "orig"))

US President Donald Trump is breaking precedent once again by not meeting with America's Nobel prize winners. The eight US laureates have not been scheduled to see Mr Trump ahead of their December trip to Sweden to receive their awards. However, not all of them are disappointed at the prospect of missing out on the Oval office greeting. Joachim Frank, awarded the Nobel in chemistry for work in microscopy, told Stat News that he “will not put my foot into the White House as long as Trump, Pence...occupy it.”

He also included House Speaker Paul Ryan on that list in case he ends up in the White House as a result of the “possible succession of impeachments”.
