## Summarization Model
<hr>

In [1]:
from nltk.corpus import stopwords

class sentence(object):

    def __init__(self, docName, preproWords, originalWords):
        self.docName = docName
        self.preproWords = preproWords
        self.wordFrequencies = self.sentenceWordFreq()
        self.originalWords = originalWords

    def getDocName(self):
        return self.docName

    def getPreProWords(self):
        return self.preproWords

    def getOriginalWords(self):
        return self.originalWords

    def getWordFreq(self):
        return self.wordFrequencies	

    def sentenceWordFreq(self):
        wordFreq = {}
        for word in self.preproWords:
            if word not in wordFreq.keys():
                wordFreq[word] = 1
            else:
                wordFreq[word] = wordFreq[word] + 1
        return wordFreq

In [2]:
import nltk
import os
import math
import string
import re
from nltk.corpus import stopwords

def getSentences(answer, id):

    sentence_token = nltk.data.load('tokenizers/punkt/english.pickle')
    lines = sentence_token.tokenize(answer.strip())
    
    sentences = []
    porter = nltk.PorterStemmer()

    for line in lines:

        originalWords = line[:]
        line = line.strip().lower()

        sent = nltk.word_tokenize(line)

        stemmedSent = [porter.stem(word) for word in sent]
        stemmedSent = filter(lambda x: x!='.'and x!='`'and x!=','and x!='?'and x!="'" 
            and x!='!' and x!='''"''' and x!="''" and x!="'s", stemmedSent)

        if stemmedSent != []:
            sentences.append(sentence(id, stemmedSent, originalWords))

    return sentences

In [3]:
def TFs(sentences):
    
    tfs = {}

    for sent in sentences:
        wordFreqs = sent.getWordFreq()

        for word in wordFreqs.keys():
            if tfs.get(word, 0) != 0:
                tfs[word] = tfs[word] + wordFreqs[word]
            else:
                tfs[word] = wordFreqs[word]	
                
    return tfs

In [4]:
def IDFs(sentences):
    N = len(sentences)
    idf = 0
    idfs = {}
    words = {}
    w2 = []
    
    for sent in sentences:
        
        for word in sent.getPreProWords():

            if sent.getWordFreq().get(word, 0) != 0:
                words[word] = words.get(word, 0)+ 1

    for word in words:
        n = words[word]
        
        try:
            w2.append(n)
            idf = math.log10(float(N)/n)
        except ZeroDivisionError:
            idf = 0
                
        idfs[word] = idf
            
    return idfs

In [5]:
def TF_IDF(sentences):
    tfs = TFs(sentences)
    idfs = IDFs(sentences)
    retval = {}
    for word in tfs:
        tf_idfs=  tfs[word]
        if retval.get(tf_idfs, None) == None:
            retval[tf_idfs] = [word]
        else:
            retval[tf_idfs].append(word)

    return retval

In [6]:
def sentenceSim(sentence1, sentence2, IDF_w):
    numerator = 0
    denominator = 0

    for word in sentence2.getPreProWords():
        numerator+= sentence1.getWordFreq().get(word,0) * sentence2.getWordFreq().get(word,0) *  IDF_w.get(word,0) ** 2

    for word in sentence1.getPreProWords():
        denominator+= ( sentence1.getWordFreq().get(word,0) * IDF_w.get(word,0) ) ** 2

    try:
        return numerator / math.sqrt(denominator)
    except ZeroDivisionError:
        return float("-inf")

In [7]:
def buildQuery(sentences, TF_IDF_w, n):
    
    scores = TF_IDF_w.keys()
    sorted(scores, reverse=True)
    i = 0
    j = 0
    queryWords = []

    while(i<n):
        words = TF_IDF_w[list(scores)[j]]
        for word in words:
            queryWords.append(word)
            i=i+1
            if (i>n): 
                break
        j=j+1

    return sentence("query", queryWords, queryWords)

In [8]:
def bestSentence(sentences, query, IDF):
    best_sentence = None
    maxVal = float("-inf")

    for sent in sentences:
        similarity = sentenceSim(sent, query, IDF)

        if similarity > maxVal:
            best_sentence = sent
            maxVal = similarity
    if best_sentence in sentences:
        sentences.remove(best_sentence)

    return best_sentence

In [9]:
def makeSummary(sentences, best_sentence, query, summary_length, lambta, IDF):
    summary = [best_sentence]
    sum_len = len(best_sentence.getPreProWords())

    MMRval={}

    while (sum_len < summary_length):
        MMRval={}

        for sent in sentences:
            MMRval[sent] = MMRScore(sent, query, summary, lambta, IDF)

        maxxer = max(MMRval, key=MMRval.get)
        summary.append(maxxer)
        if maxxer in sentences:
            sentences.remove(maxxer)
        sum_len += len(maxxer.getPreProWords())

    return summary

In [10]:
def MMRScore(Si, query, Sj, lambta, IDF):
    Sim1 = sentenceSim(Si, query, IDF)
    l_expr = lambta * Sim1
    value = [float("-inf")]

    for sent in Sj:
        Sim2 = sentenceSim(Si, sent, IDF)
        value.append(Sim2)

    r_expr = (1-lambta) * max(value)
    MMR_SCORE = l_expr - r_expr	

    return MMRScore

In [11]:
sentences = []

answer = "In 1992, Tim Berners-Lee circulated a document titled \"HTML Tags,\" which outlined just 20 tags, many of which are now obsolete or have taken other forms. The first surviving tag to be defined in the document, after the crucial anchor tag, is the paragraph tag. It wasn’t until 1993 that a discussion emerged on the proposed image tag."

answers = [answer]

for answer in answers:
    sentences = sentences + getSentences(answer, 0)
    
IDF_w = IDFs(sentences)
TF_IDF_w = TF_IDF(sentences)

query = buildQuery(sentences, TF_IDF_w, 10)

best1sentence = bestSentence(sentences, query, IDF_w)

summary = makeSummary(sentences, best1sentence, query, 100, 0.5, IDF_w)

final_summary = ""
for sent in summary:
    final_summary = final_summary + sent.getOriginalWords() + "\n"
final_summary = final_summary[:-1]

AttributeError: 'NoneType' object has no attribute 'getPreProWords'

In [151]:
from nltk.corpus import stopwords

class sentence:
    
    def __init__(self, id, processedWords, originalWords):
        self.id = id
        self.processedWords = processedWords
        self.originalWords = originalWords
        self.wordFreq = self.sentWordFreq()

    def getId(self):
        return self.id

    def getProcessedWords(self):
        return self.processedWords

    def getOriginalWords(self):
        return self.originalWords
    
    def getWordFreq(self):
        return self.wordFreq

    def sentWordFreq(self):
        wordFreq = {}
        for word in self.processedWords:
            if word not in wordFreq.keys():
                wordFreq[word] = 1
            else:
                wordFreq[word] = wordFreq[word] + 1
        return wordFreq

In [152]:
import nltk
import os
import math
import string
import re
from nltk.corpus import stopwords

def processAns(id, answer):

    sentence_token = nltk.data.load('tokenizers/punkt/english.pickle')
    lines = sentence_token.tokenize(answer.strip())

    sentences = []
    porter = nltk.PorterStemmer()

    for line in lines:
        originalWords = line[:]
        line = line.strip().lower()

        sent = nltk.word_tokenize(line)

        stemmedSent = [re.sub(r'[^\w\s]', '', porter.stem(word)) for word in sent]

        new_stemsent = []

        for word in stemmedSent:
            if word!='':
                new_stemsent.append(word)

        stemmedSent = new_stemsent
        
        if stemmedSent != [] :
            sentences.append(sentence(id, stemmedSent, originalWords))

    return sentences        


In [153]:
def TFs(sentences):
    
    tfs = {}
    
    for sent in sentences:
        wordFreqs = sent.getWordFreq()

        for word in wordFreqs.keys():
            if tfs.get(word, 0)!=0:
                tfs[word] = tfs[word] + wordFreqs[word]
            else:
                tfs[word] = wordFreqs[word]
        
    return tfs

In [154]:
def IDFs(sentences):
    
    N = len(sentences)
    idf = 0
    idfs = {}
    words = {}
    w2 = []
    
    for sent in sentences:

        for word in sent.getProcessedWords():

            if sent.getWordFreq().get(word, 0) != 0:
                words[word] = words.get(word, 0)+1

    for word in words:
        n = words[word]

        try:
            w2.append(n)                
            idf = math.log10(float(N)/n)
        except ZeroDivisionError:
            idf = 0

        idfs[word] = idf

    return idfs

In [155]:
def TF_IDF(sentences):
    tfs = TFs(sentences)
    idfs = IDFs(sentences)
    retval = {}

    for word in tfs:
        tf_idfs = tfs[word]*idfs[word]

        if retval.get(tf_idfs, None) == None:
            retval[tf_idfs] = [word]
        else:
            retval[tf_idfs].append(word)

    return retval

In [156]:
def sentenceSim(sentence1, sentence2, IDF):
    
    numerator = 0
    denominator = 0

    for word in sentence2.getProcessedWords():
        numerator+= sentence1.getWordFreq().get(word,0) * sentence2.getWordFreq().get(word,0) *  IDF.get(word,0) ** 2

    for word in sentence1.getProcessedWords():
        denominator+= ( sentence1.getWordFreq().get(word,0) * IDF.get(word,0) ) ** 2

    try:
        return numerator / math.sqrt(denominator)
    except ZeroDivisionError:
        return float("-inf")

In [157]:
def buildBase(sentences, TF_IDF_dict, n):

    scores = TF_IDF_dict.keys()
    sorted(scores, reverse=True)

    i = 0
    j = 0
    baseWords = []

    while(i<n):
        words = TF_IDF_dict[list(scores)[j]]
        for word in words:
            baseWords.append(word)
            i = i+1
            if(i>n):
                break
        j = j+1

        return sentence("base", baseWords, baseWords)

In [158]:
def bestSentence(senteces, base, IDF):
    best_sentence = None
    maxVal = float("-inf")

    for sent in sentences:
        similarity = sentenceSim(sent, base, IDF)

        if similarity > maxVal:
            best_sentence = sent
            maxVal = similarity

    if best_sentence != None:
        sentences.remove(best_sentence)

    return best_sentence

In [159]:
def MMRScore(Si, base, Sj, lambta, IDF):
    Sim1 = sentenceSim(Si, base, IDF)
    l_expr = lambta * Sim1
    value = [float("-inf")]

    for sent in Sj:
        Sim2 = sentenceSim(Si, sent, IDF)
        value.append(Sim2)

    r_expr = (1-lambta)*max(value)
    MMR_SCORE = l_expr-r_expr

    return MMRScore

In [None]:
def makeSummary(sentences, best_sentence, base, summary_len, lambta, IDF):
    summary = [best_sentence]
    sum_len = len(best_sentence.getProcessedWords())

    MMRval = {}

    while(sum_len<summary_len):
        MMRval={}

        for sent in sentences:
            MMRval[sent] = MMRScore(sent, base, summary, lambta, IDF)

        maxxer = max(MMRval, key=MMRval.get)
        if maxxer != None:
            summary.append(maxxer)
            sentences.remove(maxxer)
            sum_len += len(maxxer.getPreProWords())
        
    return summary

In [None]:
answer = "In this article, I’d like to reacquaint you with the humble workhorse of communication that is the paragraph. Paragraphs are everywhere. In fact, at the high risk of stating the obvious, you are reading one now. Despite their ubiquity, we frequently neglect their presentation. This is a mistake."

answers = [answer]

sentences = []

for answer in answers:
    sentences = sentences + processAns("0", answer)
    
IDF = IDFs(sentences)
TF_IDF_dict = TF_IDF(sentences)

base = buildBase(sentences, TF_IDF_dict, 10)

bestsent = bestSentence(sentences, base, IDF)

summary = makeSummary(sentences, bestsent, base, 100, 0.5, IDF)