In [1]:
import re
import pandas as pd
import numpy as np
import csv
import nltk
import random
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [208]:
rawtext = "Dear Sean, How are you? This is from the Baruch College Admission Office. I am writing to let you know that you tuition check did not go through. The courses that you registered is now on hold. If you still want to enroll this semester, please pay the tuition by the end of this week. Otherwise, You will be suspended. Please let us know as soon as possible. Let me know if you have any questions. You can call us at 999-000-000. Thank you and have a nice day."

In [209]:
sents = nltk.sent_tokenize(rawtext)
num_sent = len(sents)

In [210]:
print(sents)

['Dear Sean, How are you?', 'This is from the Baruch College Admission Office.', 'I am writing to let you know that you tuition check did not go through.', 'The courses that you registered is now on hold.', 'If you still want to enroll this semester, please pay the tuition by the end of this week.', 'Otherwise, You will be suspended.', 'Please let us know as soon as possible.', 'Let me know if you have any questions.', 'You can call us at 999-000-000.', 'Thank you and have a nice day.']


In [211]:
def preprocessing (text):
    
    # Normalization and cleaning
    text = text.lower()
    text = re.sub("(http|https|www)(:|\.)\S+.com"," ",text)
    text = re.sub("[^\w\d]"," ",text)
    text = re.sub("\d+"," ",text)
        
    return text

In [212]:
processed_sents = [preprocessing(sent) for sent in sents]

In [213]:
print(processed_sents)

['dear sean  how are you ', 'this is from the baruch college admission office ', 'i am writing to let you know that you tuition check did not go through ', 'the courses that you registered is now on hold ', 'if you still want to enroll this semester  please pay the tuition by the end of this week ', 'otherwise  you will be suspended ', 'please let us know as soon as possible ', 'let me know if you have any questions ', 'you can call us at       ', 'thank you and have a nice day ']


In [9]:
import gensim
w2v = gensim.models.keyedvectors.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [175]:
def sent_vectorize(post):
    vec_size = w2v.vector_size
    sent_vec = np.zeros(vec_size)
    vec_count = 1
    for word in nltk.word_tokenize(post):
        if word not in nltk.corpus.stopwords.words('english') and lemmatizer.lemmatize(word) in w2v:
            vec_count += 1
            sent_vec += w2v[lemmatizer.lemmatize(word)]
    sent_vec = sent_vec/vec_count          
    return sent_vec

In [214]:
vectorized_sents = [sent_vectorize(sent) for sent in processed_sents]

### Cos similarity matrix and angle similarity matrix

In [177]:
from scipy import spatial
import math
import networkx as nx

In [215]:
similarity_matrix = np.zeros([len(vectorized_sents), len(vectorized_sents)])
angle_matrix = np.zeros([len(vectorized_sents), len(vectorized_sents)])

In [216]:
for i, row in enumerate(vectorized_sents):
    for j, col in enumerate(vectorized_sents):
        similarity_matrix[i][j] = 1 - spatial.distance.cosine(row,col)
        angle_matrix[i][j] = math.acos(1 - spatial.distance.cosine(row,col))

### 1. W2V + Page Rank:

In [217]:
nx_graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(nx_graph)

In [219]:
print(scores)

{0: 0.08389246188711963, 1: 0.08630959825954422, 2: 0.12309747736748654, 3: 0.0872070162721341, 4: 0.11641747079461795, 5: 0.07148485651555794, 6: 0.1225584079547313, 7: 0.11435838627167476, 8: 0.09893105363109449, 9: 0.09574327104603897}


In [218]:
cutoff = 1/num_sent

In [220]:
top_sent_keys=sorted([x for x in scores if scores[x]>=cutoff])

In [221]:
for id in top_sent_keys:
    print(sents[id])

I am writing to let you know that you tuition check did not go through.
If you still want to enroll this semester, please pay the tuition by the end of this week.
Please let us know as soon as possible.
Let me know if you have any questions.


### 2. W2V + Radian Distance:

In [226]:
sum = 0
for i in range(len(vectorized_sents)):
               for j in range(len(vectorized_sents)):
                              sum = sum + angle_matrix[i][j]
                
cutoff_angle = sum /(2*n_pair)
n_pair = (num_sent**2-num_sent)/2

In [227]:
angle_scores = [angle_matrix[i].sum()/(num_sent-1) for i in range(len(vectorized_sents))]

In [230]:
top_sent_keys_angle =sorted([i for i in range(num_sent) if angle_scores[i]<=cutoff_angle])

In [231]:
for id in top_sent_keys_angle:
    print(sents[id])

I am writing to let you know that you tuition check did not go through.
If you still want to enroll this semester, please pay the tuition by the end of this week.
Please let us know as soon as possible.
Let me know if you have any questions.


### 3. TFIDF

In [235]:
from nltk import sent_tokenize, word_tokenize, PorterStemmer
sentences = nltk.sent_tokenize(rawtext) # NLTK function
total_documents = len(sentences)

In [236]:
def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[sent[:15]] = freq_table
    return frequency_matrix

In [237]:
freq_matrix = _create_frequency_matrix(sentences)
print(freq_matrix)

{'Dear Sean, How ': {'dear': 1, 'sean': 1, ',': 1, '?': 1}, 'This is from th': {'thi': 1, 'baruch': 1, 'colleg': 1, 'admiss': 1, 'offic': 1, '.': 1}, 'I am writing to': {'write': 1, 'let': 1, 'know': 1, 'tuition': 1, 'check': 1, 'go': 1, '.': 1}, 'The courses tha': {'cours': 1, 'regist': 1, 'hold': 1, '.': 1}, 'If you still wa': {'still': 1, 'want': 1, 'enrol': 1, 'thi': 2, 'semest': 1, ',': 1, 'pleas': 1, 'pay': 1, 'tuition': 1, 'end': 1, 'week': 1, '.': 1}, 'Otherwise, You ': {'otherwis': 1, ',': 1, 'suspend': 1, '.': 1}, 'Please let us k': {'pleas': 1, 'let': 1, 'us': 1, 'know': 1, 'soon': 1, 'possibl': 1, '.': 1}, 'Let me know if ': {'let': 1, 'know': 1, 'ani': 1, 'question': 1, '.': 1}, 'You can call us': {'call': 1, 'us': 1, '999-000-000': 1, '.': 1}, 'Thank you and h': {'thank': 1, 'nice': 1, 'day': 1, '.': 1}}


In [238]:
def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

In [239]:
tf_matrix = _create_tf_matrix(freq_matrix)
print(tf_matrix)

{'Dear Sean, How ': {'dear': 0.25, 'sean': 0.25, ',': 0.25, '?': 0.25}, 'This is from th': {'thi': 0.16666666666666666, 'baruch': 0.16666666666666666, 'colleg': 0.16666666666666666, 'admiss': 0.16666666666666666, 'offic': 0.16666666666666666, '.': 0.16666666666666666}, 'I am writing to': {'write': 0.14285714285714285, 'let': 0.14285714285714285, 'know': 0.14285714285714285, 'tuition': 0.14285714285714285, 'check': 0.14285714285714285, 'go': 0.14285714285714285, '.': 0.14285714285714285}, 'The courses tha': {'cours': 0.25, 'regist': 0.25, 'hold': 0.25, '.': 0.25}, 'If you still wa': {'still': 0.08333333333333333, 'want': 0.08333333333333333, 'enrol': 0.08333333333333333, 'thi': 0.16666666666666666, 'semest': 0.08333333333333333, ',': 0.08333333333333333, 'pleas': 0.08333333333333333, 'pay': 0.08333333333333333, 'tuition': 0.08333333333333333, 'end': 0.08333333333333333, 'week': 0.08333333333333333, '.': 0.08333333333333333}, 'Otherwise, You ': {'otherwis': 0.25, ',': 0.25, 'suspend': 0.

In [240]:
def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():          # go through entire matrix, add 1 if catch
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table

In [241]:
count_doc_per_words = _create_documents_per_words(freq_matrix)
print(count_doc_per_words)

{'dear': 1, 'sean': 1, ',': 3, '?': 1, 'thi': 2, 'baruch': 1, 'colleg': 1, 'admiss': 1, 'offic': 1, '.': 9, 'write': 1, 'let': 3, 'know': 3, 'tuition': 2, 'check': 1, 'go': 1, 'cours': 1, 'regist': 1, 'hold': 1, 'still': 1, 'want': 1, 'enrol': 1, 'semest': 1, 'pleas': 2, 'pay': 1, 'end': 1, 'week': 1, 'otherwis': 1, 'suspend': 1, 'us': 2, 'soon': 1, 'possibl': 1, 'ani': 1, 'question': 1, 'call': 1, '999-000-000': 1, 'thank': 1, 'nice': 1, 'day': 1}


In [242]:
def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix

In [243]:
idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)
print(idf_matrix)

{'Dear Sean, How ': {'dear': 1.0, 'sean': 1.0, ',': 0.5228787452803376, '?': 1.0}, 'This is from th': {'thi': 0.6989700043360189, 'baruch': 1.0, 'colleg': 1.0, 'admiss': 1.0, 'offic': 1.0, '.': 0.04575749056067514}, 'I am writing to': {'write': 1.0, 'let': 0.5228787452803376, 'know': 0.5228787452803376, 'tuition': 0.6989700043360189, 'check': 1.0, 'go': 1.0, '.': 0.04575749056067514}, 'The courses tha': {'cours': 1.0, 'regist': 1.0, 'hold': 1.0, '.': 0.04575749056067514}, 'If you still wa': {'still': 1.0, 'want': 1.0, 'enrol': 1.0, 'thi': 0.6989700043360189, 'semest': 1.0, ',': 0.5228787452803376, 'pleas': 0.6989700043360189, 'pay': 1.0, 'tuition': 0.6989700043360189, 'end': 1.0, 'week': 1.0, '.': 0.04575749056067514}, 'Otherwise, You ': {'otherwis': 1.0, ',': 0.5228787452803376, 'suspend': 1.0, '.': 0.04575749056067514}, 'Please let us k': {'pleas': 0.6989700043360189, 'let': 0.5228787452803376, 'us': 0.6989700043360189, 'know': 0.5228787452803376, 'soon': 1.0, 'possibl': 1.0, '.': 0.

In [244]:
def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix

In [245]:
tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
print(tf_idf_matrix)

{'Dear Sean, How ': {'dear': 0.25, 'sean': 0.25, ',': 0.1307196863200844, '?': 0.25}, 'This is from th': {'thi': 0.1164950007226698, 'baruch': 0.16666666666666666, 'colleg': 0.16666666666666666, 'admiss': 0.16666666666666666, 'offic': 0.16666666666666666, '.': 0.0076262484267791905}, 'I am writing to': {'write': 0.14285714285714285, 'let': 0.0746969636114768, 'know': 0.0746969636114768, 'tuition': 0.0998528577622884, 'check': 0.14285714285714285, 'go': 0.14285714285714285, '.': 0.0065367843658107345}, 'The courses tha': {'cours': 0.25, 'regist': 0.25, 'hold': 0.25, '.': 0.011439372640168786}, 'If you still wa': {'still': 0.08333333333333333, 'want': 0.08333333333333333, 'enrol': 0.08333333333333333, 'thi': 0.1164950007226698, 'semest': 0.08333333333333333, ',': 0.043573228773361464, 'pleas': 0.0582475003613349, 'pay': 0.08333333333333333, 'tuition': 0.0582475003613349, 'end': 0.08333333333333333, 'week': 0.08333333333333333, '.': 0.0038131242133895953}, 'Otherwise, You ': {'otherwis': 

In [246]:
def _score_sentences(tf_idf_matrix) -> dict:
    """
    score a sentence by its word's TF
    Basic algorithm: adding the TF frequency of every non-stop word in a sentence divided by total no of words in a sentence.
    :rtype: dict
    """

    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence   #sentence average

    return sentenceValue

In [247]:
sentence_scores = _score_sentences(tf_idf_matrix)
print(sentence_scores)

{'Dear Sean, How ': 0.2201799215800211, 'This is from th': 0.13179798596935258, 'I am writing to': 0.09776499970321162, 'The courses tha': 0.1903598431600422, 'If you still wa': 0.07197580731378535, 'Otherwise, You ': 0.1605397647400633, 'Please let us k': 0.0916215304039467, 'Let me know if ': 0.12366059924485402, 'You can call us': 0.17154546843104337, 'Thank you and h': 0.1903598431600422}


In [248]:
def _find_average_score(sentenceValue) -> int:
    """
    Find the average score from the sentence value dictionary
    :rtype: int
    """
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original summary_text
    average = (sumValues / len(sentenceValue))

    return average

In [249]:
threshold = _find_average_score(sentence_scores)
print(threshold)

0.14498057637063624


In [250]:
def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

In [251]:
summary=_generate_summary(sentences, sentence_scores, threshold)

In [252]:
print(summary)

 Dear Sean, How are you? The courses that you registered is now on hold. Otherwise, You will be suspended. You can call us at 999-000-000. Thank you and have a nice day.
