# Process Text using TFIDF in Python

ref: https://towardsdatascience.com/tfidf-for-piece-of-text-in-python-43feccaa74f8

In [1]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import math

In [2]:
# Sample corpus for the walkthrough. It is later split with sent_tokenize and
# each resulting sentence is treated as one "document" for TF-IDF.
text = """
If you like tuna and tomato sauce- try combining the two.
It's really not as bad as it sounds.
If the Easter Bunny and the Tooth Fairy had babies world they take
your teeth and leave chocolate for you?
"""

# Simple Pre-processing

In [13]:
def remove_string_special_characters(s):
    """
    Remove special characters from a string and normalise its whitespace.

    Every character that is not a word character or whitespace is deleted,
    and so are underscores. Characters are deleted outright, not replaced
    by a space, so "It's" becomes "Its". Runs of whitespace are then
    collapsed to a single space and the ends are trimmed.

    parameters:
        s(str): single input string.

    return:
        stripped(str): A string with special characters removed
    """

    # Raw strings keep \w and \s as regex escapes rather than (invalid)
    # Python string escapes. \w also matches '_', hence the explicit '|_'.
    stripped = re.sub(r'[^\w\s]|_', '', s)

    # Change any run of whitespace to one space
    stripped = re.sub(r'\s+', ' ', stripped)

    # Remove start and end white spaces
    return stripped.strip()

# Create the documents

In [55]:
def get_doc(sent):
    """
    Build a length record for every document, where each sentence is
    considered one document.

    parameters:
        sent(list of str): the sentences to measure, one per document. The
            name is kept for backward compatibility although it holds
            several sentences.

    return:
        doc_info(list of dict): one {'doc_id', 'doc_length'} entry per
            sentence, in input order. doc_id is numbered from 1 and
            doc_length is the value returned by count_words.
    """
    # Iterate the argument rather than a module-level list, so the function
    # works for whatever corpus the caller passes in.
    return [
        {'doc_id': doc_id, 'doc_length': count_words(sentence)}
        for doc_id, sentence in enumerate(sent, start=1)
    ]

# Pre-requisite to calculating the TF and IDF scores

In [5]:
def count_words(sent):
    """Return the total number of words in the input text.

    parameters:
        sent(str): the text to measure.

    return:
        count(int): number of tokens word_tokenize produces for the text.
    """
    return len(word_tokenize(sent))

In [6]:
def create_freq_dict(sents):
    """
    Create a word-frequency dictionary for each document.

    parameters:
        sents(list of str): the documents (sentences) to process.

    return:
        freqDict_list(list of dict): one {'doc_id', 'freq_dict'} entry per
            document, in input order. doc_id is numbered from 1 and
            freq_dict maps each lower-cased token to the number of times
            it occurs in that document.
    """
    freqDict_list = []
    for doc_id, sent in enumerate(sents, start=1):
        freq_dict = {}
        for word in word_tokenize(sent):
            word = word.lower()
            freq_dict[word] = freq_dict.get(word, 0) + 1
        # Build the record after the token loop: a document without tokens
        # must still get its own (empty) entry rather than re-using the
        # previous document's record or failing on an unbound name.
        freqDict_list.append({'doc_id': doc_id, 'freq_dict': freq_dict})
    return freqDict_list

# The functions to get the TF and IDF score

In [7]:
def computeTF(doc_info, freqDict_list):
    """
    Compute the term-frequency score of every term in every document.

    tf = (frequency of the term in the doc/total number of terms in the doc)

    parameters:
        doc_info(list of dict): entries with a 'doc_length' value; the entry
            for a document is read from position doc_id - 1.
        freqDict_list(list of dict): entries with 'doc_id' and 'freq_dict'.

    return:
        TF_scores(list of dict): one {'doc_id', 'TF_score', 'key'} entry per
            (document, term) pair, in the order the terms are stored.
    """
    scores = []
    for entry in freqDict_list:
        doc_id = entry['doc_id']
        for term, occurrences in entry['freq_dict'].items():
            # The length lookup stays inside the loop so that documents
            # without terms never touch doc_info.
            length = doc_info[doc_id - 1]['doc_length']
            scores.append({
                'doc_id': doc_id,
                'TF_score': occurrences / length,
                'key': term,
            })
    return scores

In [8]:
def computeIDF(doc_info, freqDict_list):
    """
    Compute the inverse-document-frequency score of every term.

    idf = ln(total number of docs/number of docs with term in it)

    parameters:
        doc_info(list of dict): only its length is used, as the total
            number of documents.
        freqDict_list(list of dict): entries with a 'freq_dict' mapping
            terms to counts, one entry per document.

    return:
        IDF_scores(list of dict): one {'doc_id', 'IDF_score', 'key'} entry
            per (document, term) pair. doc_id is the 1-based position of
            the document in freqDict_list.
    """
    # Count once how many documents contain each term, instead of rescanning
    # every document for every term occurrence.
    docs_with_term = {}
    for entry in freqDict_list:
        for term in entry['freq_dict']:
            docs_with_term[term] = docs_with_term.get(term, 0) + 1

    total_docs = len(doc_info)
    IDF_scores = []
    for position, entry in enumerate(freqDict_list, start=1):
        for term in entry['freq_dict']:
            IDF_scores.append({
                'doc_id': position,
                'IDF_score': math.log(total_docs / docs_with_term[term]),
                'key': term,
            })
    return IDF_scores

# Combine

In [24]:
def computeTFIDF(TF_scores, IDF_scores):
    """
    Combine TF and IDF scores into TF-IDF scores.

    tfidf = tf * idf, computed for every IDF entry that has a TF entry with
    the same 'doc_id' and 'key'.

    parameters:
        TF_scores(list of dict): entries with 'doc_id', 'TF_score', 'key'.
        IDF_scores(list of dict): entries with 'doc_id', 'IDF_score', 'key'.

    return:
        TFIDF_scores(list of dict): one {'doc_id', 'TFIDF_score', 'key'}
            entry per matched IDF entry, in IDF_scores order. IDF entries
            without a matching TF entry are skipped.
    """
    # Index the TF scores by (doc_id, key) so each IDF entry is matched with
    # a single lookup. If a pair occurs more than once the last one wins.
    tf_by_term = {
        (tf['doc_id'], tf['key']): tf['TF_score'] for tf in TF_scores
    }

    TFIDF_scores = []
    for idf in IDF_scores:
        pair = (idf['doc_id'], idf['key'])
        # Only emit a record when there really is a match; otherwise a stale
        # record from an earlier iteration would be appended again.
        if pair not in tf_by_term:
            continue
        TFIDF_scores.append({
            'doc_id': idf['doc_id'],
            'TFIDF_score': idf['IDF_score'] * tf_by_term[pair],
            'key': idf['key'],
        })
    return TFIDF_scores

In [15]:
# Split the sample text into sentences, strip special characters from each
# one, then record the word count of every cleaned sentence (document).
text_sents = sent_tokenize(text)
text_sents_clean = [remove_string_special_characters(s) for s in text_sents]
doc_info = get_doc(text_sents_clean)

In [18]:
# Per-document word frequencies, then the TF and IDF score of every
# (document, term) pair.
freqDict_list = create_freq_dict(text_sents_clean)
TF_scores = computeTF(doc_info, freqDict_list)
IDF_scores = computeIDF(doc_info, freqDict_list)

In [19]:
doc_info

[{'doc_id': 1, 'doc_length': 11},
 {'doc_id': 2, 'doc_length': 8},
 {'doc_id': 3, 'doc_length': 20}]

In [20]:
freqDict_list

[{'doc_id': 1,
  'freq_dict': {'if': 1,
   'you': 1,
   'like': 1,
   'tuna': 1,
   'and': 1,
   'tomato': 1,
   'sauce': 1,
   'try': 1,
   'combining': 1,
   'the': 1,
   'two': 1}},
 {'doc_id': 2,
  'freq_dict': {'its': 1,
   'really': 1,
   'not': 1,
   'as': 2,
   'bad': 1,
   'it': 1,
   'sounds': 1}},
 {'doc_id': 3,
  'freq_dict': {'if': 1,
   'the': 2,
   'easter': 1,
   'bunny': 1,
   'and': 2,
   'tooth': 1,
   'fairy': 1,
   'had': 1,
   'babies': 1,
   'world': 1,
   'they': 1,
   'take': 1,
   'your': 1,
   'teeth': 1,
   'leave': 1,
   'chocolate': 1,
   'for': 1,
   'you': 1}}]

In [21]:
TF_scores

[{'doc_id': 1, 'TF_score': 0.09090909090909091, 'key': 'if'},
 {'doc_id': 1, 'TF_score': 0.09090909090909091, 'key': 'you'},
 {'doc_id': 1, 'TF_score': 0.09090909090909091, 'key': 'like'},
 {'doc_id': 1, 'TF_score': 0.09090909090909091, 'key': 'tuna'},
 {'doc_id': 1, 'TF_score': 0.09090909090909091, 'key': 'and'},
 {'doc_id': 1, 'TF_score': 0.09090909090909091, 'key': 'tomato'},
 {'doc_id': 1, 'TF_score': 0.09090909090909091, 'key': 'sauce'},
 {'doc_id': 1, 'TF_score': 0.09090909090909091, 'key': 'try'},
 {'doc_id': 1, 'TF_score': 0.09090909090909091, 'key': 'combining'},
 {'doc_id': 1, 'TF_score': 0.09090909090909091, 'key': 'the'},
 {'doc_id': 1, 'TF_score': 0.09090909090909091, 'key': 'two'},
 {'doc_id': 2, 'TF_score': 0.125, 'key': 'its'},
 {'doc_id': 2, 'TF_score': 0.125, 'key': 'really'},
 {'doc_id': 2, 'TF_score': 0.125, 'key': 'not'},
 {'doc_id': 2, 'TF_score': 0.25, 'key': 'as'},
 {'doc_id': 2, 'TF_score': 0.125, 'key': 'bad'},
 {'doc_id': 2, 'TF_score': 0.125, 'key': 'it'},
 

In [22]:
IDF_scores

[{'doc_id': 1, 'IDF_score': 0.4054651081081644, 'key': 'if'},
 {'doc_id': 1, 'IDF_score': 0.4054651081081644, 'key': 'you'},
 {'doc_id': 1, 'IDF_score': 1.0986122886681098, 'key': 'like'},
 {'doc_id': 1, 'IDF_score': 1.0986122886681098, 'key': 'tuna'},
 {'doc_id': 1, 'IDF_score': 0.4054651081081644, 'key': 'and'},
 {'doc_id': 1, 'IDF_score': 1.0986122886681098, 'key': 'tomato'},
 {'doc_id': 1, 'IDF_score': 1.0986122886681098, 'key': 'sauce'},
 {'doc_id': 1, 'IDF_score': 1.0986122886681098, 'key': 'try'},
 {'doc_id': 1, 'IDF_score': 1.0986122886681098, 'key': 'combining'},
 {'doc_id': 1, 'IDF_score': 0.4054651081081644, 'key': 'the'},
 {'doc_id': 1, 'IDF_score': 1.0986122886681098, 'key': 'two'},
 {'doc_id': 2, 'IDF_score': 1.0986122886681098, 'key': 'its'},
 {'doc_id': 2, 'IDF_score': 1.0986122886681098, 'key': 'really'},
 {'doc_id': 2, 'IDF_score': 1.0986122886681098, 'key': 'not'},
 {'doc_id': 2, 'IDF_score': 1.0986122886681098, 'key': 'as'},
 {'doc_id': 2, 'IDF_score': 1.09861228866

In [25]:
# Multiply the matching TF and IDF entries; the bare name displays the result.
TFIDF_scores = computeTFIDF(TF_scores, IDF_scores)
TFIDF_scores

[{'doc_id': 1, 'TFIDF_score': 0.036860464373469494, 'key': 'if'},
 {'doc_id': 1, 'TFIDF_score': 0.036860464373469494, 'key': 'you'},
 {'doc_id': 1, 'TFIDF_score': 0.09987384442437362, 'key': 'like'},
 {'doc_id': 1, 'TFIDF_score': 0.09987384442437362, 'key': 'tuna'},
 {'doc_id': 1, 'TFIDF_score': 0.036860464373469494, 'key': 'and'},
 {'doc_id': 1, 'TFIDF_score': 0.09987384442437362, 'key': 'tomato'},
 {'doc_id': 1, 'TFIDF_score': 0.09987384442437362, 'key': 'sauce'},
 {'doc_id': 1, 'TFIDF_score': 0.09987384442437362, 'key': 'try'},
 {'doc_id': 1, 'TFIDF_score': 0.09987384442437362, 'key': 'combining'},
 {'doc_id': 1, 'TFIDF_score': 0.036860464373469494, 'key': 'the'},
 {'doc_id': 1, 'TFIDF_score': 0.09987384442437362, 'key': 'two'},
 {'doc_id': 2, 'TFIDF_score': 0.13732653608351372, 'key': 'its'},
 {'doc_id': 2, 'TFIDF_score': 0.13732653608351372, 'key': 'really'},
 {'doc_id': 2, 'TFIDF_score': 0.13732653608351372, 'key': 'not'},
 {'doc_id': 2, 'TFIDF_score': 0.27465307216702745, 'key':

---

# Summarise Text with TFIDF in Python

text.txt ref: https://www.bbc.com/news/world-asia-china-44816715
Medium ref: https://medium.com/@shivangisareen/summarise-text-with-tfidf-in-python-bc7ca10d3284

In [57]:
def get_doc2(sent):
    """
    Build a length record for every document, where each sentence is
    considered one document.

    parameters:
        sent(list of str): the sentences to measure, one per document. The
            name is kept for backward compatibility although it holds
            several sentences.

    return:
        doc_info(list of dict): one {'doc_id', 'doc_length'} entry per
            sentence, in input order. doc_id is numbered from 1 and
            doc_length is the value returned by count_words.
    """
    # Iterate the argument rather than a module-level list, so the function
    # works for whatever corpus the caller passes in.
    return [
        {'doc_id': doc_id, 'doc_length': count_words(sentence)}
        for doc_id, sentence in enumerate(sent, start=1)
    ]

In [58]:
# Load the article and flatten it to a single line: each line is stripped of
# surrounding whitespace (including the trailing `\n`) and the pieces are
# joined with single spaces.
with open('text.txt') as f:
    content = ' '.join(line.strip() for line in f)

In [59]:
# Same preparation as for the sample text, applied to the loaded article:
# sentence split, special-character cleanup, per-sentence word counts.
second_text_sents = sent_tokenize(content)
second_text_sents_clean = [remove_string_special_characters(s) for s in second_text_sents]
second_doc_info = get_doc2(second_text_sents_clean)

In [60]:
# Per-sentence word frequencies, then the TF and IDF score of every
# (sentence, term) pair of the article.
second_freqDict_list = create_freq_dict(second_text_sents_clean)
second_TF_scores = computeTF(second_doc_info, second_freqDict_list)
second_IDF_scores = computeIDF(second_doc_info, second_freqDict_list)

In [62]:
second_doc_info

[{'doc_id': 1, 'doc_length': 21},
 {'doc_id': 2, 'doc_length': 24},
 {'doc_id': 3, 'doc_length': 17},
 {'doc_id': 4, 'doc_length': 8},
 {'doc_id': 5, 'doc_length': 18},
 {'doc_id': 6, 'doc_length': 10},
 {'doc_id': 7, 'doc_length': 9},
 {'doc_id': 8, 'doc_length': 10},
 {'doc_id': 9, 'doc_length': 24},
 {'doc_id': 10, 'doc_length': 11},
 {'doc_id': 11, 'doc_length': 19},
 {'doc_id': 12, 'doc_length': 25}]

In [63]:
second_freqDict_list

[{'doc_id': 1,
  'freq_dict': {'an': 2,
   'explosion': 1,
   'at': 1,
   'industrial': 1,
   'park': 1,
   'in': 1,
   'chinas': 1,
   'sichuan': 1,
   'province': 1,
   'has': 1,
   'left': 1,
   '19': 1,
   'people': 1,
   'dead': 1,
   'and': 1,
   '12': 1,
   'others': 1,
   'injured': 1,
   'officials': 1,
   'say': 1}},
 {'doc_id': 2,
  'freq_dict': {'in': 2,
   'a': 2,
   'statement': 1,
   'quoted': 1,
   'by': 2,
   'reuters': 1,
   'the': 2,
   'authorities': 1,
   'jiangan': 1,
   'county': 1,
   'said': 1,
   'blast': 1,
   'happened': 1,
   'at': 1,
   'chemical': 1,
   'plant': 1,
   'run': 1,
   'yibin': 1,
   'hengda': 1,
   'technology': 1}},
 {'doc_id': 3,
  'freq_dict': {'photos': 1,
   'on': 1,
   'chinese': 1,
   'social': 1,
   'media': 1,
   'showed': 1,
   'a': 1,
   'huge': 1,
   'fire': 1,
   'and': 1,
   'plumes': 1,
   'of': 1,
   'smoke': 1,
   'rising': 1,
   'from': 1,
   'the': 1,
   'facility': 1}},
 {'doc_id': 4,
  'freq_dict': {'it': 1,
   'is': 1,
 

In [64]:
second_TF_scores

[{'doc_id': 1, 'TF_score': 0.09523809523809523, 'key': 'an'},
 {'doc_id': 1, 'TF_score': 0.047619047619047616, 'key': 'explosion'},
 {'doc_id': 1, 'TF_score': 0.047619047619047616, 'key': 'at'},
 {'doc_id': 1, 'TF_score': 0.047619047619047616, 'key': 'industrial'},
 {'doc_id': 1, 'TF_score': 0.047619047619047616, 'key': 'park'},
 {'doc_id': 1, 'TF_score': 0.047619047619047616, 'key': 'in'},
 {'doc_id': 1, 'TF_score': 0.047619047619047616, 'key': 'chinas'},
 {'doc_id': 1, 'TF_score': 0.047619047619047616, 'key': 'sichuan'},
 {'doc_id': 1, 'TF_score': 0.047619047619047616, 'key': 'province'},
 {'doc_id': 1, 'TF_score': 0.047619047619047616, 'key': 'has'},
 {'doc_id': 1, 'TF_score': 0.047619047619047616, 'key': 'left'},
 {'doc_id': 1, 'TF_score': 0.047619047619047616, 'key': '19'},
 {'doc_id': 1, 'TF_score': 0.047619047619047616, 'key': 'people'},
 {'doc_id': 1, 'TF_score': 0.047619047619047616, 'key': 'dead'},
 {'doc_id': 1, 'TF_score': 0.047619047619047616, 'key': 'and'},
 {'doc_id': 1,

In [65]:
second_IDF_scores

[{'doc_id': 1, 'IDF_score': 1.0986122886681098, 'key': 'an'},
 {'doc_id': 1, 'IDF_score': 1.3862943611198906, 'key': 'explosion'},
 {'doc_id': 1, 'IDF_score': 1.0986122886681098, 'key': 'at'},
 {'doc_id': 1, 'IDF_score': 2.4849066497880004, 'key': 'industrial'},
 {'doc_id': 1, 'IDF_score': 2.4849066497880004, 'key': 'park'},
 {'doc_id': 1, 'IDF_score': 0.8754687373538999, 'key': 'in'},
 {'doc_id': 1, 'IDF_score': 2.4849066497880004, 'key': 'chinas'},
 {'doc_id': 1, 'IDF_score': 2.4849066497880004, 'key': 'sichuan'},
 {'doc_id': 1, 'IDF_score': 2.4849066497880004, 'key': 'province'},
 {'doc_id': 1, 'IDF_score': 2.4849066497880004, 'key': 'has'},
 {'doc_id': 1, 'IDF_score': 2.4849066497880004, 'key': 'left'},
 {'doc_id': 1, 'IDF_score': 2.4849066497880004, 'key': '19'},
 {'doc_id': 1, 'IDF_score': 1.791759469228055, 'key': 'people'},
 {'doc_id': 1, 'IDF_score': 2.4849066497880004, 'key': 'dead'},
 {'doc_id': 1, 'IDF_score': 0.8754687373538999, 'key': 'and'},
 {'doc_id': 1, 'IDF_score': 2

In [66]:
# Multiply the matching TF and IDF entries; the bare name displays the result.
second_TFIDF_scores = computeTFIDF(second_TF_scores, second_IDF_scores)
second_TFIDF_scores

[{'doc_id': 1, 'TFIDF_score': 0.10462974177791522, 'key': 'an'},
 {'doc_id': 1, 'TFIDF_score': 0.06601401719618526, 'key': 'explosion'},
 {'doc_id': 1, 'TFIDF_score': 0.05231487088895761, 'key': 'at'},
 {'doc_id': 1, 'TFIDF_score': 0.11832888808514287, 'key': 'industrial'},
 {'doc_id': 1, 'TFIDF_score': 0.11832888808514287, 'key': 'park'},
 {'doc_id': 1, 'TFIDF_score': 0.041688987493042846, 'key': 'in'},
 {'doc_id': 1, 'TFIDF_score': 0.11832888808514287, 'key': 'chinas'},
 {'doc_id': 1, 'TFIDF_score': 0.11832888808514287, 'key': 'sichuan'},
 {'doc_id': 1, 'TFIDF_score': 0.11832888808514287, 'key': 'province'},
 {'doc_id': 1, 'TFIDF_score': 0.11832888808514287, 'key': 'has'},
 {'doc_id': 1, 'TFIDF_score': 0.11832888808514287, 'key': 'left'},
 {'doc_id': 1, 'TFIDF_score': 0.11832888808514287, 'key': '19'},
 {'doc_id': 1, 'TFIDF_score': 0.08532187948705024, 'key': 'people'},
 {'doc_id': 1, 'TFIDF_score': 0.11832888808514287, 'key': 'dead'},
 {'doc_id': 1, 'TFIDF_score': 0.0416889874930428

In [84]:
def get_sent_score(TFIDF_scores, text_sents, doc_info):
    """
    Score every sentence by adding up the TFIDF scores of its words.

    Nothing is printed; the scores are only returned.

    parameters:
        TFIDF_scores(list of dict): entries with 'doc_id' and 'TFIDF_score'.
        text_sents(list of str): the sentences; the sentence of a document
            is read from position doc_id - 1.
        doc_info(list of dict): entries with 'doc_id', one per document.

    return:
        sentence_info(list of dict): one {'doc_id', 'sent_score',
            'sentence'} entry per doc_info entry, in doc_info order. A
            document without any TFIDF entry gets a sent_score of 0.
    """
    # Accumulate the per-document totals in one pass over the scores instead
    # of rescanning the whole list for every document. Scores are still
    # added in list order, so each total is unchanged.
    totals = {}
    for entry in TFIDF_scores:
        doc_id = entry['doc_id']
        totals[doc_id] = totals.get(doc_id, 0) + entry['TFIDF_score']

    return [
        {
            'doc_id': doc['doc_id'],
            'sent_score': totals.get(doc['doc_id'], 0),
            'sentence': text_sents[doc['doc_id'] - 1],
        }
        for doc in doc_info
    ]

In [85]:
# Score each original (uncleaned) sentence of the article; the bare name
# displays the result.
sentence_info = get_sent_score(second_TFIDF_scores, second_text_sents, second_doc_info)
sentence_info

[{'doc_id': 1,
  'sent_score': 2.015255908930102,
  'sentence': "An explosion at an industrial park in China's Sichuan province has left 19 people dead and 12 others injured, officials say."},
 {'doc_id': 2,
  'sent_score': 1.7705998683963082,
  'sentence': "In a statement quoted by Reuters, the authorities in Jiang'an county said the blast happened at a chemical plant run by Yibin Hengda Technology."},
 {'doc_id': 3,
  'sent_score': 1.9806900982173943,
  'sentence': 'Photos on Chinese social media showed a huge fire and plumes of smoke rising from the facility.'},
 {'doc_id': 4,
  'sent_score': 2.04784320460469,
  'sentence': 'It is not clear what caused the explosion.'},
 {'doc_id': 5,
  'sent_score': 2.0981153462176554,
  'sentence': 'According to Xinhua the fire, which broke out on Thursday evening, had been put out early on Friday.'},
 {'doc_id': 6,
  'sent_score': 1.6054241569865488,
  'sentence': 'Those injured in the blast were in a stable condition.'},
 {'doc_id': 7,
  'sent_s

https://www.youtube.com/watch?v=hXNbFNCgPfY
https://www.youtube.com/watch?v=BJ0MnawUpaU