In [1]:
import re
import pandas as pd
from math import log

In [2]:
class TfidfVectorizer:
    """Minimal TF-IDF vectorizer for a list of raw text documents.

    The pipeline is: clean each document (keep ASCII letters and spaces,
    lower-case), compute per-document term frequencies, compute corpus-wide
    inverse document frequencies, and multiply the two.
    """

    # Everything that is not an ASCII letter or a space is stripped. The range
    # is spelled 'A-Za-z' on purpose: 'A-z' would also admit the six ASCII
    # characters that sit between 'Z' and 'a' ([ \ ] ^ _ `), letting them leak
    # into tokens.
    _NON_LETTER_PATTERN = re.compile('[^A-Za-z ]')

    def clean_text(self, documents):
        """Return the documents with non-letters removed, stripped and lower-cased.

        Args:
            documents: iterable of strings.

        Returns:
            list of cleaned strings, one per input document, in input order.
        """
        return [
            self._NON_LETTER_PATTERN.sub('', document).strip().lower()
            for document in documents
        ]

    def calculate_tf(self, documents):
        """Compute the relative term frequency of every word in each document.

        Args:
            documents: iterable of whitespace-separated strings.

        Returns:
            list of dicts (one per document) mapping word -> occurrences of the
            word divided by the number of words in that document. A document
            without words yields an empty dict.
        """
        tf = []
        for document in documents:
            words = document.split()
            word_count = len(words)

            # Count with integers and divide once per word, so repeated words do
            # not accumulate floating-point rounding from summing 1 / word_count.
            occurrences = {}
            for word in words:
                occurrences[word] = occurrences.get(word, 0) + 1

            tf.append({word: count / word_count for word, count in occurrences.items()})

        return tf

    def calculate_idf(self, documents):
        """Compute the inverse document frequency of every word in the corpus.

        Args:
            documents: sequence of whitespace-separated strings.

        Returns:
            dict mapping word -> log(number of documents / number of documents
            containing the word), using the natural logarithm. Words that occur
            in every document therefore get 0.0.
        """
        documents_with = {}
        for document in documents:
            # dict.fromkeys de-duplicates within the document while keeping
            # first-seen order, so each document is counted once per word and
            # the result order is deterministic.
            for word in dict.fromkeys(document.split()):
                documents_with[word] = documents_with.get(word, 0) + 1

        document_count = len(documents)
        return {
            word: log(document_count / count)
            for word, count in documents_with.items()
        }

    def calculate_tfidf(self, documents):
        """Compute the TF-IDF matrix for raw documents.

        Args:
            documents: sequence of raw strings; they are cleaned with
                ``clean_text`` before any counting happens.

        Returns:
            pandas.DataFrame with one row per document and one column per word;
            words absent from a document are filled with 0.
        """
        documents = self.clean_text(documents)
        term_frequencies = self.calculate_tf(documents)
        inverse_document_frequencies = self.calculate_idf(documents)

        tfidf = [
            {
                word: tf * inverse_document_frequencies[word]
                for word, tf in frequencies.items()
            }
            for frequencies in term_frequencies
        ]

        # Building the frame from a list of dicts leaves NaN where a word does
        # not occur in a document; those cells are replaced with 0.
        return pd.DataFrame(tfidf).fillna(0)


In [7]:
# Demo corpus: only the uncommented sentences are vectorized. The commented-out
# entries are alternative inputs that can be re-enabled to try other text
# (digits, punctuation, apostrophes).
documents = [
    # 'This is a beautiful place',
    # 'Can you call me at 9012345678?',
    # 'Can you deliver this parcel to this address?',
    # 'The rocket is about to launch!',
    # "HTML isn't a programming language",
    'Betty bought some butter. But the butter was bitter. So she bought some better butter to make the bitter butter better',
    'The man went out for a walk',
    # 'The children sat around the fire',
]

# The last expression is the resulting DataFrame, so the notebook renders it as
# the cell output (one row per document, one column per word). A word present in
# every document, such as 'the' here, scores 0 because its IDF is log(1).
vectorizer = TfidfVectorizer()
vectorizer.calculate_tfidf(documents)

Unnamed: 0,betty,bought,some,butter,but,the,was,bitter,so,she,better,to,make,man,went,out,for,a,walk
0,0.033007,0.066014,0.066014,0.132028,0.033007,0.0,0.033007,0.066014,0.033007,0.033007,0.066014,0.033007,0.033007,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.099021,0.099021,0.099021,0.099021,0.099021,0.099021
