In [1]:
# Implementation of CountVectorizer and Tf-Idf

import pandas as pd
import numpy as np
from collections import Counter
from math import log

# The super inefficient way...
def tf(t, d):
    """Return the raw term frequency of term ``t`` in document ``d``.

    The comparison is case-insensitive: both the term and the document are
    lowercased before counting.

    Args:
        t: The term (a single token) to look for.
        d: The document, a string of whitespace-separated tokens.

    Returns:
        int: How many tokens of ``d`` are equal to ``t``.
    """
    term = t.lower()
    # split() with no argument splits on any run of whitespace, which is the
    # same tokenization used to build the vocabulary in countVectorizer/tfidf.
    # split(' ') would yield '' tokens for repeated spaces and would not split
    # on tabs or newlines, so counts could disagree with the vocabulary.
    return d.lower().split().count(term)
    
def idf(t, D):
    """Return the inverse document frequency of term ``t`` in corpus ``D``.

    Computed as ``log(N / n)`` (natural logarithm), where ``N`` is the number
    of documents and ``n`` is the number of documents containing the term.
    Matching is case-insensitive, like ``tf``.

    Args:
        t: The term (a single token) to look for.
        D: A sized collection of documents, each a string of
            whitespace-separated tokens.

    Returns:
        float: ``log(len(D) / n)``, or ``0.0`` when no document contains the
        term (this also covers an empty corpus), so the term contributes
        nothing instead of triggering a division by zero.
    """
    term = t.lower()
    # Lowercase each document and split on any whitespace so that matching
    # agrees with tf() and with the lowercased vocabulary built by callers.
    n_containing = sum(
        1 for document in D if term in document.lower().split()
    )
    if n_containing == 0:
        # log(N / 0) is undefined; the term's tf is 0 everywhere anyway.
        return 0.0
    return log(len(D) / n_containing)

# Earlier single-pass draft of countVectorizer, kept commented out for reference
# def countVectorizer(data):
#     # Aka tf(t, d) where t: term, d: document
#     d = dict()
#     for s in data:
#         s = s.lower()
#         tokens = s.split(' ')
#         for token in tokens:
#             # if the key does not exist, it returns a default value (0)
#             val = d.get(token, 0) + 1
#             d[token] = val
#     return d
    
def countVectorizer(data):
    """Count how often each term occurs across a whole corpus.

    Every document is lowercased and split on whitespace; the resulting tokens
    form the vocabulary.

    Args:
        data: An iterable of documents, each a string of whitespace-separated
            tokens.

    Returns:
        dict: Maps each lowercased term to its total number of occurrences
        summed over all documents. Empty when ``data`` is empty.
    """
    counts = Counter()
    for document in data:
        # One pass per document: the same tokenization defines both the
        # vocabulary and the counts, so the two can never disagree (and no
        # document has to be re-scanned once per vocabulary term).
        counts.update(document.lower().split())
    # Hand back a plain dict, as callers have always received.
    return dict(counts)

def tfidf(data):
    """Compute a corpus-level tf-idf score for every term.

    For each term the score is the sum, over all documents, of
    ``tf(term, document) * idf(term)``, where ``tf`` is the raw count of the
    term in the document and ``idf`` is ``log(N / n)`` (natural logarithm,
    ``N`` documents in total, ``n`` of them containing the term). Documents
    are lowercased and split on whitespace.

    Args:
        data: An iterable of documents, each a string of whitespace-separated
            tokens.

    Returns:
        dict: Maps each lowercased term to its float score. A term present in
        every document scores ``0.0``. Empty when ``data`` is empty.
    """
    # Tokenize once so term counts and document frequencies are derived from
    # exactly the same tokens (case-folded, split on any whitespace).
    tokenized = [document.lower().split() for document in data]
    n_documents = len(tokenized)

    # Number of documents each term appears in (set() counts a document once).
    document_frequency = Counter()
    for tokens in tokenized:
        document_frequency.update(set(tokens))

    # idf depends only on the term, so compute it once per term rather than
    # once per (document, term) pair.
    inverse = {
        term: log(n_documents / n_containing)
        for term, n_containing in document_frequency.items()
    }

    scores = dict()
    for tokens in tokenized:
        # Accumulate document by document; terms absent from a document would
        # only add zero, so they are skipped.
        for term, count in Counter(tokens).items():
            scores[term] = scores.get(term, 0.0) + count * inverse[term]
    return scores
        
    

In [2]:
# Toy corpus: four short documents of space-separated terms, several of them shared.
a = ['holis como a va', 'va a todo bien', 'holis a chauchis', 'holis a']

In [3]:
# Total number of occurrences of each vocabulary term across the whole corpus.
countVectorizer(a)

{'como': 1, 'chauchis': 1, 'bien': 1, 'holis': 3, 'a': 4, 'va': 2, 'todo': 1}

In [4]:
# Tf-idf score of each vocabulary term, summed over all documents; a term that
# appears in every document (here 'a') scores 0 because log(N / N) == 0.
tfidf(a)

{'como': 1.3862943611198906,
 'chauchis': 1.3862943611198906,
 'bien': 1.3862943611198906,
 'holis': 0.8630462173553426,
 'a': 0.0,
 'va': 1.3862943611198906,
 'todo': 1.3862943611198906}