In [1]:
import re

class Tokenizer:
    """Whitespace tokenizer that maps words to integer ids.

    Text is cleaned by removing every character that is not a space, a
    Hangul character, an ASCII letter or a digit; the remainder is split on
    whitespace and lower-cased.  Ids are assigned in order of first
    appearance during ``fit``.  Id ``0`` is reserved under the key ``'oov'``
    and is returned by ``transform`` for any word that was never fitted.
    """

    def __init__(self):
        # Index 0 is reserved for out-of-vocabulary words.
        self.word_dict = {'oov': 0}
        self.fit_checker = False

    def preprocessing(self, sequences):
        """Clean and split each sentence into lower-cased words.

        Args:
            sequences: iterable of strings (one sentence per item).

        Returns:
            A list with one list of lower-cased words per input sentence.
        """
        result = []
        for sent in sequences:
            cleaned = re.sub(r'[^ ㄱ-ㅣ가-힣A-Za-z0-9]', '', sent)
            result.append([word.lower() for word in cleaned.split()])
        return result

    def fit(self, sequences):
        """Add every word found in ``sequences`` to the vocabulary.

        Words already present keep their id, so calling ``fit`` repeatedly
        extends the vocabulary instead of rebuilding it.

        Args:
            sequences: iterable of strings (one sentence per item).
        """
        self.fit_checker = False
        for sent in self.preprocessing(sequences):
            for word in sent:
                # setdefault keeps the existing id for a known word.
                self.word_dict.setdefault(word, len(self.word_dict))
        self.fit_checker = True

    def transform(self, sequences):
        """Convert sentences into lists of word ids.

        Args:
            sequences: iterable of strings (one sentence per item).

        Returns:
            A list with one list of integer ids per input sentence.  Words
            that are not in the vocabulary are mapped to the ``'oov'`` id.

        Raises:
            Exception: if the tokenizer has not been fitted yet.
        """
        if not self.fit_checker:
            raise Exception("Tokenizer instance is not fitted yet.")

        oov_id = self.word_dict['oov']
        return [
            # Unknown words fall back to the reserved OOV id rather than
            # raising KeyError.
            [self.word_dict.get(word, oov_id) for word in sent]
            for sent in self.preprocessing(sequences)
        ]

    def fit_transform(self, sequences):
        """Fit the vocabulary on ``sequences`` and return their id lists."""
        self.fit(sequences)
        return self.transform(sequences)

In [2]:
import numpy as np

class TfidfVectorizer:
    """Build TF-IDF matrices from raw sentences using a supplied tokenizer.

    The tokenizer must provide ``fit_transform(sequences)`` and
    ``transform(sequences)``, each returning one list of token ids per
    sentence.

    After ``fit``:
        vocabulary_: sorted list of the token ids seen while fitting; the
            j-th column of every matrix corresponds to ``vocabulary_[j]``.
        idf: 1-D array with ``log(n / (1 + df))`` for each vocabulary entry,
            where ``n`` is the number of fitted documents and ``df`` the
            number of documents containing the token.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.fit_checker = False

    def fit(self, sequences):
        """Fit the tokenizer and compute the IDF weight of every token.

        Args:
            sequences: iterable of strings (one document per item).
        """
        tokenized = self.tokenizer.fit_transform(sequences)
        n_docs = len(tokenized)

        # Document frequency: count each token at most once per document.
        doc_freq = {}
        for doc in tokenized:
            for token_id in set(doc):
                doc_freq[token_id] = doc_freq.get(token_id, 0) + 1

        # A fixed, sorted column order lets transform() line its term counts
        # up with the IDF vector regardless of which tokens an input contains.
        self.vocabulary_ = sorted(doc_freq)
        self.idf = np.array(
            [np.log(n_docs / (1 + doc_freq[t])) for t in self.vocabulary_],
            dtype=float,
        )

        self.fit_checker = True

    def transform(self, sequences):
        """Return the TF-IDF matrix of ``sequences``.

        Term frequency is the raw count of a token in a document.  Tokens
        that are not part of the fitted vocabulary are ignored.

        Args:
            sequences: iterable of strings (one document per item).

        Returns:
            A float array of shape ``(len(sequences), len(vocabulary_))``;
            it is also stored in ``self.tfidf_matrix``.

        Raises:
            Exception: if the vectorizer has not been fitted yet.
        """
        if not self.fit_checker:
            raise Exception("TfidfVectorizer instance is not fitted yet.")

        tokenized = self.tokenizer.transform(sequences)
        column_of = {t: j for j, t in enumerate(self.vocabulary_)}

        tf = np.zeros((len(tokenized), len(self.vocabulary_)))
        for row, doc in enumerate(tokenized):
            for token_id in doc:
                col = column_of.get(token_id)
                if col is not None:
                    tf[row, col] += 1

        # Broadcasting scales every column by its token's IDF weight.
        self.tfidf_matrix = tf * self.idf
        return self.tfidf_matrix

    def fit_transform(self, sequences):
        """Fit on ``sequences`` and return their TF-IDF matrix."""
        self.fit(sequences)
        return self.transform(sequences)