# In [1]:
class Tokenizer:
    """Split sentences into words and map each word to an integer id.

    Id 0 is reserved for the ``'oov'`` entry; every other word receives the
    next free id the first time ``fit`` encounters it. Words that were never
    fitted are encoded with the ``'oov'`` id by ``transform``.
    """

    def __init__(self):
        # word -> integer id; 'oov' is seeded at id 0.
        self.word_dict = {'oov': 0}
        # Set to True once fit() has run to completion.
        self.fit_checker = False

    def preprocessing(self, sequences):
        """Clean each sentence and split it into lower-cased words.

        Every character other than a space, Hangul (``ㄱ-ㅣ``, ``가-힣``), an
        ASCII letter or a digit is deleted (not replaced by a space), after
        which the sentence is split on the remaining spaces.

        Args:
            sequences: Iterable of sentence strings.

        Returns:
            A list with one list of lower-cased words per sentence.
        """
        import re  # local import keeps the class self-contained

        # Compile once instead of looking the pattern up for every sentence.
        disallowed = re.compile(r'[^ ㄱ-ㅣ가-힣A-Za-z0-9]')
        return [
            [word.lower() for word in disallowed.sub('', sent).split()]
            for sent in sequences
        ]

    def fit(self, sequences):
        """Add every new word in *sequences* to the vocabulary.

        Existing ids are kept, so fitting again only extends ``word_dict``.

        Args:
            sequences: Iterable of sentence strings.
        """
        # Mark as unfitted while the vocabulary is being updated.
        self.fit_checker = False
        for sent in self.preprocessing(sequences):
            for word in sent:
                # setdefault leaves the id of an already-known word untouched.
                self.word_dict.setdefault(word, len(self.word_dict))
        self.fit_checker = True

    def transform(self, sequences):
        """Encode sentences as lists of word ids.

        Args:
            sequences: Iterable of sentence strings.

        Returns:
            A list with one list of integer ids per sentence. Words missing
            from the vocabulary are encoded with the ``'oov'`` id.

        Raises:
            Exception: If ``fit`` has not been called yet.
        """
        if not self.fit_checker:
            raise Exception("Tokenizer instance is not fitted yet.")

        # Unknown words fall back to the reserved 'oov' id instead of raising
        # KeyError.
        oov_id = self.word_dict['oov']
        return [
            [self.word_dict.get(word, oov_id) for word in sent]
            for sent in self.preprocessing(sequences)
        ]

    def fit_transform(self, sequences):
        """Fit the vocabulary on *sequences*, then return their encoding."""
        self.fit(sequences)
        return self.transform(sequences)

# In [2]:
class TfidfVectorizer:
    """Compute per-token TF-IDF weights on top of a word-id tokenizer.

    The tokenizer passed in must expose ``word_dict`` (word -> id mapping),
    ``fit_transform(sequences)`` and ``transform(sequences)``, the latter two
    returning one list of integer word ids per sentence.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        # Set to True once fit() has computed self.idf.
        self.fit_checker = False

    def fit(self, sequences):
        """Fit the tokenizer and compute one IDF value per vocabulary id.

        ``self.idf[i]`` is ``log(n_docs / (1 + df_i))`` where ``n_docs`` is the
        number of sentences and ``df_i`` the number of sentences containing
        word id ``i``.

        Args:
            sequences: Iterable of sentence strings.
        """
        import math
        from collections import Counter

        tokenized = self.tokenizer.fit_transform(sequences)
        vocab_size = len(self.tokenizer.word_dict)
        # The IDF numerator is the number of documents, not the vocabulary
        # size; the vocabulary size only determines how many entries idf has.
        n_docs = len(tokenized)

        # Document frequency: each sentence counts once per distinct id.
        doc_freq = Counter()
        for sent in tokenized:
            doc_freq.update(set(sent))

        if n_docs:
            self.idf = [
                math.log(n_docs / (1 + doc_freq[word_id]))
                for word_id in range(vocab_size)
            ]
        else:
            # log(0) is undefined; with no documents every weight is zero.
            self.idf = [0.0] * vocab_size

        self.fit_checker = True

    def transform(self, sequences):
        """Return the TF-IDF weight of every token in every sentence.

        Args:
            sequences: Iterable of sentence strings.

        Returns:
            A list with one list per sentence; entry ``j`` is the term
            frequency of the sentence's ``j``-th token within that sentence
            multiplied by the token's IDF. The same list is also stored in
            ``self.tfidf_matrix``.

        Raises:
            Exception: If ``fit`` has not been called yet.
        """
        if not self.fit_checker:
            raise Exception("TfidfVectorizer instance is not fitted yet.")

        from collections import Counter

        tokenized = self.tokenizer.transform(sequences)
        self.tfidf_matrix = []
        for sent in tokenized:
            # Count each id once per sentence instead of rescanning per token.
            term_freq = Counter(sent)
            self.tfidf_matrix.append(
                [term_freq[word_id] * self.idf[word_id] for word_id in sent]
            )
        return self.tfidf_matrix

    def fit_transform(self, sequences):
        """Fit on *sequences*, then return their TF-IDF weights."""
        self.fit(sequences)
        return self.transform(sequences)