### 원티드 프리온보딩 AI/ML 코스: 수강생 선발과제

In [33]:
import re
import math
import numpy as np

#### 문제 1) Tokenizer 생성하기

In [34]:
class Tokenizer():
    """Whitespace tokenizer that maps lower-cased alphanumeric words to ids.

    ``word_dict`` maps each known word to an integer id. It starts with the
    single entry ``'oov': 0``; id 0 is what ``transform`` emits for any word
    that is not in the dictionary. ``fit_checker`` records whether ``fit``
    has completed at least once.
    """

    def __init__(self):
        self.word_dict = {'oov': 0}
        self.fit_checker = False

    def preprocessing(self, sequences):
        """Split each sentence into lower-cased alphanumeric words.

        Every character outside ``a-z``, ``A-Z`` and ``0-9`` is replaced by a
        space before the sentence is lower-cased and split on whitespace.

        Args:
            sequences: iterable of sentence strings.

        Returns:
            A list with one list of word strings per input sentence.
        """
        return [
            re.sub("[^a-zA-Z0-9]", " ", text).lower().split()
            for text in sequences
        ]

    def fit(self, sequences):
        """Add every unseen word in ``sequences`` to ``word_dict``.

        Existing entries are kept, so repeated calls extend the dictionary.
        New words receive the next free id in order of first appearance.

        Args:
            sequences: iterable of sentence strings.
        """
        self.fit_checker = False
        vocab = self.word_dict
        for words in self.preprocessing(sequences):
            for word in words:
                # setdefault leaves known words untouched; len(vocab) is the
                # next unused id because ids are assigned contiguously from 0.
                vocab.setdefault(word, len(vocab))
        self.fit_checker = True

    def transform(self, sequences):
        """Convert sentences to lists of word ids.

        Args:
            sequences: iterable of sentence strings.

        Returns:
            A list with one list of integer ids per sentence; words missing
            from ``word_dict`` become 0.

        Raises:
            Exception: if ``fit`` has not completed on this instance.
        """
        # Preprocess first so invalid input fails the same way whether or
        # not the tokenizer has been fitted.
        tokenized = self.preprocessing(sequences)
        if not self.fit_checker:
            raise Exception("Tokenizer instance is not fitted yet.")
        lookup = self.word_dict.get
        return [[lookup(word, 0) for word in words] for words in tokenized]

    def fit_transform(self, sequences):
        """Fit on ``sequences`` and return their id representation."""
        self.fit(sequences)
        return self.transform(sequences)

#### 문제 2) TfidfVectorizer 생성하기

In [35]:
class TfidfVectorizer:
    """Build TF-IDF rows from sentences using a supplied tokenizer.

    The tokenizer must provide ``fit_transform(sequences)`` and
    ``transform(sequences)`` returning one list of integer token ids per
    sentence, and a ``word_dict`` whose length is the vocabulary size
    including id 0. Id 0 gets no column (the Tokenizer in this file uses it
    for out-of-vocabulary words); column ``j`` corresponds to token id
    ``j + 1``.

    Attributes set by the methods:
        idf_matrix: list of IDF values, one per token id ``1..V`` (``fit``).
        tfidf_matrix: result of the most recent ``transform`` call.
        fit_checker: True once ``fit`` has completed.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.fit_checker = False

    def fit(self, sequences):
        """Fit the tokenizer and compute the IDF of every vocabulary token.

        IDF is ``log(n / (1 + df))`` where ``n`` is the number of sentences
        and ``df`` the number of sentences containing the token.

        Args:
            sequences: iterable of sentence strings.

        Raises:
            ValueError: if the vocabulary is non-empty but ``sequences``
                yields no sentences (the IDF would be ``log(0)``).
        """
        tokenized = self.tokenizer.fit_transform(sequences)
        token_num = len(self.tokenizer.word_dict) - 1
        doc_num = len(tokenized)
        if token_num > 0 and doc_num == 0:
            raise ValueError("Cannot compute IDF from an empty corpus.")

        # One pass over each sentence's distinct tokens instead of scanning
        # every sentence once per vocabulary entry.
        doc_freq = {}
        for sentence in tokenized:
            for token in set(sentence):
                doc_freq[token] = doc_freq.get(token, 0) + 1

        self.idf_matrix = [
            math.log(doc_num / (1 + doc_freq.get(token, 0)))
            for token in range(1, token_num + 1)
        ]
        self.fit_checker = True

    def transform(self, sequences):
        """Return the TF-IDF matrix of ``sequences`` as a list of rows.

        Term frequency is the raw count of the token in the sentence.

        Args:
            sequences: iterable of sentence strings.

        Returns:
            A list with one row per sentence; each row holds one float per
            token id ``1..V``. The same list is stored in ``tfidf_matrix``.

        Raises:
            Exception: if ``fit`` has not completed on this instance.
        """
        if not self.fit_checker:
            raise Exception("TfidfVectorizer instance is not fitted yet.")

        tokenized = self.tokenizer.transform(sequences)
        token_num = len(self.tokenizer.word_dict) - 1
        idf = self.idf_matrix

        rows = []
        for sentence in tokenized:
            # Count each id once per sentence rather than calling
            # list.count for every vocabulary token.
            counts = {}
            for token in sentence:
                counts[token] = counts.get(token, 0) + 1
            rows.append([
                counts.get(token, 0) * idf[token - 1]
                for token in range(1, token_num + 1)
            ])

        self.tfidf_matrix = rows
        return self.tfidf_matrix

    def fit_transform(self, sequences):
        """Fit on ``sequences`` and return their TF-IDF matrix."""
        self.fit(sequences)
        return self.transform(sequences)