<a href="https://colab.research.google.com/github/citizenyves/wanted_pre_onboarding/blob/main/wanted_pre_onboarding_%EA%B9%80%ED%83%9C%EC%97%B0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
import numpy as np

# Question 1) Tokenizer

### 1-1) Tokenizer class

In [2]:
class Tokenizer():
    """Word-level tokenizer that maps lowercase alphabetic words to integer ids.

    Attributes:
        word_dict: Mapping from token to integer id. Id 0 is reserved for the
            out-of-vocabulary entry stored under the key 'oov'.
        fit_checker: True once ``fit`` has completed at least once.
    """

    def __init__(self):
        self.word_dict = {'oov': 0}
        self.fit_checker = False

    def preprocessing(self, sequences):
        """Lowercase every sequence and split it into runs of letters a-z.

        Digits, punctuation and whitespace only act as separators, so
        "21st" yields the single token "st".

        Args:
            sequences: Iterable of strings.

        Returns:
            A list with one list of tokens per input sequence.
        """
        word_pattern = re.compile('[a-z]+')
        lowered = [text.lower() for text in sequences]
        return [word_pattern.findall(text) for text in lowered]

    def fit(self, sequences):
        """Extend the vocabulary with every unseen token in ``sequences``.

        New tokens receive the next free id in order of first appearance.
        Existing entries are kept, so repeated calls accumulate vocabulary.
        """
        self.fit_checker = False
        vocabulary = self.word_dict
        for words in self.preprocessing(sequences):
            for word in words:
                # The default is evaluated before insertion, so it is the next free id.
                vocabulary.setdefault(word, len(vocabulary))
        self.fit_checker = True

    def transform(self, sequences):
        """Convert each sequence into a list of token ids.

        Tokens missing from the vocabulary are mapped to the 'oov' id.

        Raises:
            Exception: If the tokenizer has not been fitted yet.
        """
        tokenized = self.preprocessing(sequences)
        if not self.fit_checker:
            raise Exception("Tokenizer instance is not fitted yet.")

        lookup = self.word_dict
        encoded = []
        for words in tokenized:
            ids = [lookup[word] if word in lookup else lookup['oov'] for word in words]
            encoded.append(ids)
        return encoded

    def fit_transform(self, sequences):
        """Fit on ``sequences`` and return their integer encoding."""
        self.fit(sequences)
        return self.transform(sequences)

### 1-2) An example using the Tokenizer class

In [3]:
# Example corpus: three sentences with mixed case, digits and punctuation
sequences = [
    "Data Science is the SEXIEST job of the 21st century.",
    "Machine learning is the key for data science~",
    "The machine cleans keys for free!",
]

# Instantiate the tokenizer
tokenizer = Tokenizer()

# preprocessing: show the lowercase word tokens of each sentence
print("preprocessing", tokenizer.preprocessing(sequences), "\n", sep="\n")

# fit: build the vocabulary, then show the token -> id mapping
tokenizer.fit(sequences)
print("fit : word_dict", tokenizer.word_dict, "\n", sep="\n")

# transform: show each sentence as a list of token ids
print("transform : integer indexing", tokenizer.transform(sequences), "\n", sep="\n")

# fit_transform: fit and encode in a single call
print(
    "fit_transform : tokenizing and integer indexing",
    tokenizer.fit_transform(sequences),
    sep="\n",
)

preprocessing
[['data', 'science', 'is', 'the', 'sexiest', 'job', 'of', 'the', 'st', 'century'], ['machine', 'learning', 'is', 'the', 'key', 'for', 'data', 'science'], ['the', 'machine', 'cleans', 'keys', 'for', 'free']]


fit : word_dict
{'oov': 0, 'data': 1, 'science': 2, 'is': 3, 'the': 4, 'sexiest': 5, 'job': 6, 'of': 7, 'st': 8, 'century': 9, 'machine': 10, 'learning': 11, 'key': 12, 'for': 13, 'cleans': 14, 'keys': 15, 'free': 16}


transform : integer indexing
[[1, 2, 3, 4, 5, 6, 7, 4, 8, 9], [10, 11, 3, 4, 12, 13, 1, 2], [4, 10, 14, 15, 13, 16]]


fit_transform : tokenizing and integer indexing
[[1, 2, 3, 4, 5, 6, 7, 4, 8, 9], [10, 11, 3, 4, 12, 13, 1, 2], [4, 10, 14, 15, 13, 16]]


# Question 2) TF-IDF

### 2-1) TfidfVectorizer class

In [4]:
class TfidfVectorizer():
    """Compute TF-IDF vectors from the integer encoding produced by a tokenizer.

    The tokenizer must provide ``fit_transform(sequences)`` and
    ``transform(sequences)``, each returning one list of integer token ids per
    sequence, and a ``word_dict`` mapping token -> id in which id 0 is the
    out-of-vocabulary entry. Id 0 never gets a column in the output.

    Attributes:
        tokenizer: The tokenizer used to encode sequences.
        fit_checker: True once ``fit`` has completed.
        word_dict: Snapshot of the tokenizer vocabulary taken by ``fit``.
        idf: One IDF value per vocabulary id (excluding id 0), set by ``fit``.
        tf: Raw term counts of the last ``transform`` call.
        tfidf_matrix: Result of the last ``transform`` call.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.fit_checker = False

    def _vocab_ids(self):
        """Return the vocabulary ids in column order, skipping the OOV id 0."""
        return [token_id for token_id in self.word_dict.values() if token_id != 0]

    def fit(self, sequences):
        """Fit the tokenizer on ``sequences`` and compute the IDF of every token.

        IDF is ``log(n / (1 + df))`` where ``n`` is the number of sequences and
        ``df`` the number of sequences containing the token. With this formula
        a token present in every sequence gets a negative IDF.

        Args:
            sequences: Sized collection of strings.
        """
        tokenized = self.tokenizer.fit_transform(sequences)

        # Snapshot the vocabulary: the tokenizer may be shared and extended
        # later, and the columns used by transform must stay aligned with idf.
        self.word_dict = dict(self.tokenizer.word_dict)
        n = len(sequences)

        # Document frequency in one pass; set() counts each token at most
        # once per sequence.
        doc_freq = {}
        for sent in tokenized:
            for token_id in set(sent):
                doc_freq[token_id] = doc_freq.get(token_id, 0) + 1

        self.idf = [
            np.log(n / (1 + doc_freq.get(token_id, 0)))
            for token_id in self._vocab_ids()
        ]

        self.fit_checker = True

    def transform(self, sequences):
        """Return the TF-IDF matrix of ``sequences`` as a nested list.

        Each row corresponds to one sequence and each column to one vocabulary
        id (in vocabulary order, excluding id 0). Term frequency is the raw
        count of the token in the sequence.

        Raises:
            Exception: If the vectorizer has not been fitted yet.
        """
        if not self.fit_checker:
            raise Exception("TfidfVectorizer instance is not fitted yet.")

        tokenized = self.tokenizer.transform(sequences)
        vocab_ids = self._vocab_ids()

        self.tf = []
        for sent in tokenized:
            counts = {}
            for token_id in sent:
                counts[token_id] = counts.get(token_id, 0) + 1
            self.tf.append([counts.get(token_id, 0) for token_id in vocab_ids])

        # Force a 2-D shape so that an empty batch (or an empty vocabulary)
        # still broadcasts against the IDF vector.
        tf_matrix = np.asarray(self.tf, dtype=float).reshape(len(self.tf), len(vocab_ids))
        idf_vector = np.asarray(self.idf, dtype=float)
        self.tfidf_matrix = np.multiply(tf_matrix, idf_vector).tolist()
        return self.tfidf_matrix

    def fit_transform(self, sequences):
        """Fit on ``sequences`` and return their TF-IDF matrix."""
        self.fit(sequences)
        return self.transform(sequences)

### 2-2) An example using the TfidfVectorizer class

In [5]:
# Build the vectorizer on top of the existing tokenizer instance
tfidfvectorizer = TfidfVectorizer(tokenizer)

# fit: compute the IDF values, then show them
tfidfvectorizer.fit(sequences)
print("fit : IDF", tfidfvectorizer.idf, "\n", sep="\n")

# transform: show the TF-IDF matrix of the same sentences
print("transform : tfidf_matrix", tfidfvectorizer.transform(sequences), "\n", sep="\n")

# fit_transform: fit and vectorize in a single call
print("fit_transform", tfidfvectorizer.fit_transform(sequences), sep="\n")

fit : IDF
[0.0, 0.0, 0.0, -0.2876820724517809, 0.4054651081081644, 0.4054651081081644, 0.4054651081081644, 0.4054651081081644, 0.4054651081081644, 0.0, 0.4054651081081644, 0.4054651081081644, 0.0, 0.4054651081081644, 0.4054651081081644, 0.4054651081081644]


transform : tfidf_matrix
[[0.0, 0.0, 0.0, -0.5753641449035618, 0.4054651081081644, 0.4054651081081644, 0.4054651081081644, 0.4054651081081644, 0.4054651081081644, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, -0.2876820724517809, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4054651081081644, 0.4054651081081644, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, -0.2876820724517809, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4054651081081644, 0.4054651081081644, 0.4054651081081644]]


fit_transform
[[0.0, 0.0, 0.0, -0.5753641449035618, 0.4054651081081644, 0.4054651081081644, 0.4054651081081644, 0.4054651081081644, 0.4054651081081644, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, -0.2876820724517809, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.405465108108