<a href="https://colab.research.google.com/github/citizenyves/wanted_pre_onboarding/blob/main/wanted_pre_onboarding_%EA%B9%80%ED%83%9C%EC%97%B0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import numpy as np
import pandas as pd # If necessary

# Question 1) Tokenizer

### 1-1) Tokenizer class

In [None]:
class Tokenizer:
    """Letters-only word tokenizer that maps each word to an integer id.

    Sentences are lower-cased and split into maximal runs of the letters
    ``a``-``z``; digits, punctuation and any other characters act as
    separators and are dropped. Ids are handed out in order of first
    appearance, starting at 1. Id 0 is reserved for the ``'oov'``
    (out-of-vocabulary) entry.
    """

    def __init__(self):
        # word -> id; the 'oov' entry (id 0) stands for unknown words
        self.word_dict = {'oov': 0}
        # id -> word; the 'oov' entry is deliberately left out
        self.id_dict = {}
        # True once fit() has run to completion
        self.fit_checker = False

    def preprocessing(self, sequences):
        """Lower-case every sentence and split it into letter-only tokens.

        Args:
            sequences: iterable of sentence strings.

        Returns:
            A nested list with one list of tokens per sentence.
        """
        word_pattern = re.compile('[a-z]+')
        return [word_pattern.findall(text.lower()) for text in sequences]

    def fit(self, sequences):
        """Add every unseen token in ``sequences`` to the vocabulary.

        The vocabulary is not cleared first, so repeated calls keep
        extending it. ``fit_checker`` is False while the update is running
        and True once it has finished.

        Args:
            sequences: iterable of sentence strings.
        """
        self.fit_checker = False

        for sentence in self.preprocessing(sequences):
            for word in sentence:
                if word in self.word_dict:
                    continue
                # word_dict already holds 'oov', so its size is the next free id
                next_id = len(self.word_dict)
                self.word_dict[word] = next_id
                self.id_dict[next_id] = word

        self.fit_checker = True

    def transform(self, sequences):
        """Encode sentences as lists of vocabulary ids.

        Tokens that are not in the vocabulary get the ``'oov'`` id.

        Args:
            sequences: iterable of sentence strings.

        Returns:
            A nested list with one list of integer ids per sentence.

        Raises:
            Exception: if the tokenizer has not been fitted yet.
        """
        sentences = self.preprocessing(sequences)
        if not self.fit_checker:
            raise Exception("Tokenizer instance is not fitted yet.")

        encoded = []
        for sentence in sentences:
            ids = []
            for word in sentence:
                if word in self.word_dict:
                    ids.append(self.word_dict[word])
                else:
                    ids.append(self.word_dict['oov'])
            encoded.append(ids)
        return encoded

    def fit_transform(self, sequences):
        """Fit the vocabulary on ``sequences`` and return their encoding."""
        self.fit(sequences)
        return self.transform(sequences)

### 1-2) An example using the Tokenizer

In [None]:
# Example corpus: three short English sentences
sequences = [
    "Data Science is the SEXIEST job of the 21st century.",
    "Machine learning is the key for data science~",
    "The machine cleans keys for free!",
]

# Create a fresh, unfitted tokenizer
tokenizer = Tokenizer()

# Learn the vocabulary and encode the corpus; each heading is printed on the
# line directly above its value
print("tokenizing and integer indexing", tokenizer.fit_transform(sequences), sep="\n")
print("\n")
print("word_dict", tokenizer.word_dict, sep="\n")
print("\n")
print("id_dict (oov token removed)", tokenizer.id_dict, sep="\n")

tokenizing and integer indexing
[[1, 2, 3, 4, 5, 6, 7, 4, 8, 9], [10, 11, 3, 4, 12, 13, 1, 2], [4, 10, 14, 15, 13, 16]]


word_dict
{'oov': 0, 'data': 1, 'science': 2, 'is': 3, 'the': 4, 'sexiest': 5, 'job': 6, 'of': 7, 'st': 8, 'century': 9, 'machine': 10, 'learning': 11, 'key': 12, 'for': 13, 'cleans': 14, 'keys': 15, 'free': 16}


id_dict (oov token removed)
{1: 'data', 2: 'science', 3: 'is', 4: 'the', 5: 'sexiest', 6: 'job', 7: 'of', 8: 'st', 9: 'century', 10: 'machine', 11: 'learning', 12: 'key', 13: 'for', 14: 'cleans', 15: 'keys', 16: 'free'}


# Question 2) TF/IDF 

### 2-1) TfidfVectorizer class

In [None]:
class TfidfVectorizer():
    """TF-IDF vectorizer built on top of a word-to-id tokenizer.

    The tokenizer must provide ``fit_transform(sequences)`` and
    ``transform(sequences)``, each returning one list of integer ids per
    sentence, and an ``id_dict`` attribute mapping vocabulary ids to words.

    Weights are computed as ``tf * idf`` where ``tf`` is the raw count of a
    word in a sentence and ``idf = ln(n / (1 + df))``, with ``n`` the number
    of fitted sentences and ``df`` the number of those sentences containing
    the word. A word that occurs in every fitted sentence therefore gets a
    negative idf.

    The result is a plain nested list. For a labelled table it can be wrapped
    as ``pd.DataFrame(matrix, columns=vectorizer.id_dict.values())``.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        # True once fit() has run to completion
        self.fit_checker = False

    def fit(self, sequences):
        """Fit the tokenizer and compute one idf value per vocabulary word.

        Sets ``self.id_dict`` (id -> word, copied at fit time) and
        ``self.idf`` (list of idf values in ``self.id_dict`` order).

        Args:
            sequences: sized collection of sentence strings.
        """
        tokenized = self.tokenizer.fit_transform(sequences)

        # Copy the vocabulary so the matrix columns stay aligned with
        # self.idf even if the tokenizer learns more words after this call.
        self.id_dict = dict(self.tokenizer.id_dict)
        n = len(sequences)

        # One set per sentence makes each membership test O(1).
        ids_per_sentence = [set(tokens) for tokens in tokenized]

        self.idf = []
        for key in self.id_dict:
            df = sum(1 for ids in ids_per_sentence if key in ids)
            self.idf.append(np.log(n / (1 + df)))

        self.fit_checker = True

    def transform(self, sequences):
        """Return the TF-IDF matrix of ``sequences`` as a nested list.

        Sets ``self.tf`` (raw counts) and ``self.tfidf_matrix`` (the returned
        value). Ids that are not part of the fitted vocabulary, such as the
        tokenizer's out-of-vocabulary id, are not counted.

        Args:
            sequences: collection of sentence strings.

        Returns:
            One row per sentence and one column per fitted vocabulary word,
            in ``self.id_dict`` order. An empty input gives ``[]``.

        Raises:
            Exception: if the vectorizer has not been fitted yet.
        """
        if not self.fit_checker:
            raise Exception("TfidfVectorizer instance is not fitted yet.")

        tokenized = self.tokenizer.transform(sequences)

        # Count by the actual vocabulary ids rather than by position.
        self.tf = [
            [tokens.count(key) for key in self.id_dict]
            for tokens in tokenized
        ]

        # Force a 2-D (sentences x vocabulary) shape so that an empty input
        # still broadcasts against the idf vector instead of raising.
        tf_matrix = np.array(self.tf, dtype=float).reshape(
            len(self.tf), len(self.idf)
        )
        self.tfidf_matrix = (tf_matrix * np.array(self.idf, dtype=float)).tolist()
        return self.tfidf_matrix

    def fit_transform(self, sequences):
        """Fit on ``sequences`` and return their TF-IDF matrix."""
        self.fit(sequences)
        return self.transform(sequences)

### 2-2) An example using the TfidfVectorizer

In [None]:
# Build the vectorizer around the tokenizer created in the earlier example
tfidfvectorizer = TfidfVectorizer(tokenizer=tokenizer)

In [None]:
# Fit on the example sentences and compute their TF-IDF matrix
# (one row per sentence, one column per vocabulary word).
# As the cell's final expression, the returned nested list is what gets displayed.
tfidfvectorizer.fit_transform(sequences)

[[0.0,
  0.0,
  0.0,
  -0.5753641449035618,
  0.4054651081081644,
  0.4054651081081644,
  0.4054651081081644,
  0.4054651081081644,
  0.4054651081081644,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 [0.0,
  0.0,
  0.0,
  -0.2876820724517809,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.4054651081081644,
  0.4054651081081644,
  0.0,
  0.0,
  0.0,
  0.0],
 [0.0,
  0.0,
  0.0,
  -0.2876820724517809,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.4054651081081644,
  0.4054651081081644,
  0.4054651081081644]]

In [None]:
# Same fit_transform call, displayed as a table.
# NOTE(review): the table shown below has the vocabulary words as column
# headers, so the nested-list result was presumably wrapped in a pandas
# DataFrame when this cell was run; the call as written returns a nested list.
tfidfvectorizer.fit_transform(sequences)

Unnamed: 0,data,science,is,the,sexiest,job,of,st,century,machine,learning,key,for,cleans,keys,free
0,0.0,0.0,0.0,-0.575364,0.405465,0.405465,0.405465,0.405465,0.405465,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,-0.287682,0.0,0.0,0.0,0.0,0.0,0.0,0.405465,0.405465,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,-0.287682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.405465,0.405465,0.405465
