# Problem 1: Creating Tokenizer

In [39]:
import re

In [40]:
class Tokenizer():
  """Split sentences into lower-cased words and map each word to an integer.

  Index 0 is reserved for the 'oov' (out-of-vocabulary) entry; every other
  word receives the next free positive index the first time fit() sees it.
  """

  # Compiled once: anything that is neither a word character nor whitespace.
  _NON_WORD = re.compile(r'[^\w\s]')
  # Compiled once: a maximal run of word characters (letters, digits, '_').
  _WORD = re.compile(r'\w+')

  def __init__(self):
    # Vocabulary: word -> integer index. 'oov' (index 0) is what transform()
    # emits for words that are not in the vocabulary.
    self.word_dict = {'oov': 0}
    # True once fit() has completed; transform() refuses to run before that.
    self.fit_checker = False

  def preprocessing(self, sequences):
    """Return one list of lower-cased word tokens per input string.

    Punctuation is deleted rather than replaced by a space, so "Joe's"
    becomes the single token 'joes'.

    Args:
      sequences: iterable of strings.

    Returns:
      A list with one list of tokens for each input string.
    """
    result = []
    for sequence in sequences:
      # Lower-case before stripping so that any non-word character produced
      # by case mapping is removed as well instead of leaking out as a token.
      cleaned = self._NON_WORD.sub('', sequence.lower())
      result.append(self._WORD.findall(cleaned))
    return result

  def fit(self, sequences):
    """Add every unseen word in `sequences` to the vocabulary.

    Words already present in word_dict keep their index. New words continue
    numbering after the largest index currently in use, so calling fit()
    more than once never assigns the same index to two different words.

    Args:
      sequences: iterable of strings.
    """
    self.fit_checker = False
    # Continue after the highest index in use instead of restarting at 1;
    # restarting would collide with words learned by an earlier fit().
    next_index = max(self.word_dict.values(), default=0) + 1
    for tokens in self.preprocessing(sequences):
      for word in tokens:
        if word not in self.word_dict:
          self.word_dict[word] = next_index
          next_index += 1
    self.fit_checker = True

  def transform(self, sequences):
    """Convert each string into the list of indices of its words.

    Words missing from the vocabulary are encoded with the 'oov' index.

    Args:
      sequences: iterable of strings.

    Returns:
      A nested list: one list of integer indices per input string.

    Raises:
      Exception: if fit() has not been called yet.
    """
    if not self.fit_checker:
      raise Exception("Tokenizer instance is not fitted yet.")
    oov_index = self.word_dict['oov']
    return [
      [self.word_dict.get(word, oov_index) for word in tokens]
      for tokens in self.preprocessing(sequences)
    ]

  def fit_transform(self, sequences):
    """Fit the vocabulary on `sequences`, then return them encoded."""
    self.fit(sequences)
    result = self.transform(sequences)
    return result

## Test 1-1

In [41]:
'''Test case 1-1'''
# Tokenize three sample sentences and print the resulting token lists.
tokenizer = Tokenizer()
# Covers trailing punctuation, upper-case words and an apostrophe.
test_sample1 = ['I go to school.', 'I LIKE pizza!', "Joe's jacket looks cool"]
print(tokenizer.preprocessing(test_sample1))

[['i', 'go', 'to', 'school'], ['i', 'like', 'pizza'], ['joes', 'jacket', 'looks', 'cool']]


## Test 1-2

In [42]:
'''Test 1-2'''
# Build the vocabulary from test_sample1 (defined in the Test 1-1 cell).
tokenizer.fit(test_sample1)
# Bare expression: the word -> index mapping is only displayed when this is
# run as the last line of a notebook cell.
tokenizer.word_dict

{'cool': 10,
 'go': 2,
 'i': 1,
 'jacket': 8,
 'joes': 7,
 'like': 5,
 'looks': 9,
 'oov': 0,
 'pizza': 6,
 'school': 4,
 'to': 3}

## Test 1-3

In [43]:
'''Test 1-3'''
# Same three sentences as test_sample1 plus a fourth one containing 'hate',
# a word that does not occur in test_sample1.
test_sample2 = ['I go to school.', 'I LIKE pizza!', "Joe's jacket looks cool", 'I hate to go to school']
# Encode with the tokenizer fitted in the Test 1-2 cell.
print(tokenizer.transform(test_sample2))

[[1, 2, 3, 4], [1, 5, 6], [7, 8, 9, 10], [1, 0, 3, 2, 3, 4]]


# Problem 2: Creating TfidfVectorizer

In [44]:
import math

In [121]:
class TfidfVectorizer:
  """Build per-document term-count rows and per-token IDF weights.

  The wrapped tokenizer must provide fit_transform(sequences) and
  transform(sequences), each returning one list of integer token ids per
  input string. Token id 0 is treated as "out of vocabulary" and positive
  id t is stored in column t - 1.
  """

  def __init__(self, tokenizer):
    # Object used to turn raw strings into lists of integer token ids.
    self.tokenizer = tokenizer
    # True once fit() has completed; transform() refuses to run before that.
    self.fit_checker = False

  def fit(self, sequences):
    """Fit the tokenizer on `sequences` and compute one IDF value per token.

    After this call self.idf[t - 1] holds log(n / (1 + df(t))) for every
    token id t from 1 up to the largest id seen, where n is the number of
    sequences and df(t) is the number of sequences that contain t.

    Args:
      sequences: iterable of strings.
    """
    tokenized = self.tokenizer.fit_transform(sequences)
    n = len(tokenized)  # number of documents

    # Document frequency: in how many documents each token id occurs.
    # Id 0 (out of vocabulary) has no column and is therefore skipped.
    df = {}
    for tokens in tokenized:
      for t in set(tokens):
        if t > 0:
          df[t] = df.get(t, 0) + 1

    # Index the weights by token id rather than by first-seen order so that
    # column t - 1 always belongs to token t, even when the tokenizer already
    # knew some words before this call.
    vocab_size = max(df, default=0)
    # math.log's default base is e.
    # Addition by 1 in the denominator prevents division by 0.
    self.idf = [
      math.log(n / (1 + df.get(t, 0))) for t in range(1, vocab_size + 1)
    ]

    self.fit_checker = True

  def transform(self, sequences):
    """Return the term-count matrix for `sequences`.

    Row d, column t - 1 holds how often token id t occurs in document d.
    Ids without a column (0, or ids larger than the fitted vocabulary) are
    ignored instead of being counted in an unrelated column.

    NOTE(review): the returned values are raw counts; the weights in
    self.idf are not applied here -- confirm whether callers expect the
    counts to be multiplied by the IDF values.

    Args:
      sequences: iterable of strings.

    Returns:
      A list with one list of integer counts per input string; the same
      list is also stored in self.tfidf_matrix.

    Raises:
      Exception: if fit() has not been called yet.
    """
    if not self.fit_checker:
      raise Exception("TfidfVectorizer instance is not fitted yet.")

    tokenized = self.tokenizer.transform(sequences)
    m = len(self.idf)  # number of columns (fitted vocabulary size)

    self.tfidf_matrix = []
    for tokens in tokenized:
      counts = [0] * m
      for t in tokens:
        # Without this check id 0 would index counts[-1] and silently
        # inflate the last vocabulary entry.
        if 1 <= t <= m:
          counts[t - 1] += 1
      self.tfidf_matrix.append(counts)

    return self.tfidf_matrix

  def fit_transform(self, sequences):
    """Fit on `sequences`, then return their term-count matrix."""
    self.fit(sequences)
    return self.transform(sequences)

## Test 2-1

In [122]:
'''Test 2-1'''
# The third sentence repeats 'I' and 'go' within a single document.
test_sample3 = ['I go to school.', 'I LIKE pizza!', 'I I I go go']
# Start from a fresh tokenizer so token ids are assigned from this corpus only.
tokenizer = Tokenizer()
tfidf = TfidfVectorizer(tokenizer)
# fit() as defined in this file has no return statement, so `result` is None;
# the fitted state lives on the `tfidf` object.
result = tfidf.fit(test_sample3)

## Test 2-2

In [123]:
'''Test 2-2'''
# Bare expression: the matrix returned for the fitted corpus is only
# displayed when this is run as the last line of a notebook cell.
tfidf.transform(test_sample3)

[[1, 1, 1, 1, 0, 0], [1, 0, 0, 0, 1, 1], [3, 2, 0, 0, 0, 0]]