In [5]:
from typing import List
from nltk import word_tokenize
from collections import Counter
import re
import string

In [6]:
# Small set of Indonesian-language question sentences used to demonstrate
# the feature extractors defined further down in this notebook.
sample_dataset = [
    "Bagaimana perbudakan berkembang dan kemudian meninggalkan Rusia?",
    "Film apa yang menampilkan karakter Popeye Doyle?",
    "Apa kepanjangan dari .com?",
    "Profesi apa yang tertua?",
    "Siapa yang membunuh Gandhi?",
]

# Bare last expression so the notebook displays the list.
sample_dataset

['Bagaimana perbudakan berkembang dan kemudian meninggalkan Rusia?',
 'Film apa yang menampilkan karakter Popeye Doyle?',
 'Apa kepanjangan dari .com?',
 'Profesi apa yang tertua?',
 'Siapa yang membunuh Gandhi?']

# Ekstraksi Fitur

## Fitur Unigram

In [11]:
def extract_unigram(sentences: List[str]):
    """Build a unigram bag-of-words matrix for a list of sentences.

    Each sentence is lower-cased and tokenized with ``word_tokenize``; tokens
    made up entirely of punctuation characters are discarded. The vocabulary
    is the sorted set of all remaining tokens across every sentence.

    Intermediate results (tokenization, filtered tokens, vocabulary and each
    count vector) are printed so the steps can be followed in the notebook.

    Args:
        sentences: Raw sentences to vectorize.

    Returns:
        A ``(bag_of_words, tokens)`` tuple where ``tokens`` is the sorted
        vocabulary and ``bag_of_words[i][j]`` is the number of times
        ``tokens[j]`` occurs in ``sentences[i]``.
    """
    # Compare character by character: `token in string.punctuation` would be
    # a substring test, which lets multi-character punctuation tokens such as
    # "..." or "``" slip through into the vocabulary.
    punctuation = set(string.punctuation)

    tokens = set()
    splitted_sentence = []
    for sentence in sentences:
        data = word_tokenize(sentence.lower())
        print("Splitted: ", data)
        data = [d for d in data if not all(ch in punctuation for ch in d)]
        print("Remove punctuation:", data)
        splitted_sentence.append(data)
        tokens.update(data)

    # Sort so the column order of the matrix is deterministic.
    tokens = sorted(tokens)
    print()
    print("Tokens :", tokens)
    print()

    bag_of_words = []
    for sentence in splitted_sentence:
        counter = Counter(sentence)
        # Counter yields 0 for tokens that do not occur in this sentence.
        row = [counter[word] for word in tokens]
        print("Vectorized: ", row)
        bag_of_words.append(row)
    return bag_of_words, tokens

# Demo: run the unigram extractor on the sample questions. The function prints
# its intermediate steps and the cell displays the returned
# (bag_of_words, tokens) pair.
extract_unigram(sample_dataset)

Splitted:  ['bagaimana', 'perbudakan', 'berkembang', 'dan', 'kemudian', 'meninggalkan', 'rusia', '?']
Remove punctuation: ['bagaimana', 'perbudakan', 'berkembang', 'dan', 'kemudian', 'meninggalkan', 'rusia']
Splitted:  ['film', 'apa', 'yang', 'menampilkan', 'karakter', 'popeye', 'doyle', '?']
Remove punctuation: ['film', 'apa', 'yang', 'menampilkan', 'karakter', 'popeye', 'doyle']
Splitted:  ['apa', 'kepanjangan', 'dari', '.com', '?']
Remove punctuation: ['apa', 'kepanjangan', 'dari', '.com']
Splitted:  ['profesi', 'apa', 'yang', 'tertua', '?']
Remove punctuation: ['profesi', 'apa', 'yang', 'tertua']
Splitted:  ['siapa', 'yang', 'membunuh', 'gandhi', '?']
Remove punctuation: ['siapa', 'yang', 'membunuh', 'gandhi']

Tokens : ['.com', 'apa', 'bagaimana', 'berkembang', 'dan', 'dari', 'doyle', 'film', 'gandhi', 'karakter', 'kemudian', 'kepanjangan', 'membunuh', 'menampilkan', 'meninggalkan', 'perbudakan', 'popeye', 'profesi', 'rusia', 'siapa', 'tertua', 'yang']

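Vectorized:  [0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0]
Vectorized:  [0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1]
Vectorized:  [1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Vectorized:  [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1]
Vectorized:  [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1]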

([[0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0],
  [0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1],
  [1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1],
  [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1]],
 ['.com',
  'apa',
  'bagaimana',
  'berkembang',
  'dan',
  'dari',
  'doyle',
  'film',
  'gandhi',
  'karakter',
  'kemudian',
  'kepanjangan',
  'membunuh',
  'menampilkan',
  'meninggalkan',
  'perbudakan',
  'popeye',
  'profesi',
  'rusia',
  'siapa',
  'tertua',
  'yang'])

## Fitur Question Type

In [12]:
def extract_question_type(sentences: List[str]):
    """Flag which question words appear in each sentence.

    Every sentence is lower-cased and mapped to a 6-element 0/1 vector whose
    columns follow the returned label order:
    ``['kenapa', 'dimana', 'apa', 'berapa', 'siapa', 'bagaimana']``.

    All columns except ``'apa'`` are set by a plain substring test on the
    whole sentence (``'kenapa'`` is also triggered by ``'mengapa'``). The
    ``'apa'`` column is set when any token produced by ``word_tokenize``
    starts with ``apa``, so that words merely containing it (for instance
    ``siapa`` or ``berapa``) do not trigger it.

    Args:
        sentences: Raw question sentences.

    Returns:
        A ``(label_list, label)`` tuple: one indicator vector per sentence,
        and the list of column labels.
    """
    label = ['kenapa', 'dimana', 'apa', 'berapa', 'siapa', 'bagaimana']

    # Column index -> substrings whose presence anywhere in the sentence
    # switches that column on. Column 2 ('apa') is handled per token below.
    substring_cues = {
        0: ('kenapa', 'mengapa'),
        1: ('dimana',),
        3: ('berapa',),
        4: ('siapa',),
        5: ('bagaimana',),
    }

    label_list = []
    for sentence in sentences:
        text = sentence.lower()
        flags = [0] * len(label)

        for column, cues in substring_cues.items():
            if any(cue in text for cue in cues):
                flags[column] = 1

        # Token-level prefix check for 'apa'.
        for token in word_tokenize(text):
            if re.match(r'^apa', token):
                flags[2] = 1
                break

        label_list.append(flags)

    return label_list, label

# Demo: classify the sample questions; the cell displays the returned
# (label_list, label) pair.
extract_question_type(sample_dataset)

([[0, 0, 0, 0, 0, 1],
  [0, 0, 1, 0, 0, 0],
  [0, 0, 1, 0, 0, 0],
  [0, 0, 1, 0, 0, 0],
  [0, 0, 0, 0, 1, 0]],
 ['kenapa', 'dimana', 'apa', 'berapa', 'siapa', 'bagaimana'])

## Fitur Word Shapes

In [None]:
def extract_word_shapes(sentences: List[str]):
    """Count, per sentence, how many tokens fall into each word-shape class.

    Sentences are tokenized with ``word_tokenize`` (case is preserved) and
    tokens made up entirely of punctuation characters are discarded. Each
    remaining token is tested against every shape:

    * ``uppercase`` - alphabetic and all upper-case
    * ``lowercase`` - alphabetic and all lower-case
    * ``mixedcase`` - alphabetic with both upper- and lower-case letters
    * ``numeric``   - the whole token is an optionally signed integer or
      decimal number, optionally with an exponent (e.g. ``3``, ``-1.5``,
      ``2e10``)
    * ``other``     - matches none of the shapes above

    The tokenization and the filtered tokens are printed for each sentence.

    Args:
        sentences: Raw sentences to analyse.

    Returns:
        A ``(word_shape_list, shape_names)`` tuple where
        ``word_shape_list[i][j]`` is the number of tokens of
        ``sentences[i]`` that have shape ``shape_names[j]``.
    """
    # `\.` is a literal decimal point. The pattern is applied with fullmatch
    # so tokens that merely start with digits ("2nd", "3D") are not counted
    # as numbers.
    numeric_pattern = re.compile(r'[+-]?[0-9]+(\.[0-9]+)?([Ee][+-]?[0-9]+)?')

    # Compare character by character: `word in string.punctuation` would be
    # a substring test and would keep tokens such as "..." or "``".
    punctuation = set(string.punctuation)

    word_shapes = [
        ('uppercase', lambda word: word.isupper() and word.isalpha()),
        ('lowercase', lambda word: word.islower() and word.isalpha()),
        ('mixedcase', lambda word: word.isalpha() and any(c.isupper() for c in word) and any(c.islower() for c in word)),
        ('numeric', lambda word: numeric_pattern.fullmatch(word) is not None),
        # Looks up `word_shapes` at call time, i.e. after the list is built,
        # and checks every predicate except itself.
        ('other', lambda word: not any(func(word) for _, func in word_shapes[:-1]))]

    word_shape_list = []
    for sentence in sentences:
        tokenized = word_tokenize(sentence)
        print("Splitted  :", tokenized)
        tokenized = [word for word in tokenized
                     if not all(ch in punctuation for ch in word)]
        print("Filtered  :", tokenized)
        word_shape_list.append([
            sum(1 for word in tokenized if func(word))
            for _, func in word_shapes
        ])

    return word_shape_list, [x[0] for x in word_shapes]