In [38]:
from Pipeline import Pipeline
from Language import Language
from BinaryQuestionClassifier import BinaryQuestionClassifier

In [39]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from datasets import load_dataset
import nltk
from nltk.stem.snowball import SnowballStemmer                           
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import libvoikko
import numpy as np
import advertools as adv


In [40]:
# Fetch the NLTK resources this notebook needs: the stop-word corpus read by
# `stopwords.words(...)` in the cleaners, and the Punkt tokenizer data
# (presumably what `word_tokenize` relies on -- confirm for the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [41]:
# Load the 'copenlu/answerable_tydiqa' dataset; later cells filter it by its 'language' field.
dataset = load_dataset('copenlu/answerable_tydiqa')

100%|██████████| 2/2 [00:00<00:00,  5.28it/s]


In [42]:
def clean_english(text):
    """Lowercase, stop-word filter and stem a sequence of English tokens.

    Args:
        text: Iterable of strings (presumably the token list produced by the
            tokenizer). Note that a plain ``str`` would be iterated character
            by character.

    Returns:
        list: Snowball stems of the lowercased items that are not in NLTK's
        English stop-word list, in their original order.
    """
    lowered = [token.lower() for token in text]
    english_stop_words = set(stopwords.words('english'))
    snowball = SnowballStemmer("english")

    stems = []
    for token in lowered:
        if token in english_stop_words:
            continue  # stop words are dropped before stemming
        stems.append(snowball.stem(token))
    return stems

In [43]:
def clean_finnish(text):
    """Lowercase, stop-word filter and stem a sequence of Finnish tokens.

    Args:
        text: Iterable of strings (presumably the token list produced by the
            tokenizer). Note that a plain ``str`` would be iterated character
            by character.

    Returns:
        list: Snowball stems of the lowercased items that are not in NLTK's
        Finnish stop-word list, in their original order.
    """
    lowered = [token.lower() for token in text]
    finnish_stop_words = set(stopwords.words('finnish'))
    stem = SnowballStemmer("finnish").stem

    # Lazily drop stop words, then stem whatever survives.
    kept = (token for token in lowered if token not in finnish_stop_words)
    return [stem(token) for token in kept]

In [44]:
def clean_japanese(text):
    """Lowercase the items of ``text`` and remove Japanese stop words.

    No stemming is applied here, unlike the English and Finnish cleaners.

    Args:
        text: Iterable of strings. Note that a plain ``str`` would be iterated
            character by character.

    Returns:
        list: The lowercased items that are not in advertools' Japanese
        stop-word collection, in their original order.
    """
    lowered = [item.lower() for item in text]
    japanese_stop_words = set(adv.stopwords['japanese'])
    return list(filter(lambda item: item not in japanese_stop_words, lowered))


def tokenize_japanese(text):
    """Placeholder Japanese tokenizer: hands ``text`` back unchanged.

    NOTE(review): nothing is split here, so if the pipeline forwards a ``str``
    question to ``clean_japanese``, that function will iterate it one
    character at a time rather than word by word -- confirm this is intended.
    """
    return text


# Disabled fugashi-based word segmentation, kept for reference:
# from fugashi import Tagger
# japanese_tagger = Tagger('-Owakati') # Tagger has initial startup overhead, therefore it is defined here and not in lambda function
# tokenize_japanese = lambda text : japanese_tagger.parse(text).split(" ")

In [45]:
def get_data(language):
    """Return ``dataset`` filtered to the examples whose 'language' field equals ``language``."""
    return dataset.filter(lambda example: example['language'] == language)


# define languages
# (tokenizer, cleaner) pair for each supported language, in processing order.
_language_tools = {
    'english': (word_tokenize, clean_english),
    'japanese': (tokenize_japanese, clean_japanese),
    'finnish': (word_tokenize, clean_finnish),
}

# One Language record per entry, each with its own Pipeline over that language's data.
languages = {
    name: Language(
        name = name,
        tokenizer = tokenizer,
        cleaner = cleaner,
        pipeline = Pipeline(get_data(name)),
    )
    for name, (tokenizer, cleaner) in _language_tools.items()
}



In [46]:
# Preprocess the data: run each language's pipeline through its three stages, in order.

for language in languages.values():
    pipeline = language.pipeline
    pipeline.tokenize(language.tokenizer)  # uses the language-specific tokenizer
    pipeline.clean(language.cleaner)       # uses the language-specific cleaner
    pipeline.label_answerable()

100%|██████████| 8778/8778 [00:11<00:00, 781.87ex/s] 
100%|██████████| 1036/1036 [00:00<00:00, 1625.10ex/s]
100%|██████████| 8778/8778 [00:04<00:00, 1850.03ex/s]
100%|██████████| 1036/1036 [00:00<00:00, 1888.55ex/s]
100%|██████████| 8778/8778 [00:05<00:00, 1516.08ex/s]
100%|██████████| 1036/1036 [00:00<00:00, 1643.59ex/s]


# 1.1.b

In [47]:
# Find the most common first and last words in each language

# The helpers are defined once, outside the loop, instead of being re-created
# on every iteration.

def count_words(text):
    """Count occurrences of words in ``text``; returns ``(unique_words, counts)`` arrays."""
    return np.unique(text, return_counts=True)


def sort_words(word_count):
    """Return indices ordering ``word_count`` from most to least frequent."""
    # Reversing the ascending argsort keeps the original tie ordering.
    return np.argsort(word_count[1])[::-1]


def zip_words(word_counts, sort_indices):
    """Pair each word with its number of occurrences, ordered by ``sort_indices``."""
    return list(zip(word_counts[0][sort_indices], word_counts[1][sort_indices]))


def find_most_common(text):
    """Find the most frequent words in a text together with their number of occurrences.

    Args:
        text: Sequence of words.

    Returns:
        list: ``(word, count)`` tuples sorted by descending count; empty when
        ``text`` is empty.
    """
    word_count = count_words(text)
    return zip_words(word_count, sort_words(word_count))


for language in languages.values():
    tokenized_questions = language.pipeline.train_data['tokenized_question']
    # An empty token list has no first/last word; skip it instead of raising IndexError.
    non_empty = [tokens for tokens in tokenized_questions if len(tokens) > 0]
    first = [tokens[0] for tokens in non_empty]
    last = [tokens[-1] for tokens in non_empty]

    print(f"""
    Language: {language.name}
    Most frequent first words:
    {find_most_common(first)[:5]}
    Most frequent last words:
    {find_most_common(last)[:5]}
    """)


    Language: english
    Most frequent first words:
    [('When', 2242), ('What', 2101), ('How', 1296), ('Who', 1058), ('Where', 486)]
    Most frequent last words:
    [('?', 7379), ('zombie', 2), ('metabolite', 2), ('\\', 2), ('BCE', 2)]
    

    Language: japanese
    Most frequent first words:
    [('日本', 392), ('『', 306), ('アメリカ', 106), ('世界', 94), ('第', 56)]
    Most frequent last words:
    [('？', 5920), ('いつ', 730), ('た', 608), ('どこ', 584), ('何', 448)]
    

    Language: finnish
    Most frequent first words:
    [('Milloin', 3519), ('Mikä', 2328), ('Missä', 1646), ('Kuka', 1619), ('Mitä', 1088)]
    Most frequent last words:
    [('?', 13689), ('tulitaistelussa', 2), ('tohtoriksi+', 2), ('syntynyt', 2), ('pinta-ala', 2)]
    


# 1.2

In [48]:
# Train one BinaryQuestionClassifier per language and evaluate it on that
# language's validation split.
for language in languages.values():
    print(f'\nLanguage: {language.name}')
    pipeline = language.pipeline
    model = BinaryQuestionClassifier()

    # Fit on the training split.
    X = model.extract_X(pipeline.train_data)
    y = pipeline.train_data['is_answerable']
    model.train(X, y)

    # Score on the validation split, using the same feature extraction.
    X_val = model.extract_X(pipeline.validation_data)
    y_val = pipeline.validation_data['is_answerable']
    model.evaluate(X_val, y_val)


Language: english


KeyboardInterrupt: 