In [93]:
from Pipeline import Pipeline
from Language import Language

In [94]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from datasets import load_dataset
from fugashi import Tagger
# import nagisa
import nltk
from nltk.stem.snowball import SnowballStemmer                           
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import libvoikko
import numpy as np


In [95]:
# Fetch the NLTK resources this notebook relies on: the stop-word lists read
# via stopwords.words(...) and the 'punkt' tokenizer data used by word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [96]:
# Load the 'copenlu/answerable_tydiqa' dataset with the Hugging Face `datasets`
# loader; the per-language subsets are filtered out of this object further down.
dataset = load_dataset('copenlu/answerable_tydiqa')

Using custom data configuration copenlu--nlp_course_tydiqa-cceecfb5416d988a
Reusing dataset parquet (C:\Users\chris\.cache\huggingface\datasets\copenlu___parquet\copenlu--nlp_course_tydiqa-cceecfb5416d988a\0.0.0\2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 2/2 [00:00<00:00,  7.06it/s]


In [97]:
def clean_english(text):
    """Lowercase tokens, drop English stop words and stem the remainder.

    Args:
        text: iterable of tokens; each element must provide ``.lower()``
            (presumably the token list produced by the language's tokenizer).

    Returns:
        list: the Snowball-stemmed, lowercased tokens that are not in NLTK's
        English stop-word list, in their original order.
    """
    lowered_tokens = [token.lower() for token in text]
    english_stop_words = set(stopwords.words('english'))
    english_stemmer = SnowballStemmer("english")

    kept = []
    for token in lowered_tokens:
        # Stop words are filtered on the lowercased form, before stemming.
        if token in english_stop_words:
            continue
        kept.append(english_stemmer.stem(token))
    return kept

In [98]:
def clean_finnish(text):
    """Lowercase tokens, drop Finnish stop words and stem the remainder.

    Args:
        text: iterable of tokens; each element must provide ``.lower()``
            (presumably the token list produced by the language's tokenizer).

    Returns:
        list: the Snowball-stemmed, lowercased tokens that are not in NLTK's
        Finnish stop-word list, in their original order.
    """
    lowered_tokens = [token.lower() for token in text]
    finnish_stop_words = set(stopwords.words('finnish'))
    finnish_stemmer = SnowballStemmer("finnish")

    kept = []
    for token in lowered_tokens:
        # Stop words are filtered on the lowercased form, before stemming.
        if token in finnish_stop_words:
            continue
        kept.append(finnish_stemmer.stem(token))
    return kept

In [99]:
def clean_japanese(text):
    """Return the tokens of ``text`` lowercased, in their original order.

    No stop-word removal or stemming is applied here; only ``.lower()`` is
    called on each element.

    Args:
        text: iterable of tokens; each element must provide ``.lower()``.

    Returns:
        list: one lowercased token per input element.
    """
    lowered_tokens = []
    for token in text:
        lowered_tokens.append(token.lower())
    return lowered_tokens

# Shared tagger created with the '-Owakati' option. Tagger has initial startup
# overhead, so it is built once here rather than inside the tokenizer function
# that calls it.
japanese_tagger = Tagger('-Owakati')

In [100]:
def get_data(language):
    """Return the rows of `dataset` whose 'language' field equals `language`.

    Args:
        language: value compared against each example's 'language' field.

    Returns:
        The result of ``dataset.filter(...)`` for that language.
    """
    return dataset.filter(lambda x: x['language'] == language)


def tokenize_japanese(text):
    """Tokenize Japanese text with the shared '-Owakati' tagger.

    The tagger output is split on single spaces, one list element per token.

    Args:
        text: the string to tokenize.

    Returns:
        list of str: the tokens in order.
    """
    return japanese_tagger.parse(text).split(" ")


# define languages: each entry bundles the tokenizer, the cleaner and a
# Pipeline built on that language's subset of the dataset.
languages = {
    'english': Language(
        name='english',
        tokenizer=word_tokenize,
        cleaner=clean_english,
        pipeline=Pipeline(get_data('english')),
    ),
    'japanese': Language(
        name='japanese',
        tokenizer=tokenize_japanese,
        cleaner=clean_japanese,
        pipeline=Pipeline(get_data('japanese')),
    ),
    'finnish': Language(
        name='finnish',
        tokenizer=word_tokenize,
        cleaner=clean_finnish,
        pipeline=Pipeline(get_data('finnish')),
    ),
}

Loading cached processed dataset at C:\Users\chris\.cache\huggingface\datasets\copenlu___parquet\copenlu--nlp_course_tydiqa-cceecfb5416d988a\0.0.0\2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec\cache-42d8b8b0bba1f895.arrow
Loading cached processed dataset at C:\Users\chris\.cache\huggingface\datasets\copenlu___parquet\copenlu--nlp_course_tydiqa-cceecfb5416d988a\0.0.0\2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec\cache-302ea1ccbac68b05.arrow
Loading cached processed dataset at C:\Users\chris\.cache\huggingface\datasets\copenlu___parquet\copenlu--nlp_course_tydiqa-cceecfb5416d988a\0.0.0\2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec\cache-f1d195ad391e1424.arrow
Loading cached processed dataset at C:\Users\chris\.cache\huggingface\datasets\copenlu___parquet\copenlu--nlp_course_tydiqa-cceecfb5416d988a\0.0.0\2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec\cache-69553e7e5be5ae7b.arrow
Loading cached processed dataset at 

In [101]:
# Run the preprocessing stages on every language's pipeline, in this order:
# tokenize -> clean -> label answerable -> extract features.
# NOTE(review): each stage presumably builds on the previous stage's result
# inside Pipeline — confirm before reordering or re-running a single stage.
for language in languages.values():
    language.pipeline.tokenize(language.tokenizer)  # uses the language-specific tokenizer
    language.pipeline.clean(language.cleaner)  # uses the language-specific cleaner
    language.pipeline.label_answerable()
    language.pipeline.extract_features()

Loading cached processed dataset at C:\Users\chris\.cache\huggingface\datasets\copenlu___parquet\copenlu--nlp_course_tydiqa-cceecfb5416d988a\0.0.0\2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec\cache-67bb50aaddc289fa.arrow
Loading cached processed dataset at C:\Users\chris\.cache\huggingface\datasets\copenlu___parquet\copenlu--nlp_course_tydiqa-cceecfb5416d988a\0.0.0\2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec\cache-b7b4da070683f0ab.arrow
Loading cached processed dataset at C:\Users\chris\.cache\huggingface\datasets\copenlu___parquet\copenlu--nlp_course_tydiqa-cceecfb5416d988a\0.0.0\2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec\cache-3457af876fbd303b.arrow
Loading cached processed dataset at C:\Users\chris\.cache\huggingface\datasets\copenlu___parquet\copenlu--nlp_course_tydiqa-cceecfb5416d988a\0.0.0\2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec\cache-c0bc4c45f0d3f1ea.arrow
Loading cached processed dataset at 

In [102]:
# Find the most common first and last words in each language

def count_words(text):
    """Count occurrences of words in text.

    Returns:
        tuple: ``(unique_words, counts)`` arrays as returned by
        ``np.unique(text, return_counts=True)``.
    """
    return np.unique(text, return_counts=True)


def sort_words(word_count):
    """Return indices ordering ``word_count[1]`` from most to least frequent."""
    return np.argsort(word_count[1])[::-1]


def zip_words(word_counts, sort_indices):
    """Pair each word with its number of occurrences, ordered by ``sort_indices``."""
    return list(zip(word_counts[0][sort_indices], word_counts[1][sort_indices]))


def find_most_common(text):
    """Find the most frequent words in a text together with their number of occurrences.

    Args:
        text: sequence of words.

    Returns:
        list: ``(word, count)`` pairs sorted by descending count; an empty
        list when ``text`` is empty.
    """
    word_count = count_words(text)
    return zip_words(word_count, sort_words(word_count))


# The helpers above are defined once, outside the loop, instead of being
# re-created for every language.
for language in languages.values():
    tokenized_questions = language.pipeline.train_data['tokenized_question']
    # Skip questions that tokenized to nothing: indexing [0] / [-1] on an
    # empty token list would raise IndexError.
    first = [tokens[0] for tokens in tokenized_questions if len(tokens) > 0]
    last = [tokens[-1] for tokens in tokenized_questions if len(tokens) > 0]

    print(f"""
    Language: {language.name}
    Most frequent first words:
    {find_most_common(first)[:5]}
    Most frequent last words:
    {find_most_common(last)[:5]}
    """)


    Language: english
    Most frequent first words:
    [('When', 2242), ('What', 2101), ('How', 1296), ('Who', 1058), ('Where', 486)]
    Most frequent last words:
    [('?', 7379), ('zombie', 2), ('metabolite', 2), ('\\', 2), ('BCE', 2)]
    

    Language: japanese
    Most frequent first words:
    [('日本', 392), ('『', 306), ('アメリカ', 106), ('世界', 94), ('第', 56)]
    Most frequent last words:
    [('？', 5920), ('いつ', 730), ('た', 608), ('どこ', 584), ('何', 448)]
    

    Language: finnish
    Most frequent first words:
    [('Milloin', 3519), ('Mikä', 2328), ('Missä', 1646), ('Kuka', 1619), ('Mitä', 1088)]
    Most frequent last words:
    [('?', 13689), ('tulitaistelussa', 2), ('tohtoriksi+', 2), ('syntynyt', 2), ('pinta-ala', 2)]
    


In [103]:
# Train and validate one logistic-regression model per language.
for language in languages.values():
    # Label the output first: the accuracy lines shown by this cell do not say
    # which language they belong to, which makes the results ambiguous.
    print(f"Language: {language.name}")
    # A fresh LogisticRegression instance is passed for every language.
    language.pipeline.train(LogisticRegression())
    language.pipeline.validate()

Training accuracy: 0.6571931249154148
Validation accuracy: 0.6373737373737374
Training accuracy: 0.658464342674869
Validation accuracy: 0.6486486486486487
Training accuracy: 0.7238887672432669
Validation accuracy: 0.727164887307236


In [104]:
# Sanity check of the wakati tokenization on a sample sentence.
# Reuse the tagger built earlier in the notebook instead of constructing a
# second Tagger('-Owakati') and paying its startup overhead again.
tagger = japanese_tagger
text = "麩菓子は、麩を主材料とした日本の菓子。"
# Last expression of the cell: the token list is shown as the cell output.
tagger.parse(text).split(" ")


['麩', '菓子', 'は', '、', '麩', 'を', '主材', '料', 'と', 'し', 'た', '日本', 'の', '菓子', '。']