In [45]:
%matplotlib inline
import pandas as pd
import json

# Presumably the language codes covered by the datasets; NOTE(review): nothing
# visible in this cell reads this list (paths and filters hard-code 'de'/'en').
languages = ['de', 'en']


def separated_filter(row):
    """Tell whether a row's answers do not form one contiguous integer run.

    `row` only needs an `answers` attribute holding a sequence of integers.
    The answers are compared against the run `answers[0] .. answers[-1]`:
    if the number of answers differs from the length of that run, the
    answers are considered "separated". A row without answers is never
    separated.
    """
    answers = row.answers
    if len(answers) == 0:
        return False

    first, last = answers[0], answers[-1]
    contiguous_run = range(first, last + 1)
    return len(contiguous_run) != len(answers)


def _load_split(split_path):
    """Read a split JSON file into a DataFrame and add the derived columns.

    The records are expected to provide `answers`, `context` and `question`
    fields, since the derived columns are computed from them.
    """
    # Binary mode lets `json` detect the UTF-8/16/32 encoding itself instead
    # of relying on the platform's default text encoding, and the `with`
    # block guarantees the handle is closed.
    with open(split_path, 'rb') as split_file:
        split_df = pd.DataFrame(json.load(split_file))

    # Column-wise operations replace the former row-wise `apply` calls.
    # NOTE: an empty question now yields `inf` in `chars_per_question`
    # instead of aborting the whole analysis.
    split_df['answer_count'] = split_df.answers.str.len()
    # Sentences are counted as newline-separated lines of the context.
    split_df['sentences'] = split_df.context.str.count('\n') + 1
    split_df['chars'] = split_df.context.str.len()
    split_df['question_word'] = split_df.question.str.split(' ').str[0]
    split_df['question_chars'] = split_df.question.str.len()
    split_df['chars_per_question'] = split_df.chars / split_df.question_chars
    split_df['chars_per_sentence'] = split_df.chars / split_df.sentences
    split_df['sentences_per_context'] = split_df.sentences
    split_df['answers_per_sentence'] = split_df.answer_count / split_df.sentences
    return split_df


def _count_separated(df):
    """Count the rows of `df` that `separated_filter` flags."""
    return len(df[df.apply(separated_filter, axis=1)])


def _print_basic_counts(label, df):
    """Print the row count, the unanswered count and the separated count of `df`."""
    print(f'{label}: {len(df)}')
    print(f'No answer: {len(df[df.answer_count == 0])}')
    print(f'Sep. answer: {_count_separated(df)}')


def analyze_split(name: str) -> None:
    """Print summary statistics for one dataset split.

    Loads `../datasets/splits/de/{name}_de.json` and prints two reports.
    Rows with `language == 'de'` are divided by `sourceLanguage`: those
    whose source language is not `'en'` are reported as "German", those
    whose source language is `'en'` as "English".

    Args:
        name: Split name used to build the file path (e.g. `'dev'`).

    Raises:
        OSError: If the split file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    split_df = _load_split(f'../datasets/splits/de/{name}_de.json')

    is_german = split_df.language == 'de'
    de_split_df = split_df[is_german & (split_df.sourceLanguage != 'en')]
    en_split_df = split_df[is_german & (split_df.sourceLanguage == 'en')]

    # One row per page: context statistics must not count a context once
    # for every question asked about it.
    de_contexts_df = de_split_df.drop_duplicates(subset=['pageId'])
    de_answered_df = de_split_df[de_split_df.answer_count > 0]
    separator = '----------------------------------------------'

    print(f'### {name} pt. 1 ###')
    print(f'Total: {len(split_df)}')
    print(separator)
    _print_basic_counts('German', de_split_df)
    print(separator)
    _print_basic_counts('English', en_split_df)

    print(f'### {name} pt. 2 ###')
    print(f'Total: {len(split_df)}')
    print(separator)
    print(f'German: {len(de_split_df)}')

    # Guard the only plain-int division: without German rows it would raise
    # ZeroDivisionError, whereas the pandas ratios below degrade to nan.
    context_count = len(de_contexts_df)
    questions_per_context = len(de_split_df) / context_count if context_count else float('nan')
    print(f'Questions/Context: {questions_per_context}')

    # Each "a | b" pair shows the ratio of the totals and the mean of the
    # per-row ratios.
    chars_per_sentence = de_contexts_df.chars.sum() / de_contexts_df.sentences.sum()
    print(f'Chars/Sentence: {chars_per_sentence} | {de_contexts_df.chars_per_sentence.mean()}')
    sentences_per_context = de_contexts_df.sentences.sum() / len(de_contexts_df)
    print(f'Sentences/Context: {sentences_per_context} | {de_contexts_df.sentences_per_context.mean()}')

    print(f'Answers/Question (w no answers): {de_split_df.answer_count.mean()}')
    print(f'Answers/Question (w/o no answers): {de_answered_df.answer_count.mean()}')
    answers_per_sentence = de_split_df.answer_count.sum() / de_split_df.sentences.sum()
    print(f'Answers/Sentence % (w no answers): {answers_per_sentence} | {de_split_df.answers_per_sentence.mean()}')
    print(f'Answers/Sentence % (w/o no answers): {de_answered_df.answer_count.sum() / de_answered_df.sentences.sum()}')
    print(separator)

    # NOTE(review): the first figure divides question characters by the
    # sentence count, while the second is the mean of context chars divided
    # by question chars; the two do not measure the same quantity and
    # neither clearly matches the label. Output kept unchanged - confirm the
    # intended metric before relying on this line.
    question_chars_ratio = de_split_df.question_chars.sum() / de_split_df.sentences.sum()
    print(f'Chars/Question: {question_chars_ratio} | {de_split_df.chars_per_question.mean()}')

    # First words used by more than five questions are listed individually;
    # the rest are summed up as "Other".
    word_counts = de_split_df.question_word.value_counts()
    print(f'Question words: {word_counts[word_counts > 5]}')
    print(f'Other {word_counts[word_counts <= 5].sum()}')
    print(separator)
    _print_basic_counts('English', en_split_df)

# Report the statistics of the 'dev' split (reads ../datasets/splits/de/dev_de.json).
analyze_split('dev')


### dev pt. 1 ###
Total: 300
----------------------------------------------
German: 222
No answer: 46
Sep. answer: 43
----------------------------------------------
English: 78
No answer: 12
Sep. answer: 6
### dev pt. 2 ###
Total: 300
----------------------------------------------
German: 222
Questions/Context: 1.1871657754010696
Chars/Sentence: 57.06508515815085 | 60.537109507834046
Sentences/Context: 26.37433155080214 | 26.37433155080214
Answers/Question (w no answers): 5.671171171171171
Answers/Question (w/o no answers): 7.153409090909091
Answers/Sentence % (w no answers): 0.21804641496363006 | 0.26937889246546176
Answers/Sentence % (w/o no answers): 0.28241363840287126
----------------------------------------------
Chars/Question: 2.2743332178732247 | 27.43975913372882
Question words: question_word
Welche    64
Was       54
Wie       40
Wo        30
Wer       10
Name: count, dtype: int64
Other 24
----------------------------------------------
English: 78
No answer: 12
Sep. answer: 