In [12]:
%matplotlib inline
import pandas as pd
import json

# Length, in characters, of the '#' and '-' rules printed between sections of the console table.
ROW_WIDTH = 108

# LaTeX table lines for the statistics requested with digits == 0 (plus rule lines between
# language sections); written to ./resources/corpus.txt by analyze_split_corpus.
corpus_statistics = []
# LaTeX table lines for every other statistic (section headers, rule lines, mean/std rows);
# written to ./resources/corpus_answers.txt by analyze_split_corpus.
corpus_statistics_answers = []

def separated_filter(row):
    """Return True when the row's answers do not form one unbroken integer run.

    The number of answers is compared with the size of the integer range that
    spans from the first answer to the last one (inclusive). A row without
    answers is never reported as separated.

    Assumes ``row.answers`` is a sequence of integers in ascending order
    (presumably sentence positions -- confirm against the dataset).
    """
    answers = row.answers
    if not len(answers) > 0:
        return False
    spanned = range(answers[0], answers[-1] + 1)
    return len(answers) != len(spanned)


def print_row(values, width=15):
    """Print one line of the console table.

    The first entry of ``values`` is the row label, centred in a fixed
    35-character column; every remaining entry is centred in a column of
    ``width`` characters. Columns are separated by ' | '.
    """
    label_cell = f'{values[0]:^35} |'
    centred = [format(cell, f'^{width}') for cell in values[1:]]
    data_cells = ' | '.join(centred)
    print(label_cell, data_cells)
    

def add_latex_table_row(values, table):
    """Append one LaTeX table row built from ``values`` to ``table``.

    Cells are joined with ' & ' and the row is terminated with the LaTeX
    line break (a double backslash). Rows whose first cell is one of the
    section labels ('all', 'German', 'English') start at the margin; every
    other row is indented by a one-em hspace so it reads as a sub-entry.

    Args:
        values: Cells of the row; the first one is the row label.
        table: List of LaTeX lines that is extended in place.
    """
    section_labels = ('all', 'German', 'English')
    # Raw string: '\h' is not a valid escape sequence in a normal literal.
    indentation = '' if values[0] in section_labels else r'\hspace{1em}'
    cells = ' & '.join(f'{value}' for value in values)
    table.append(f'{indentation}{cells} \\\\')


def print_split_stat(splits, get_value, name, digits=2):
    """Print one scalar statistic per split and record it as a LaTeX row.

    Args:
        splits: Sequence of splits; each one is passed to ``get_value``.
        get_value: Callable returning the statistic for a single split.
        name: Row label.
        digits: Number of decimal places used when ``digits > 0``. Otherwise
            the raw values are shown unformatted.

    The LaTeX row goes to ``corpus_statistics`` when ``digits == 0`` and to
    ``corpus_statistics_answers`` in every other case.
    """
    values = [get_value(split) for split in splits]
    if digits > 0:
        # Honour the requested precision instead of a fixed two decimals.
        cells = [f'{value:.{digits}f}' for value in values]
    else:
        cells = values
    row = [name] + cells
    print_row(row)
    target_table = corpus_statistics if digits == 0 else corpus_statistics_answers
    add_latex_table_row(row, table=target_table)


def print_mean_std_stat(splits, get_column, name):
    """Print mean and standard deviation per split and record a LaTeX row.

    Args:
        splits: Sequence of splits; each one is passed to ``get_column``.
        get_column: Callable returning, for a single split, an object that
            offers ``mean()`` and ``std()`` (the callers pass DataFrame columns).
        name: Row label.

    The console shows 'mean (std)'; the LaTeX row, appended to
    ``corpus_statistics_answers``, renders the deviation as a gray,
    script-size plus/minus annotation. Both use two decimals.
    """
    stats = []
    for split in splits:
        # Select the column once and reuse it for both aggregates.
        column = get_column(split)
        stats.append((column.mean(), column.std()))

    console_cells = [f'{mean:.2f} ({std:.2f})' for mean, std in stats]
    # Backslashes are doubled throughout: '\s' and '\p' are invalid escapes.
    latex_cells = [
        f'{mean:.2f} \\textcolor{{gray}}{{\\scriptsize $\\pm$ {std:.2f}}}'
        for mean, std in stats
    ]
    print_row([name] + console_cells)
    add_latex_table_row([name] + latex_cells, table=corpus_statistics_answers)
    

def load_split(name, language):
    """Load one dataset split from disk into a DataFrame.

    The file is read from
    ``../datasets/splits/{language}/{name}_{language}.json`` relative to the
    current working directory. ``language`` is interpolated as-is, so passing
    ``None`` yields the literal text 'None' in both path components.

    Args:
        name: Split name used in the file name (the callers pass 'train',
            'dev' and 'test').
        language: Language tag used for the directory and the file suffix.

    Returns:
        A ``pandas.DataFrame`` built from the parsed JSON content.
    """
    split_path = f'../datasets/splits/{language}/{name}_{language}.json'
    # Context manager closes the handle; the encoding is fixed so non-ASCII
    # text does not depend on the platform default.
    with open(split_path, 'r', encoding='utf-8') as split_file:
        records = json.load(split_file)
    return pd.DataFrame(records)


def prepare_split(name, language):
    """Load a split, add derived columns and select the rows for ``language``.

    Derived columns (one value per row):
        answer_count: number of entries in ``answers``.
        sentences: newline count of ``context`` plus one.
        chars: length of ``context``.
        question_word: text of ``question`` before its first space.
        question_chars: length of ``question``.
        chars_per_sentence: ``chars / sentences``.
        sentences_per_context: same values as ``sentences``.
        answers_per_sentence: ``answer_count / sentences``.

    Row selection:
        ``language is None``: all rows.
        ``language == 'de'``: rows with ``language == 'de'`` whose
            ``sourceLanguage`` is not 'en'.
        anything else: rows with ``language == 'en'`` whose
            ``sourceLanguage`` is not 'de'.

    Args:
        name: Split name forwarded to ``load_split``.
        language: Language tag forwarded to ``load_split`` and used for the
            row selection described above.

    Returns:
        A two-element list ``[rows, contexts]``: the selected rows, and the
        same rows reduced to the first occurrence of each ``pageId``.
    """
    split_df = load_split(name, language)

    # Column-wise computations instead of a Python-level pass over every row.
    split_df['answer_count'] = split_df['answers'].map(len)
    split_df['sentences'] = split_df['context'].map(lambda text: text.count('\n') + 1)
    split_df['chars'] = split_df['context'].map(len)
    split_df['question_word'] = split_df['question'].map(lambda text: text.split(' ')[0])
    split_df['question_chars'] = split_df['question'].map(len)
    split_df['chars_per_sentence'] = split_df['chars'] / split_df['sentences']
    split_df['sentences_per_context'] = split_df['sentences']
    split_df['answers_per_sentence'] = split_df['answer_count'] / split_df['sentences']

    if language is None:
        selected_df = split_df
    else:
        # Any tag other than 'de' is handled as English.
        wanted, excluded_source = ('de', 'en') if language == 'de' else ('en', 'de')
        keep = (split_df['language'] == wanted) & (split_df['sourceLanguage'] != excluded_source)
        selected_df = split_df[keep]

    # One row per page: drop every repeated pageId after its first occurrence.
    contexts_df = selected_df[~selected_df.duplicated(subset=['pageId'])]
    return [selected_df, contexts_df]


def analyze_split_language_corpus(language):
    """Print and record all corpus statistics for one language selection.

    Loads the train, dev and test splits via ``prepare_split``, adds a
    'total' entry that concatenates the three, and then emits one table row
    per statistic (columns: train, dev, test, total). Rows are printed to the
    console and appended to ``corpus_statistics`` /
    ``corpus_statistics_answers`` as LaTeX.

    Args:
        language: ``None`` for the whole corpus (section label 'all'), 'de'
            for the 'German' section; any other value is labelled 'English'.
    """
    train = prepare_split('train', language)
    dev = prepare_split('dev', language)
    test = prepare_split('test', language)
    # Each split is [rows, contexts]; concatenate position by position.
    total = [pd.concat([train[i], dev[i], test[i]]) for i in range(2)]
    splits = [train, dev, test, total]

    if language is None:
        section_label = 'all'
    elif language == 'de':
        section_label = 'German'
    else:
        section_label = 'English'

    # Formatting
    if language is not None:
        # Raw strings: '\m' is not a valid escape sequence in a normal literal.
        corpus_statistics.append(r'\midrule')
        corpus_statistics_answers.append(r'\midrule')
    add_latex_table_row([section_label, '', '', '', ''], table=corpus_statistics_answers)

    # Overall Stats
    print_split_stat(splits, lambda dfs: len(dfs[0]), section_label, 0)
    print_split_stat(splits, lambda dfs: len(dfs[0][dfs[0].answers.str.len() == 0]), 'No Answer', 0)
    print_split_stat(splits, lambda dfs: len(dfs[0][dfs[0].apply(separated_filter, axis=1)]), 'Divided Answer', 0)

    # Corpus Stats
    print('-' * ROW_WIDTH)
    print_split_stat(splits, lambda dfs: len(dfs[0]) / len(dfs[1]), 'Questions/Document')
    # TODO sum vs single?
    # print_split_stat(splits, lambda dfs: dfs[1].chars.sum() / dfs[1].sentences.sum(), 'Chars/Sentence (sum)')
    print_mean_std_stat(splits, lambda dfs: dfs[1].chars_per_sentence, 'Chars/Sentence')
    print_mean_std_stat(splits, lambda dfs: dfs[1].sentences, 'Sentences/Document')

    # Answer Stats
    print('-' * ROW_WIDTH)
    corpus_statistics_answers.append(r'\midrule')
    print_mean_std_stat(splits, lambda dfs: dfs[0].jaccard, 'Agreement')
    # TODO w. vs w/o. answers?
    print_mean_std_stat(splits, lambda dfs: dfs[0].answer_count, 'Answers/Question')
    # print_mean_std_stat(splits, lambda dfs: dfs[0][dfs[0].answer_count > 0].answer_count, 'Answers/Question (w/o no answers)')
    # TODO sum vs single?
    # print_split_stat(splits, lambda dfs: dfs[0].answer_count.sum() / dfs[0].sentences.sum(), 'Answers/Sentence % (w no answers) (sum)')
    # TODO w. vs w/o. answers?
    print_mean_std_stat(splits, lambda dfs: dfs[0].answers_per_sentence, 'Answers/Sentence')
    # print_mean_std_stat(splits, lambda dfs: dfs[0][dfs[0].answer_count > 0].answers_per_sentence, 'Answers/Sentence (w/o no answers)')

    # Question Stats
    print('-' * ROW_WIDTH)
    corpus_statistics_answers.append(r'\midrule')
    print_mean_std_stat(splits, lambda dfs: dfs[0].question_chars, 'Chars/Question')


def analyze_split_corpus():
    """Run the complete corpus analysis and write the LaTeX tables.

    Steps:
        1. Print the table header and the statistics for the whole corpus,
           German and English (in that order).
        2. Print how often each first question word occurs across all
           splits; words seen five times or fewer are summed up as 'Other'.
        3. Write the collected LaTeX rows to ``./resources/corpus.txt`` and
           ``./resources/corpus_answers.txt``.
    """
    print_row(['', 'train', 'dev', 'test', 'total'])
    for language in [None, 'de', 'en']:
        print('#' * ROW_WIDTH)
        # TODO english vs german w. english source
        analyze_split_language_corpus(language)

    train = prepare_split('train', None)
    dev = prepare_split('dev', None)
    test = prepare_split('test', None)
    total = [pd.concat([train[i], dev[i], test[i]]) for i in range(2)]
    print()
    print('#' * ROW_WIDTH)
    print('Question words')
    # Count once and reuse the result for both the frequent and the rare words.
    word_counts = total[0].question_word.value_counts()
    frequent = word_counts[word_counts > 5]
    print('\n'.join([f'{key}: {value}' for key, value in frequent.items()]))
    print(f'Other {word_counts[word_counts <= 5].sum()}')

    # Context managers guarantee the rows are flushed and the handles closed.
    with open('./resources/corpus.txt', 'w', encoding='utf-8') as table_file:
        table_file.write('\n'.join(corpus_statistics))

    with open('./resources/corpus_answers.txt', 'w', encoding='utf-8') as table_file:
        table_file.write('\n'.join(corpus_statistics_answers))


# Cell entry point: prints the statistics tables and writes the LaTeX rows to ./resources/.
analyze_split_corpus()

                                    |      train      |       dev       |      test       |      total     
############################################################################################################
                all                 |       461       |       193       |       252       |       906      
             No Answer              |       81        |       38        |       55        |       174      
          Divided Answer            |       76        |       31        |       44        |       151      
------------------------------------------------------------------------------------------------------------
        Questions/Document          |      1.50       |      1.45       |      1.43       |      1.47      
          Chars/Sentence            |  61.96 (17.51)  |  63.10 (15.63)  |  62.52 (16.79)  |  62.36 (16.90) 
        Sentences/Document          |  25.94 (18.18)  |  27.19 (16.11)  |  26.43 (16.56)  |  26.35 (17.28) 
--------------------------