# Zimp Word to Sentence Ratio Analysis
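How many sentences? How many words? For every dataset this notebook reports the average number of words per text, the average number of sentences per text, and the average number of words per sentence.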

In [1]:
import pandas as pd
import numpy as np
import os
import logging
import matplotlib.pyplot as plt
import time
from zimp.pos.wordcount_analyzer import WordCountAnalyzer
from zimp.pos.sentence_count_analyzer import SentenceCountAnalyzer
from glob import glob

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 dropped the old names, so use whichever spelling this install provides.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Martin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Martin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package perluniprops to
[nltk_data]     C:\Users\Martin\AppData\Roaming\nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!


In [2]:
# Folder that is scanned for dataset sub-directories (relative to this notebook).
source_dir = '../../zimp_orchestrator/orch/resources'

# Dataset name -> [train csv path, test csv path]; filled in by the next cell.
files = dict()

In [3]:
# Register every dataset directory below source_dir with its train/test CSVs.
# glob() returns entries in a filesystem-dependent order, so sort them to keep
# the column order of the result table (and of the cached CSV) reproducible.
for ds_path in sorted(glob(source_dir + '/*')):
    if not os.path.isdir(ds_path):
        # Only directories are datasets; skip plain files in the resource folder.
        continue
    ds_name = os.path.basename(ds_path)
    files[ds_name] = [os.path.join(ds_path, 'train.csv'), os.path.join(ds_path, 'test.csv')]

In [4]:
def get_ds_stats(paths, wca_builder, track_name):
    """Compute one metric per text over the train and test split of a dataset.

    Parameters
    ----------
    paths : sequence of str
        ``paths[0]`` and ``paths[1]`` are CSV files that each contain a
        ``text`` column (train and test split).
    wca_builder : callable
        Receives the combined ``text`` Series and returns an object exposing
        ``extract_batch_metrics()``.
    track_name : str
        Name given to the returned Series.

    Returns
    -------
    pandas.Series
        The values returned by ``extract_batch_metrics()``, named ``track_name``.
    """
    # Series.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent and, like append, keeps each split's original index labels.
    texts = pd.concat([pd.read_csv(paths[0]).text, pd.read_csv(paths[1]).text])
    return pd.Series(wca_builder(texts).extract_batch_metrics(), name=track_name)

def get_ds_language(track):
    """Return the language name to use for the dataset called ``track``.

    The three datasets listed below map to ``'german'``; any other value
    maps to ``'english'``.
    """
    german_tracks = ('10K-GNAD', 'GERMEVAL-2018', 'GERMEVAL-2020')
    return 'german' if track in german_tracks else 'english'

In [5]:
# Per-dataset averages are expensive to compute, so they are cached as CSV and
# only recomputed when the cache file is missing.
file_path = 'measures/words_per_sentence.csv'
if os.path.exists(file_path):
    # to_csv() stores the metric names as the first column; index_col=0 turns
    # it back into the index so a cached run yields the same frame as a fresh
    # one (without it the frame gains a spurious 'Unnamed: 0' column).
    df_wps = pd.read_csv(file_path, index_col=0)
else:
    dfs = []
    for dataset, paths in files.items():
        language = get_ds_language(dataset)
        s_words = get_ds_stats(paths, lambda texts: WordCountAnalyzer(texts, language=language), 'words')
        s_sentences = get_ds_stats(paths, lambda texts: SentenceCountAnalyzer(texts, language=language), 'sentences')
        # Element-wise ratio: words per sentence for every individual text.
        s_words_per_sentence = s_words / s_sentences
        df = pd.DataFrame(
            [s_words.mean(), s_sentences.mean(), s_words_per_sentence.mean()],
            index=['avg_words', 'avg_sentences', 'avg_words_per_sentence'],
            columns=[dataset],
        )
        dfs.append(df)
    # One column per dataset, one row per metric.
    df_wps = pd.concat(dfs, axis=1)
    # Create the cache folder on first use; to_csv does not create directories.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    df_wps.to_csv(file_path)

df_wps

Unnamed: 0,10K-GNAD,DBP-14,GERMEVAL-2018,GERMEVAL-2020,TREC-6,YELP-5
avg_words,414.018008,54.729171,24.241346,23.307666,9.991431,154.578214
avg_sentences,22.597781,2.397305,1.865112,1.506239,1.010585,8.500249
avg_words_per_sentence,18.701749,25.016594,14.904357,17.718422,9.915295,18.666857
