In [13]:
import pickle
from pathlib import Path

import humanize
import numpy as np

import git

In [11]:
# Resolve the enclosing git repository (searching parent directories) so the
# data path does not depend on the directory the notebook is launched from.
repo = git.Repo(Path(".").absolute(), search_parent_directories=True)
ROOT = Path(repo.working_tree_dir)
default_outdir = ROOT / "data" / "processed"

# Number of sentences in the corpus; it is embedded in the file name in
# humanized form with whitespace replaced by underscores
# (expected to render as "1 million" here -- confirm against the humanize docs).
n_sents = 1000000
n_sents_words = humanize.intword(n_sents, format="%.0f").split()
out_file_name = "wiki_{}.pkl".format("_".join(n_sents_words))

# Load the tokenized Wikipedia corpus

In [14]:
# NOTE: unpickling executes arbitrary code embedded in the file, so only load
# corpora produced by this project's own preprocessing step.
corpus_path = default_outdir / out_file_name
with corpus_path.open("rb") as corpus_file:
    wiki_corpus = pickle.load(corpus_file)

Print an example sentence

In [37]:
# Show the first entry of the corpus as a quick sanity check of the loaded data.
example_sentence = wiki_corpus[0]
print(example_sentence)

['Kolkata', 'Suburban', 'Railway', 'The', 'Kolkata', 'Suburban', 'Railway', 'is', 'a', 'suburban', 'rail', 'system', 'serving', 'the', 'suburbs', 'surrounding', 'the', 'city', 'of', 'Kolkata', '.']


# Sentence Length Statistics

Roughly 59% of the sentences consist of 3 to 25 tokens, 29% consist of 26 to 40 tokens, and 10% consist of 41 to 70 tokens; the remaining ~1% are shorter than 3 or longer than 70 tokens.

In [45]:
# Number of tokens in every sentence of the corpus.
wiki_sent_lengths = np.array([len(sent) for sent in wiki_corpus])

# Inclusive (min_tokens, max_tokens) length buckets to report on.
length_buckets = [(3, 25), (26, 40), (41, 70)]

# Print, one line per bucket and in the order listed, the fraction of
# sentences whose length falls inside the bucket (both bounds inclusive).
for min_tokens, max_tokens in length_buckets:
    in_bucket = (wiki_sent_lengths >= min_tokens) & (wiki_sent_lengths <= max_tokens)
    print(in_bucket.sum() / len(wiki_sent_lengths))

0.590842
0.294003
0.101577


The median and the mean of the sentence lengths are about 23 and 25 tokens, respectively.

In [33]:
# Report the central tendency of the sentence lengths: median first, then mean.
for summarize in (np.median, np.mean):
    print(summarize(wiki_sent_lengths))

23.0
25.211804
