# Output Dataset Analysis

This notebook analyzes the final GoodWiki dataset to collect statistics.

In [14]:
import sys
sys.path.append("..")

from collections import Counter
import math
import unicodedata

from datasets import load_dataset
from transformers import AutoTokenizer
from tqdm.notebook import tqdm

from src.utils import count_rows, get_batch

In [2]:
# Load the pretrained tokenizer for facebook/opt-350m; it is used below to
# count tokens in both GoodWiki and WikiText-103.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

In [3]:
# Sanity check: encode a short sentence and inspect the resulting token ids.
# NOTE(review): the output starts with id 2, which looks like a special token
# prepended by encode(); if so, every encode() call below adds one extra
# token to the counts -- confirm.
tokenizer.encode("Hello I like apple pie.")

[2, 31414, 38, 101, 15162, 11637, 4]

In [4]:
# Relative path to the parquet file that every analysis cell below reads.
DATA_FILENAME = "../out/09_04_2023.parquet"

In [5]:
# Number of rows in the dataset; used as the progress-bar total below.
# NOTE(review): the second argument (256) is presumably the batch size used
# while scanning the file -- confirm against src.utils.count_rows.
row_count = count_rows(DATA_FILENAME, 256)
row_count

44754

In [6]:
# Count the tokens in every article's markdown with the tokenizer loaded above.
# Rows are read in batches of `batch_size`, like the character/word counting
# cells, instead of one row per batch. Each article is still tokenized on its
# own, so the total is the same as with single-row batches.
token_count = 0
batch_size = 128
with tqdm(total=row_count) as pbar:
    for batch in get_batch(DATA_FILENAME, batch_size):
        for ex in batch:
            token_count += len(tokenizer.encode(ex["markdown"]))
            # Advance the bar once per article so it matches total=row_count.
            pbar.update()

token_count

  0%|          | 0/44754 [00:00<?, ?it/s]

179198101

In [7]:
# Load the "wikitext-103-raw-v1" configuration of the "wikitext" dataset;
# its token count is compared with GoodWiki's below.
dataset = load_dataset("wikitext", "wikitext-103-raw-v1")

In [8]:
# Count tokens over all WikiText-103 splits. Each slice of `batch_size` rows
# is concatenated into one string, NFKC-normalized, and tokenized in one call.
wikitext_token_count = 0
batch_size = 128
splits = ['test', 'train', 'validation']
for split in splits:
    split_rows = dataset[split]
    n_rows = len(split_rows)
    for start in tqdm(range(0, n_rows, batch_size)):
        lines = split_rows[start:start + batch_size]["text"]
        text = unicodedata.normalize("NFKC", "".join(lines))
        wikitext_token_count += len(tokenizer.encode(text))
wikitext_token_count

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/14074 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

118465269

In [9]:
# Size of GoodWiki relative to WikiText-103, measured in tokens.
token_count / wikitext_token_count

1.512663606073439

## Counting unique characters

In [15]:
# Tally how often each character occurs across the markdown of all articles,
# then show the 100 most frequent characters.
unique_chars = Counter()
batch_size = 128
with tqdm(total=row_count) as pbar:
    for batch in get_batch(DATA_FILENAME, batch_size):
        for ex in batch:
            # Counter.update counts the elements of any iterable, so the string
            # is passed directly; no temporary Counter per article is needed.
            unique_chars.update(ex["markdown"])
            pbar.update()
unique_chars.most_common(100)

  0%|          | 0/350 [00:00<?, ?it/s]

[(' ', 130930982),
 ('e', 76726574),
 ('t', 53271005),
 ('a', 52893993),
 ('n', 45905092),
 ('i', 45466764),
 ('o', 45021761),
 ('r', 41220374),
 ('s', 39406700),
 ('h', 29588555),
 ('l', 25389701),
 ('d', 25181335),
 ('c', 18969708),
 ('u', 16133909),
 ('m', 14765055),
 ('f', 13146138),
 ('g', 12151919),
 ('p', 11823854),
 ('w', 10186376),
 ('y', 9344971),
 ('b', 8534511),
 (',', 7998723),
 ('.', 6481598),
 ('v', 6240639),
 ('k', 3962288),
 ('\n', 3551271),
 ('1', 2986317),
 ('T', 2711102),
 ('0', 2605446),
 ('S', 2597403),
 ('A', 2459985),
 ('"', 2129441),
 ('C', 2105794),
 ('2', 1925981),
 ('M', 1790186),
 ('-', 1716358),
 ('B', 1672243),
 ('I', 1585651),
 ("'", 1534544),
 ('9', 1522668),
 ('H', 1327603),
 ('P', 1258629),
 ('#', 1204393),
 ('x', 1193709),
 ('R', 1165975),
 ('D', 1163424),
 ('F', 1045554),
 ('L', 1029776),
 ('W', 1015142),
 ('N', 964828),
 ('G', 963445),
 (')', 924942),
 ('(', 924620),
 ('3', 886482),
 ('8', 860826),
 ('5', 857635),
 ('E', 843775),
 ('J', 816037),
 (

In [16]:
# Total number of characters across all articles (the sum of all counts).
# Counter.total() requires Python 3.10 or newer.
unique_chars.total()

811791686

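## Counting unique words
Words are obtained by splitting each article on whitespace (`str.split()`), so case is preserved and punctuation or markdown markers such as `##` are counted as written.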

In [19]:
# Tally how often each whitespace-separated word occurs across all articles.
unique_words = Counter()
batch_size = 128
with tqdm(total=row_count) as pbar:
    for batch in get_batch(DATA_FILENAME, batch_size):
        for ex in batch:
            # Counter.update counts the elements of any iterable, so the list of
            # words is passed directly; no temporary Counter per article is needed.
            unique_words.update(ex["markdown"].split())
            pbar.update()

  0%|          | 0/44754 [00:00<?, ?it/s]

[('the', 8623358),
 ('of', 4189651),
 ('and', 3837891),
 ('to', 3067426),
 ('in', 2901869),
 ('a', 2560973),
 ('was', 1704132),
 ('The', 1275717),
 ('on', 1090964),
 ('as', 1065180),
 ('for', 1062360),
 ('that', 1017750),
 ('with', 997642),
 ('by', 931845),
 ('is', 710845),
 ('from', 674132),
 ('his', 667396),
 ('at', 659704),
 ('were', 558115),
 ('he', 473342),
 ('an', 465424),
 ('had', 463769),
 ('it', 445292),
 ('In', 443464),
 ('which', 415677),
 ('be', 350507),
 ('but', 309290),
 ('also', 295780),
 ('their', 294879),
 ('not', 285486),
 ('first', 281285),
 ('are', 280789),
 ('her', 276260),
 ('its', 275939),
 ('-', 272407),
 ('have', 240534),
 ('##', 238670),
 ('who', 232713),
 ('or', 231706),
 ('been', 228855),
 ('two', 222025),
 ('He', 221012),
 ('one', 214557),
 ('has', 207167),
 ('after', 203672),
 ('they', 203204),
 ('would', 201783),
 ('this', 198671),
 ('###', 197585),
 ('into', 189155),
 ('It', 166201),
 ('she', 165573),
 ('other', 165083),
 ('more', 158719),
 ('during', 15

In [20]:
# Show the 500 most frequent whitespace-separated words with their counts.
unique_words.most_common(500)

[('the', 8623358),
 ('of', 4189651),
 ('and', 3837891),
 ('to', 3067426),
 ('in', 2901869),
 ('a', 2560973),
 ('was', 1704132),
 ('The', 1275717),
 ('on', 1090964),
 ('as', 1065180),
 ('for', 1062360),
 ('that', 1017750),
 ('with', 997642),
 ('by', 931845),
 ('is', 710845),
 ('from', 674132),
 ('his', 667396),
 ('at', 659704),
 ('were', 558115),
 ('he', 473342),
 ('an', 465424),
 ('had', 463769),
 ('it', 445292),
 ('In', 443464),
 ('which', 415677),
 ('be', 350507),
 ('but', 309290),
 ('also', 295780),
 ('their', 294879),
 ('not', 285486),
 ('first', 281285),
 ('are', 280789),
 ('her', 276260),
 ('its', 275939),
 ('-', 272407),
 ('have', 240534),
 ('##', 238670),
 ('who', 232713),
 ('or', 231706),
 ('been', 228855),
 ('two', 222025),
 ('He', 221012),
 ('one', 214557),
 ('has', 207167),
 ('after', 203672),
 ('they', 203204),
 ('would', 201783),
 ('this', 198671),
 ('###', 197585),
 ('into', 189155),
 ('It', 166201),
 ('she', 165573),
 ('other', 165083),
 ('more', 158719),
 ('during', 15

In [21]:
# Total number of whitespace-separated words across all articles.
# Counter.total() requires Python 3.10 or newer.
unique_words.total()

132691055

## Examples

In [12]:
# Fetch the first batch of 100 rows and display one article from it.
# The built-in next() advances the iterator; calling __next__ directly is
# equivalent but unidiomatic.
# Note: `ex` holds the whole batch here, not a single example.
ex = next(get_batch(DATA_FILENAME, 100))
ex[-37]

{'pageid': 40961074,
 'title': 'Attarsiya',
 'revid': 1164804042,
 'description': 'Military leader of Ahhiya',
 'categories': ['Ancient Anatolia',
  'Greek military leaders',
  'Mycenaean Greeks'],
 'markdown': 'Attarsiya was a 15th–14th century BCE military leader of Ahhiya. In the Hittite archives of circa 1400 BCE, he is described as a "man of Ahhiya", a country identified with the Achaeans and Mycenaean Greece. The campaigns of Attarsiya, as well as his conflict with the Hittite vassal, Madduwatta, represent the first recorded Mycenaean Greek military activity on the Anatolian mainland, as well as the first conflict between Achaeans and Hittites. He finally withdrew from Anatolia after Hittite intervention, but later launched a campaign against Alashiya (Cyprus).\n\nContemporary Hittite accounts about the campaigns of Attarsiya and the Ahhiya in general may indicate that there was a possible Mycenaean empire centered on late Bronze Age Greece. Moreover, Attarsiya might be a possibl

In [13]:
# print() renders the newlines in the markdown, unlike the escaped repr
# shown in the previous cell.
print(ex[-37]["markdown"])

Attarsiya was a 15th–14th century BCE military leader of Ahhiya. In the Hittite archives of circa 1400 BCE, he is described as a "man of Ahhiya", a country identified with the Achaeans and Mycenaean Greece. The campaigns of Attarsiya, as well as his conflict with the Hittite vassal, Madduwatta, represent the first recorded Mycenaean Greek military activity on the Anatolian mainland, as well as the first conflict between Achaeans and Hittites. He finally withdrew from Anatolia after Hittite intervention, but later launched a campaign against Alashiya (Cyprus).

Contemporary Hittite accounts about the campaigns of Attarsiya and the Ahhiya in general may indicate that there was a possible Mycenaean empire centered on late Bronze Age Greece. Moreover, Attarsiya might be a possible Hittite reconstruction of the Greek name Atreus, a king of Mycenae according to Greek mythology.

## Background

The activities of Attarsiya are recorded in the Hittite archives, in particular in the Indictment o

## Huggingface Test

In [22]:
from datasets import load_dataset

In [23]:
# Load the dataset by its "euirim/goodwiki" identifier to check that it
# downloads and loads (see the download progress in the output).
ds = load_dataset("euirim/goodwiki")

Downloading readme:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/483M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [25]:
# Preview the first two rows of the train split; as the output shows, a
# slice comes back as a dict mapping each column name to a list of values.
ds["train"][0:2]

{'pageid': [57185536, 15394015],
 'title': ['Georgia Hopley', 'Willis Ward'],
 'revid': [1163683705, 1170257280],
 'description': ['American journalist and temperance advocate',
  'Track and field athlete and American football player'],
 'categories': [['1858 births',
   '1944 deaths',
   '19th-century American journalists',
   '19th-century American women journalists',
   '19th-century American women writers',
   '20th-century American journalists',
   '20th-century American women journalists',
   'American temperance activists',
   'Hopley family',
   'Journalists from Ohio',
   'Ohio Republicans',
   'People from Bucyrus, Ohio'],
  ['1912 births',
   '1983 deaths',
   '20th-century African-American lawyers',
   '20th-century African-American sportspeople',
   '20th-century American lawyers',
   'African-American players of American football',
   'American football ends',
   'Detroit College of Law alumni',
   'Michigan Wolverines football players',
   "Michigan Wolverines men's trac