In [None]:
# Select TensorFlow 2.x. The magic below is only relevant on Colab, so it is left commented out here.
# %tensorflow_version 2.x

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import numpy as np
import random

# Record the library versions this notebook was executed with, so that the
# outputs below can be matched to a concrete TensorFlow/Keras release.
print(f'Tensorflow version: {tf.__version__}')
print(f'Keras version: {tf.keras.__version__}')

Tensorflow version: 2.4.1
Keras version: 2.4.0


In [None]:
# Load the English Wikipedia dataset (config '20190301.en') through TFDS.
# The first run downloads and prepares the data under ./tmp; later runs reuse it.
DATASET_NAME = 'wikipedia/20190301.en'

# with_info=True makes tfds.load return the dataset together with its
# DatasetInfo metadata object.
dataset, dataset_info = tfds.load(
    DATASET_NAME,
    split='train',
    with_info=True,
    data_dir='tmp',
)

[1mDownloading and preparing dataset wikipedia/20190301.en/1.0.0 (download: 15.72 GiB, generated: Unknown size, total: 15.72 GiB) to tmp/wikipedia/20190301.en/1.0.0...[0m


local data directory. If you'd instead prefer to read directly from our public
GCS bucket (recommended if you're running on GCP), you can instead pass
`try_gcs=True` to `tfds.load` or set `data_dir=gs://tfds-data/datasets`.



HBox(children=(FloatProgress(value=0.0, description='Dl Completed...', max=258.0, style=ProgressStyle(descript…



[1mDataset wikipedia downloaded and prepared to tmp/wikipedia/20190301.en/1.0.0. Subsequent calls will reuse this data.[0m


In [None]:
# Show the metadata object that tfds.load returned alongside the dataset
# (features, split sizes, citation, ...).
print(dataset_info)

tfds.core.DatasetInfo(
    name='wikipedia',
    version=1.0.0,
    description='Wikipedia dataset containing cleaned articles of all languages.
The datasets are built from the Wikipedia dump
(https://dumps.wikimedia.org/) with one split per language. Each example
contains the content of one full Wikipedia article with cleaning to strip
markdown and unwanted sections (references, etc.).',
    homepage='https://dumps.wikimedia.org',
    features=FeaturesDict({
        'text': Text(shape=(), dtype=tf.string),
        'title': Text(shape=(), dtype=tf.string),
    }),
    total_num_examples=5824596,
    splits={
        'train': 5824596,
    },
    supervised_keys=None,
    citation="""@ONLINE {wikidump,
        author = "Wikimedia Foundation",
        title  = "Wikimedia Downloads",
        url    = "https://dumps.wikimedia.org"
    }""",
    redistribution_info=license: "This work is licensed under the Creative Commons Attribution-ShareAlike 3.0 Unported License. To view a copy of this l

In [None]:
# Print the dataset object's repr, i.e. the structure (shapes and dtypes) of
# one element rather than any article content.
print(dataset)

<PrefetchDataset shapes: {text: (), title: ()}, types: {text: tf.string, title: tf.string}>


In [None]:
# Number of articles in the 'train' split, taken from the dataset metadata.
# Later cells use it as the population size when sampling article indices.
TOTAL_NUM_EXAMPLES = dataset_info.splits['train'].num_examples
print(f'Total number of articles:  {TOTAL_NUM_EXAMPLES}')

Total number of articles:  5824596


In [None]:
# Preview one article (title and full text) as a sanity check of the data.
# skip()/take() fetch exactly the article at PREVIEW_INDEX instead of iterating
# over ten articles with a hand-maintained counter and discarding nine of them.
PREVIEW_INDEX = 1  # zero-based position of the article to display

for example in dataset.skip(PREVIEW_INDEX).take(1):
  # Features arrive as scalar tf.string tensors; .numpy() yields bytes that
  # have to be decoded before printing.
  print('Title:','\n------')
  print(example['title'].numpy().decode('utf-8'))
  print()
  print('Text:', '\n------')
  print(example['text'].numpy().decode('utf-8'))

Title: 
------
Pauline Donalda

Text: 
------
Pauline Donalda,  (March 5, 1882 – October 22, 1970) was a Canadian operatic soprano.

Early life and education
Donalda was born Pauline Lightstone in Montreal, Quebec, the daughter of Jewish parents who changed their surname from Lichtenstein to Lightstone after immigrating from Russia and Poland. She studied with Clara Lichtenstein (no relation) at Royal Victoria College, part of McGill University. In 1902, went to the Conservatoire de Paris on a grant from Donald Smith, Lord Strathcona, the patron of RVC.  There, she studied voice with Edmond Duvernoy.  She adopted the stage name Donalda in honour of her patron.

Career
With the help of composer Jules Massenet, Donalda made her debut in 1904 in Nice, singing the title role in his opera Manon.  The following year, she debuted in London, singing the role of Micaëla in Carmen. Donalda was the first to sing the roles of Concepción in Maurice Ravel's L'heure espagnole and Ah-joe in Franco Leo

In [None]:
# Draw the positions (zero-based, in dataset iteration order) of the articles
# that will make up the corpus sample.
sample_count = 100000
SAMPLE_SEED = 42  # fixed seed so that Restart & Run All selects the same articles

# range(TOTAL_NUM_EXAMPLES) covers every valid index 0..TOTAL_NUM_EXAMPLES-1; the
# previous upper bound of TOTAL_NUM_EXAMPLES-1 made the last article unselectable.
# min() keeps random.sample from raising ValueError when the dataset holds fewer
# articles than the requested sample size.
# A dedicated Random instance is used so the global RNG state is left untouched.
random_indices = random.Random(SAMPLE_SEED).sample(
    range(TOTAL_NUM_EXAMPLES),
    min(sample_count, TOTAL_NUM_EXAMPLES),
)

In [None]:
# Export the sampled articles as plain-text files corpus/corpus_partNNN, each
# holding up to `num_pages_in_file` articles separated by a newline.
import os
import shutil

directory_name = "corpus"

# Pure-Python cleanup of a previous run (replaces `!rm ... &> /dev/null`):
# it does not depend on which shell backs `!` (`&>` is a bash-only redirection)
# and stays silent when there is nothing to delete.
shutil.rmtree(directory_name, ignore_errors=True)
try:
  os.remove("corpus.zip")
except FileNotFoundError:
  pass
os.makedirs(directory_name, exist_ok=True)

num_pages_in_file = 1000
progress_step_percent = 5
next_progress_percent = progress_step_percent
total_examples = len(dataset)
page_counter = 0

# Membership is tested once per article of the whole dataset; a set makes each
# test O(1) instead of a linear scan through the list of sampled indices.
selected_indices = set(random_indices)

part_file = None
try:
  for counter, example in enumerate(dataset):

    # Integer comparison (equivalent to counter/total_examples > percent/100)
    # avoids the rounding drift of repeatedly adding 0.05 to a float.
    if counter * 100 > next_progress_percent * total_examples:
      print(str(next_progress_percent) + "% ", end='', flush=True)
      next_progress_percent += progress_step_percent

    if counter not in selected_indices:
      continue

    # Start a new part file every `num_pages_in_file` selected articles.
    # Testing the count *before* incrementing also works for
    # num_pages_in_file == 1, where the old `% n == 1` test never fired.
    if page_counter % num_pages_in_file == 0:
      if part_file is not None:
        part_file.close()
      file_counter_str = str(page_counter // num_pages_in_file + 1).zfill(3)
      file_path = os.path.join(directory_name, "corpus_part" + file_counter_str)
      part_file = open(file_path, mode="w", encoding="utf_8")

    page_counter += 1
    article_text = example['text'].numpy().decode('utf-8')
    part_file.write(article_text)
    part_file.write("\n")
finally:
  # Close the last part file even if iteration fails; part_file stays None
  # when no article was selected, so there is nothing to close in that case.
  if part_file is not None:
    part_file.close()

print("100%\n", flush=True)

5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 55% 60% 65% 70% 75% 80% 85% 90% 95% 100%



In [None]:
# Bundle the generated corpus directory into a single zip archive.
# NOTE(review): the paths here are absolute under /content, while the corpus
# directory is created relative to the working directory ("corpus"). The two
# only line up when the working directory is /content — confirm before
# running this anywhere else.
!zip -r /content/corpus.zip /content/corpus

In [None]:
# Hand the zipped corpus to google.colab's download helper.
# google.colab is provided by the Colab runtime, so this cell is Colab-specific.
from google.colab import files
files.download("/content/corpus.zip")