add d2lbook saving mark for consecutive vars/functions
Aston Zhang committed Jan 6, 2020
1 parent 2c53e54, commit bca6cfb
Showing 7 changed files with 66 additions and 0 deletions.
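The mark in question is the `# Saved in the d2l package for later use` comment: the build tooling copies the code block that follows each mark from the chapter notebooks into `d2l/d2l.py`. The diffs below add a mark to the second of each pair of back-to-back definitions, presumably because the exporter treats each mark as introducing a single block, so consecutive vars/functions each need their own mark. As a rough illustration only (a hypothetical sketch, not d2lbook's actual implementation), such a mark-driven exporter could work like this:

```python
# Hypothetical sketch of a mark-driven exporter (NOT d2lbook's actual code):
# every block that directly follows the saving mark is copied into the library.
MARK = '# Saved in the d2l package for later use'

def extract_marked_blocks(source_lines):
    """Collect the code blocks that follow each saving mark."""
    blocks, i = [], 0
    while i < len(source_lines):
        if source_lines[i].strip() == MARK:
            j = i + 1
            # A block is assumed to end at a blank line or at the next mark,
            # so two consecutive definitions need one mark each.
            while j < len(source_lines) and source_lines[j].strip() not in ('', MARK):
                j += 1
            blocks.append('\n'.join(source_lines[i + 1:j]))
            i = j
        else:
            i += 1
    return blocks
```

Under a rule like that, the back-to-back pairs touched below (`DATA_HUB`/`DATA_URL`, the two `kaggle_house_*` entries, and each `DATA_HUB[...]` entry followed by its reader function) only end up in `d2l/d2l.py` if both members carry the mark, which is exactly what the diffs add.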
chapter_computer-vision/semantic-segmentation-and-dataset.md (2 additions, 0 deletions)
@@ -79,6 +79,8 @@ VOC_COLORMAP = [[0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0],
[64, 0, 128], [192, 0, 128], [64, 128, 128], [192, 128, 128],
[0, 64, 0], [128, 64, 0], [0, 192, 0], [128, 192, 0],
[0, 64, 128]]
# Saved in the d2l package for later use
VOC_CLASSES = ['background', 'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair', 'cow',
'diningtable', 'dog', 'horse', 'motorbike', 'person',
chapter_multilayer-perceptrons/kaggle-house-price.md (4 additions, 0 deletions)
@@ -35,6 +35,8 @@ import tarfile
# Saved in the d2l package for later use
DATA_HUB = dict()
# Saved in the d2l package for later use
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'
```

@@ -157,6 +159,8 @@ For convenience, we downloaded and saved the Kaggle dataset in the `DATA_URL` we
DATA_HUB['kaggle_house_train'] = (
DATA_URL+'kaggle_house_pred_train.csv',
'585e9cc93e70b39160e7921475f9bcd7d31219ce')
# Saved in the d2l package for later use
DATA_HUB['kaggle_house_test'] = (
DATA_URL+'kaggle_house_pred_test.csv',
'fa19780a7b011d9b009e8bff8e99922a8ee2eb90')
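With both Kaggle entries now marked, the `DATA_HUB`/`DATA_URL` pattern from this chapter becomes reachable from the package: each entry pairs a download URL with a SHA-1 hash, and `download` (its signature appears in the `d2l/d2l.py` diff further down) resolves a name to a local filename. A minimal usage sketch, assuming the package is imported as `d2l` as in the other notebook snippets in this commit; the `pandas` read is only illustrative and not part of this change:

```python
import d2l            # import name assumed from the notebook snippets above
import pandas as pd   # illustrative only; pandas is not touched by this commit

# download() looks the name up in DATA_HUB and returns the cached local file;
# the SHA-1 stored in each entry is presumably used to validate that copy.
train_file = d2l.download('kaggle_house_train')
test_file = d2l.download('kaggle_house_test')

train_data = pd.read_csv(train_file)
test_data = pd.read_csv(test_file)
print(train_data.shape, test_data.shape)
```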
chapter_natural-language-processing/word2vec-dataset.md (2 additions, 0 deletions)
@@ -22,6 +22,8 @@ This dataset has already been preprocessed. Each line of the dataset acts as a s
# Saved in the d2l package for later use
d2l.DATA_HUB['ptb'] = (d2l.DATA_URL+'ptb.zip',
'319d85e578af0cdc590547f26231e4e31cdf1e42')
# Saved in the d2l package for later use
def read_ptb():
data_dir = d2l.download_extract('ptb')
with open(data_dir+'ptb.train.txt') as f:
chapter_optimization/minibatch-sgd.md (2 additions, 0 deletions)
@@ -108,6 +108,8 @@ Let's have a look at how minibatches are efficiently generated from data. In the
# Saved in the d2l package for later use
d2l.DATA_HUB['airfoil'] = (d2l.DATA_URL+'airfoil_self_noise.dat',
'76e5be1548fd8222e5074cf0faae75edff8cf93f')
# Saved in the d2l package for later use
def get_data_ch11(batch_size=10, n=1500):
data = np.genfromtxt(d2l.download('airfoil'),
dtype=np.float32, delimiter='\t')
chapter_recommender-systems/movielens.md (2 additions, 0 deletions)
@@ -23,6 +23,8 @@ Then, we download the MovieLens 100k dataset and load the interactions as `DataF
d2l.DATA_HUB['ml-100k'] = (
'http://files.grouplens.org/datasets/movielens/ml-100k.zip',
'cd4dcac4241c8a4ad7badc7ca635da8a69dddb83')
# Saved in the d2l package for later use
def read_data_ml100k():
data_dir = d2l.download_extract('ml-100k')
names = ['user_id', 'item_id', 'rating', 'timestamp']
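For completeness, a short usage sketch of the newly marked MovieLens reader; `read_data_ml100k` is shown in full in the `d2l/d2l.py` diff below, and the user/item counts quoted in the comment are the documented sizes of the ML-100k dataset rather than output produced here:

```python
import d2l  # import name assumed from the notebook snippets above

# read_data_ml100k returns the interaction DataFrame plus the user/item counts.
data, num_users, num_items = d2l.read_data_ml100k()
print(num_users, num_items)   # ML-100k is documented as 943 users and 1682 items
print(data.head())
```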
chapter_recurrent-modern/machine-translation.md (2 additions, 0 deletions)
@@ -19,6 +19,8 @@ We first download a dataset that contains a set of English sentences with the co
# Saved in the d2l package for later use
d2l.DATA_HUB['fra-eng'] = (d2l.DATA_URL+'fra-eng.zip',
'94646ad1522d915e7b0f9296181140edcf86a4f5')
# Saved in the d2l package for later use
def read_data_nmt():
data_dir = d2l.download_extract('fra-eng')
with open(data_dir+'fra.txt', 'r') as f:
d2l/d2l.py (52 additions, 0 deletions)
@@ -320,6 +320,10 @@ def evaluate_loss(net, data_iter, loss):
DATA_HUB = dict()


# Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'


# Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
def download(name, cache_dir='../data'):
"""Download a file inserted into DATA_HUB, return the local filename"""
@@ -362,6 +366,12 @@ def download_all():
'585e9cc93e70b39160e7921475f9bcd7d31219ce')


# Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
DATA_HUB['kaggle_house_test'] = (
DATA_URL+'kaggle_house_pred_test.csv',
'fa19780a7b011d9b009e8bff8e99922a8ee2eb90')


# Defined in file: ./chapter_deep-learning-computation/use-gpu.md
def try_gpu(i=0):
"""Return gpu(i) if exists, otherwise return cpu()."""
@@ -712,6 +722,13 @@ def begin_state(self, *args, **kwargs):
'94646ad1522d915e7b0f9296181140edcf86a4f5')


# Defined in file: ./chapter_recurrent-modern/machine-translation.md
def read_data_nmt():
data_dir = d2l.download_extract('fra-eng')
with open(data_dir+'fra.txt', 'r') as f:
return f.read()


# Defined in file: ./chapter_recurrent-modern/machine-translation.md
def preprocess_nmt(text):
text = text.replace('\u202f', ' ').replace('\xa0', ' ')
@@ -1006,6 +1023,16 @@ def show_trace_2d(f, results):
'76e5be1548fd8222e5074cf0faae75edff8cf93f')


# Defined in file: ./chapter_optimization/minibatch-sgd.md
def get_data_ch11(batch_size=10, n=1500):
data = np.genfromtxt(d2l.download('airfoil'),
dtype=np.float32, delimiter='\t')
data = (data - data.mean(axis=0)) / data.std(axis=0)
data_iter = d2l.load_array(
(data[:n, :-1], data[:n, -1]), batch_size, is_train=True)
return data_iter, data.shape[1]-1


# Defined in file: ./chapter_optimization/minibatch-sgd.md
def train_ch11(trainer_fn, states, hyperparams, data_iter,
feature_dim, num_epochs=2):
@@ -1252,6 +1279,13 @@ def read_voc_images(voc_dir, is_train=True):
[0, 64, 128]]


# Defined in file: ./chapter_computer-vision/semantic-segmentation-and-dataset.md
VOC_CLASSES = ['background', 'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair', 'cow',
'diningtable', 'dog', 'horse', 'motorbike', 'person',
'potted plant', 'sheep', 'sofa', 'train', 'tv/monitor']


# Defined in file: ./chapter_computer-vision/semantic-segmentation-and-dataset.md
def build_colormap2label():
"""Build an RGB color to label mapping for segmentation."""
@@ -1391,6 +1425,14 @@ def reorg_test(data_dir):
'319d85e578af0cdc590547f26231e4e31cdf1e42')


# Defined in file: ./chapter_natural-language-processing/word2vec-dataset.md
def read_ptb():
data_dir = d2l.download_extract('ptb')
with open(data_dir+'ptb.train.txt') as f:
raw_text = f.read()
return [line.split() for line in raw_text.split('\n')]


# Defined in file: ./chapter_natural-language-processing/word2vec-dataset.md
def subsampling(sentences, vocab):
# Map low frequency words into <unk>
@@ -1542,6 +1584,16 @@ def predict_sentiment(net, vocab, sentence):
'cd4dcac4241c8a4ad7badc7ca635da8a69dddb83')


# Defined in file: ./chapter_recommender-systems/movielens.md
def read_data_ml100k():
data_dir = d2l.download_extract('ml-100k')
names = ['user_id', 'item_id', 'rating', 'timestamp']
data = pd.read_csv(data_dir+'u.data', '\t', names=names, engine='python')
num_users = data.user_id.unique().shape[0]
num_items = data.item_id.unique().shape[0]
return data, num_users, num_items


# Defined in file: ./chapter_recommender-systems/movielens.md
def split_data_ml100k(data, num_users, num_items,
split_mode="random", test_ratio=0.1):
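Taken together, the 52 additions above mean the marked definitions now live in `d2l/d2l.py`, so later chapters can reach them through the package instead of redefining them. A minimal sketch, again assuming the `import d2l` convention the notebook snippets in this commit use:

```python
import d2l

# Each helper below is among the definitions this commit copies into d2l/d2l.py.
sentences = d2l.read_ptb()                    # list of token lists, one per PTB line
data_iter, feature_dim = d2l.get_data_ch11()  # airfoil minibatch iterator + feature count
raw_text = d2l.read_data_nmt()                # raw English-French corpus as one string
print(len(sentences), feature_dim, len(raw_text))
```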
