Add CI check for trailing newlines in notebooks
The rendered docs don't look good when code cells have trailing
newlines.
anishathalye committed Apr 5, 2022
1 parent 6cac342 commit a903c4c
Showing 5 changed files with 79 additions and 64 deletions.
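The notebook-side changes below are mechanical: in each .ipynb file, the last entry of a code cell's "source" list loses its trailing "\n", which is what was adding a blank line to the rendered docs. As a rough illustration only (the cell contents are borrowed from the audio tutorial hunk further down; the surrounding dict structure is the standard notebook JSON layout and is not copied from this commit):

# Sketch of the per-cell change, written as Python dicts mirroring the notebook JSON.
cell_before = {
    "cell_type": "code",
    "source": [
        "print(embeddings_array)\n",
        'print("Shape of array: ", embeddings_array.shape)\n',  # trailing "\n" renders as an extra blank line
    ],
}
cell_after = {
    "cell_type": "code",
    "source": [
        "print(embeddings_array)\n",
        'print("Shape of array: ", embeddings_array.shape)',  # last line no longer ends with "\n"
    ],
}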
15 changes: 15 additions & 0 deletions .ci/nblint.py
@@ -43,6 +43,7 @@ def check(notebook):
    with open(notebook) as f:
        contents = json.load(f)
    check_outputs_empty(notebook, contents)
    check_no_trailing_newline(notebook, contents)


def check_outputs_empty(path, contents):
@@ -51,6 +52,20 @@ def check_outputs_empty(path, contents):
        fail(path, "output is not empty", i)


def check_no_trailing_newline(path, contents):
    """
    Checks that the last line of a code cell doesn't end with a newline, which
    produces an unnecessary newline in the doc rendering.
    """
    for i, cell in enumerate(contents["cells"]):
        if cell["cell_type"] != "code":
            continue
        if "source" not in cell or len(cell["source"]) == 0:
            fail(path, "code cell is empty", i)
        if cell["source"][-1].endswith("\n"):
            fail(path, "unnecessary trailing newline", i)


def fail(path, message, cell=None):
    cell_msg = f" [cell {cell}]" if cell is not None else ""
    print(f"{path}{cell_msg}: {message}")
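The docstring above states the rule the new check enforces. As a standalone sketch of the same idea (this is not the CI script itself; it assumes the usual .ipynb layout of a top-level "cells" list whose code cells carry a "source" list of strings):

import json

def find_trailing_newline_issues(path):
    """Return (cell_index, message) pairs that the linter's rule would flag."""
    with open(path) as f:
        contents = json.load(f)
    issues = []
    for i, cell in enumerate(contents["cells"]):
        if cell["cell_type"] != "code":
            continue
        source = cell.get("source", [])
        if not source:
            issues.append((i, "code cell is empty"))
        elif source[-1].endswith("\n"):
            issues.append((i, "unnecessary trailing newline"))
    return issues

# Example invocation on one of the tutorials touched by this commit:
# print(find_trailing_newline_issues("docs/source/tutorials/audio.ipynb"))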
32 changes: 16 additions & 16 deletions docs/source/tutorials/audio.ipynb
@@ -112,7 +112,7 @@
"set_seed(SEED)\n",
"tf.get_logger().setLevel('ERROR') # suppress TF warnings \n",
"pd.options.display.max_colwidth = 500\n",
"os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"3\" # Suppress TF info, warnings and errors\n"
"os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"3\" # Suppress TF info, warnings and errors"
]
},
{
@@ -191,7 +191,7 @@
" file_paths += [os.path.join(dirpath, file) for file in filenames if file.endswith(\".wav\")]\n",
"\n",
"# Check out first 3 files\n",
"file_paths[:3]\n"
"file_paths[:3]"
]
},
{
@@ -242,7 +242,7 @@
" wav_file_example = load_wav_16k_mono(wav_file_name)\n",
" label = Path(wav_file_name).parts[-1].split(\"_\")[0]\n",
" print(f\"Given label for this example: {label}\")\n",
" display.display(display.Audio(wav_file_example, rate=audio_rate))\n"
" display.display(display.Audio(wav_file_example, rate=audio_rate))"
]
},
{
@@ -277,7 +277,7 @@
"outputs": [],
"source": [
"wav_file_name_example = \"spoken_digits/free-spoken-digit-dataset-1.0.9/recordings/7_jackson_43.wav\" # change this to hear other examples\n",
"display_example(wav_file_name_example)\n"
"display_example(wav_file_name_example)"
]
},
{
@@ -335,7 +335,7 @@
"df = pd.DataFrame(file_paths, columns=[\"wav_audio_file_path\"])\n",
"df[\"label\"] = df.wav_audio_file_path.map(lambda x: int(Path(x).parts[-1].split(\"_\")[0]))\n",
"\n",
"df.head(3)\n"
"df.head(3)"
]
},
{
@@ -355,7 +355,7 @@
" embeddings = model.encode_batch(\n",
" signal\n",
" ) # Pass tensor through pretrained neural net and extract representation\n",
" return embeddings\n"
" return embeddings"
]
},
{
@@ -372,7 +372,7 @@
" embeddings = extract_audio_embeddings(feature_extractor, file_name)\n",
" embeddings_list.append(embeddings.cpu().numpy())\n",
"\n",
"embeddings_array = np.squeeze(np.array(embeddings_list))\n"
"embeddings_array = np.squeeze(np.array(embeddings_list))"
]
},
{
@@ -397,7 +397,7 @@
"outputs": [],
"source": [
"print(embeddings_array)\n",
"print(\"Shape of array: \", embeddings_array.shape)\n"
"print(\"Shape of array: \", embeddings_array.shape)"
]
},
{
@@ -438,7 +438,7 @@
"# Generate cross-validated predicted probabilities for each datapoint\n",
"cv_pred_probs = cross_val_predict(\n",
" estimator=model, X=embeddings_array, y=df.label.values, cv=5, method=\"predict_proba\"\n",
")\n"
")"
]
},
{
@@ -466,7 +466,7 @@
"\n",
"predicted_labels = cv_pred_probs.argmax(axis=1)\n",
"cv_accuracy = accuracy_score(df.label.values, predicted_labels)\n",
"print(f\"Cross-validated estimate of accuracy on held-out data: {cv_accuracy}\")\n"
"print(f\"Cross-validated estimate of accuracy on held-out data: {cv_accuracy}\")"
]
},
{
@@ -507,7 +507,7 @@
" return_indices_ranked_by=\"self_confidence\", # ranks the label issues\n",
")\n",
"\n",
"print(label_issues_indices)\n"
"print(label_issues_indices)"
]
},
{
@@ -532,7 +532,7 @@
},
"outputs": [],
"source": [
"df.iloc[label_issues_indices]\n"
"df.iloc[label_issues_indices]"
]
},
{
@@ -567,7 +567,7 @@
"outputs": [],
"source": [
"wav_file_name_example = \"spoken_digits/free-spoken-digit-dataset-1.0.9/recordings/6_yweweler_14.wav\"\n",
"display_example(wav_file_name_example)\n"
"display_example(wav_file_name_example)"
]
},
{
@@ -593,7 +593,7 @@
"outputs": [],
"source": [
"wav_file_name_example = \"spoken_digits/free-spoken-digit-dataset-1.0.9/recordings/6_yweweler_36.wav\"\n",
"display_example(wav_file_name_example)\n"
"display_example(wav_file_name_example)"
]
},
{
@@ -610,7 +610,7 @@
"outputs": [],
"source": [
"wav_file_name_example = \"spoken_digits/free-spoken-digit-dataset-1.0.9/recordings/6_yweweler_35.wav\"\n",
"display_example(wav_file_name_example)\n"
"display_example(wav_file_name_example)"
]
},
{
@@ -627,7 +627,7 @@
"outputs": [],
"source": [
"wav_file_name_example = \"spoken_digits/free-spoken-digit-dataset-1.0.9/recordings/6_nicolas_8.wav\"\n",
"display_example(wav_file_name_example)\n"
"display_example(wav_file_name_example)"
]
},
{
24 changes: 12 additions & 12 deletions docs/source/tutorials/image.ipynb
@@ -68,7 +68,7 @@
" if len(missing_dependencies) > 0:\n",
" print(\"Missing required dependencies:\")\n",
" print(*missing_dependencies, sep=\", \")\n",
" print(\"\\nPlease install them before running the rest of this notebook.\")\n"
" print(\"\\nPlease install them before running the rest of this notebook.\")"
]
},
{
@@ -91,7 +91,7 @@
"X = mnist.data.astype(\"float32\").to_numpy() # 2D numpy array of image features\n",
"X /= 255.0 # Scale the features to the [0, 1] range\n",
"\n",
"y = mnist.target.astype(\"int64\").to_numpy() # 1D numpy array of the image labels\n"
"y = mnist.target.astype(\"int64\").to_numpy() # 1D numpy array of the image labels"
]
},
{
@@ -134,7 +134,7 @@
" nn.Dropout(0.5),\n",
" nn.Linear(128, 10),\n",
" nn.Softmax(dim=-1),\n",
")\n"
")"
]
},
{
@@ -159,7 +159,7 @@
"source": [
"from skorch import NeuralNetClassifier\n",
"\n",
"model_skorch = NeuralNetClassifier(model)\n"
"model_skorch = NeuralNetClassifier(model)"
]
},
{
@@ -184,7 +184,7 @@
"source": [
"from sklearn.model_selection import cross_val_predict\n",
"\n",
"pred_probs = cross_val_predict(model_skorch, X, y, cv=3, method=\"predict_proba\")\n"
"pred_probs = cross_val_predict(model_skorch, X, y, cv=3, method=\"predict_proba\")"
]
},
{
@@ -209,7 +209,7 @@
"source": [
"from cleanlab.filter import find_label_issues\n",
"\n",
"ranked_label_issues = find_label_issues(y, pred_probs, return_indices_ranked_by=\"self_confidence\")\n"
"ranked_label_issues = find_label_issues(y, pred_probs, return_indices_ranked_by=\"self_confidence\")"
]
},
{
@@ -226,7 +226,7 @@
"outputs": [],
"source": [
"print(f\"Cleanlab found {len(ranked_label_issues)} potential label errors.\")\n",
"print(f\"Here are the indices of the top 15 most likely ones: \\n {ranked_label_issues[:15]}\")\n"
"print(f\"Here are the indices of the top 15 most likely ones: \\n {ranked_label_issues[:15]}\")"
]
},
{
@@ -262,7 +262,7 @@
" plt.title(f\"id: {id} \\n label: {y[id]}\")\n",
" plt.axis(\"off\")\n",
"\n",
" plt.tight_layout(h_pad=2.0)\n"
" plt.tight_layout(h_pad=2.0)"
]
},
{
@@ -287,7 +287,7 @@
"metadata": {},
"outputs": [],
"source": [
"plot_examples(ranked_label_issues[range(15)], 3, 5)\n"
"plot_examples(ranked_label_issues[range(15)], 3, 5)"
]
},
{
@@ -310,7 +310,7 @@
"metadata": {},
"outputs": [],
"source": [
"plot_examples([59915])\n"
"plot_examples([59915])"
]
},
{
@@ -326,7 +326,7 @@
"metadata": {},
"outputs": [],
"source": [
"plot_examples([24798])\n"
"plot_examples([24798])"
]
},
{
@@ -342,7 +342,7 @@
"metadata": {},
"outputs": [],
"source": [
"plot_examples([18598, 1352, 61247], 1, 3)\n"
"plot_examples([18598, 1352, 61247], 1, 3)"
]
},
{
26 changes: 13 additions & 13 deletions docs/source/tutorials/tabular.ipynb
@@ -111,7 +111,7 @@
"\n",
"data = fetch_openml(\"credit-g\") # get the credit data from OpenML\n",
"X_raw = data.data # features (pandas DataFrame)\n",
"y_raw = data.target # labels (pandas Series)\n"
"y_raw = data.target # labels (pandas Series)"
]
},
{
@@ -138,7 +138,7 @@
"X_scaled = X_encoded.copy()\n",
"X_scaled[num_features] = scaler.fit_transform(X_encoded[num_features])\n",
"\n",
"y = y_raw.map({\"bad\": 0, \"good\": 1}) # encode labels as integers\n"
"y = y_raw.map({\"bad\": 0, \"good\": 1}) # encode labels as integers"
]
},
{
@@ -175,7 +175,7 @@
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"clf = LogisticRegression()\n"
"clf = LogisticRegression()"
]
},
{
@@ -196,7 +196,7 @@
"from sklearn.model_selection import cross_val_predict\n",
"\n",
"num_crossval_folds = 3\n",
"pred_probs = cross_val_predict(clf, X_scaled, y, cv=num_crossval_folds, method=\"predict_proba\")\n"
"pred_probs = cross_val_predict(clf, X_scaled, y, cv=num_crossval_folds, method=\"predict_proba\")"
]
},
{
@@ -225,7 +225,7 @@
" labels=y, pred_probs=pred_probs, return_indices_ranked_by=\"self_confidence\"\n",
")\n",
"\n",
"print(f\"Cleanlab found {len(ranked_label_issues)} potential label errors.\")\n"
"print(f\"Cleanlab found {len(ranked_label_issues)} potential label errors.\")"
]
},
{
@@ -241,7 +241,7 @@
"metadata": {},
"outputs": [],
"source": [
"X_raw.iloc[ranked_label_issues].assign(label=y_raw.iloc[ranked_label_issues]).head()\n"
"X_raw.iloc[ranked_label_issues].assign(label=y_raw.iloc[ranked_label_issues]).head()"
]
},
{
@@ -273,7 +273,7 @@
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.25, random_state=SEED)\n"
"X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.25, random_state=SEED)"
]
},
{
@@ -296,7 +296,7 @@
"X_train = X_train.to_numpy()\n",
"y_train = y_train.to_numpy()\n",
"X_test = X_test.to_numpy()\n",
"y_test = y_test.to_numpy()\n"
"y_test = y_test.to_numpy()"
]
},
{
@@ -316,7 +316,7 @@
"\n",
"clf.fit(X_train, y_train)\n",
"acc = clf.score(X_test, y_test)\n",
"print(f\"Test accuracy of original logistic regression: {acc}\")\n"
"print(f\"Test accuracy of original logistic regression: {acc}\")"
]
},
{
@@ -335,7 +335,7 @@
"from cleanlab.classification import LearningWithNoisyLabels\n",
"\n",
"clf = LogisticRegression() # Note we first re-initialize clf\n",
"lnl = LearningWithNoisyLabels(clf) # lnl has same methods/attributes as clf\n"
"lnl = LearningWithNoisyLabels(clf) # lnl has same methods/attributes as clf"
]
},
{
@@ -351,7 +351,7 @@
"metadata": {},
"outputs": [],
"source": [
"_ = lnl.fit(X_train, y_train)\n"
"_ = lnl.fit(X_train, y_train)"
]
},
{
@@ -369,7 +369,7 @@
"source": [
"preds = lnl.predict(X_test)\n",
"acc = accuracy_score(y_test, preds)\n",
"print(f\"Test accuracy of cleanlab's logistic regression: {acc}\")\n"
"print(f\"Test accuracy of cleanlab's logistic regression: {acc}\")"
]
},
{
@@ -399,7 +399,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.9.10"
}
},
"nbformat": 4,
