Add CI check for trailing newlines in notebooks
The rendered docs don't look good when code cells have trailing
newlines.
anishathalye committed Apr 5, 2022
1 parent 6cac342 commit a903c4c
Showing 5 changed files with 79 additions and 64 deletions.
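The notebook-side changes below are mechanical: in each .ipynb file, the last entry of a code cell's "source" list loses its trailing "\n", which is what was adding a blank line to the rendered docs. As a rough illustration only (the cell contents are borrowed from the audio tutorial hunk further down; the surrounding dict structure is the standard notebook JSON layout and is not copied from this commit):

# Sketch of the per-cell change, written as Python dicts mirroring the notebook JSON.
cell_before = {
    "cell_type": "code",
    "source": [
        "print(embeddings_array)\n",
        'print("Shape of array: ", embeddings_array.shape)\n',  # trailing "\n" renders as an extra blank line
    ],
}
cell_after = {
    "cell_type": "code",
    "source": [
        "print(embeddings_array)\n",
        'print("Shape of array: ", embeddings_array.shape)',  # last line no longer ends with "\n"
    ],
}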
15 changes: 15 additions & 0 deletions .ci/nblint.py
@@ -43,6 +43,7 @@ def check(notebook):
    with open(notebook) as f:
        contents = json.load(f)
    check_outputs_empty(notebook, contents)
    check_no_trailing_newline(notebook, contents)


def check_outputs_empty(path, contents):
@@ -51,6 +52,20 @@ def check_outputs_empty(path, contents):
        fail(path, "output is not empty", i)


def check_no_trailing_newline(path, contents):
    """
    Checks that the last line of a code cell doesn't end with a newline, which
    produces an unnecessary newline in the doc rendering.
    """
    for i, cell in enumerate(contents["cells"]):
        if cell["cell_type"] != "code":
            continue
        if "source" not in cell or len(cell["source"]) == 0:
            fail(path, "code cell is empty", i)
        if cell["source"][-1].endswith("\n"):
            fail(path, "unnecessary trailing newline", i)


def fail(path, message, cell=None):
    cell_msg = f" [cell {cell}]" if cell is not None else ""
    print(f"{path}{cell_msg}: {message}")
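The docstring above states the rule the new check enforces. As a standalone sketch of the same idea (this is not the CI script itself; it assumes the usual .ipynb layout of a top-level "cells" list whose code cells carry a "source" list of strings):

import json

def find_trailing_newline_issues(path):
    """Return (cell_index, message) pairs that the linter's rule would flag."""
    with open(path) as f:
        contents = json.load(f)
    issues = []
    for i, cell in enumerate(contents["cells"]):
        if cell["cell_type"] != "code":
            continue
        source = cell.get("source", [])
        if not source:
            issues.append((i, "code cell is empty"))
        elif source[-1].endswith("\n"):
            issues.append((i, "unnecessary trailing newline"))
    return issues

# Example invocation on one of the tutorials touched by this commit:
# print(find_trailing_newline_issues("docs/source/tutorials/audio.ipynb"))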
32 changes: 16 additions & 16 deletions docs/source/tutorials/audio.ipynb
@@ -112,7 +112,7 @@
"set_seed(SEED)\n",
"tf.get_logger().setLevel('ERROR') # suppress TF warnings \n",
"pd.options.display.max_colwidth = 500\n",
"os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"3\" # Suppress TF info, warnings and errors\n"
"os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"3\" # Suppress TF info, warnings and errors"
]
},
{
@@ -191,7 +191,7 @@
" file_paths += [os.path.join(dirpath, file) for file in filenames if file.endswith(\".wav\")]\n",
"\n",
"# Check out first 3 files\n",
"file_paths[:3]\n"
"file_paths[:3]"
]
},
{
@@ -242,7 +242,7 @@
" wav_file_example = load_wav_16k_mono(wav_file_name)\n",
" label = Path(wav_file_name).parts[-1].split(\"_\")[0]\n",
" print(f\"Given label for this example: {label}\")\n",
" display.display(display.Audio(wav_file_example, rate=audio_rate))\n"
" display.display(display.Audio(wav_file_example, rate=audio_rate))"
]
},
{
@@ -277,7 +277,7 @@
"outputs": [],
"source": [
"wav_file_name_example = \"spoken_digits/free-spoken-digit-dataset-1.0.9/recordings/7_jackson_43.wav\" # change this to hear other examples\n",
"display_example(wav_file_name_example)\n"
"display_example(wav_file_name_example)"
]
},
{
@@ -335,7 +335,7 @@
"df = pd.DataFrame(file_paths, columns=[\"wav_audio_file_path\"])\n",
"df[\"label\"] = df.wav_audio_file_path.map(lambda x: int(Path(x).parts[-1].split(\"_\")[0]))\n",
"\n",
"df.head(3)\n"
"df.head(3)"
]
},
{
@@ -355,7 +355,7 @@
" embeddings = model.encode_batch(\n",
" signal\n",
" ) # Pass tensor through pretrained neural net and extract representation\n",
" return embeddings\n"
" return embeddings"
]
},
{
@@ -372,7 +372,7 @@
" embeddings = extract_audio_embeddings(feature_extractor, file_name)\n",
" embeddings_list.append(embeddings.cpu().numpy())\n",
"\n",
"embeddings_array = np.squeeze(np.array(embeddings_list))\n"
"embeddings_array = np.squeeze(np.array(embeddings_list))"
]
},
{
@@ -397,7 +397,7 @@
"outputs": [],
"source": [
"print(embeddings_array)\n",
"print(\"Shape of array: \", embeddings_array.shape)\n"
"print(\"Shape of array: \", embeddings_array.shape)"
]
},
{
@@ -438,7 +438,7 @@
"# Generate cross-validated predicted probabilities for each datapoint\n",
"cv_pred_probs = cross_val_predict(\n",
" estimator=model, X=embeddings_array, y=df.label.values, cv=5, method=\"predict_proba\"\n",
")\n"
")"
]
},
{
@@ -466,7 +466,7 @@
"\n",
"predicted_labels = cv_pred_probs.argmax(axis=1)\n",
"cv_accuracy = accuracy_score(df.label.values, predicted_labels)\n",
"print(f\"Cross-validated estimate of accuracy on held-out data: {cv_accuracy}\")\n"
"print(f\"Cross-validated estimate of accuracy on held-out data: {cv_accuracy}\")"
]
},
{
@@ -507,7 +507,7 @@
" return_indices_ranked_by=\"self_confidence\", # ranks the label issues\n",
")\n",
"\n",
"print(label_issues_indices)\n"
"print(label_issues_indices)"
]
},
{
@@ -532,7 +532,7 @@
},
"outputs": [],
"source": [
"df.iloc[label_issues_indices]\n"
"df.iloc[label_issues_indices]"
]
},
{
@@ -567,7 +567,7 @@
"outputs": [],
"source": [
"wav_file_name_example = \"spoken_digits/free-spoken-digit-dataset-1.0.9/recordings/6_yweweler_14.wav\"\n",
"display_example(wav_file_name_example)\n"
"display_example(wav_file_name_example)"
]
},
{
@@ -593,7 +593,7 @@
"outputs": [],
"source": [
"wav_file_name_example = \"spoken_digits/free-spoken-digit-dataset-1.0.9/recordings/6_yweweler_36.wav\"\n",
"display_example(wav_file_name_example)\n"
"display_example(wav_file_name_example)"
]
},
{
@@ -610,7 +610,7 @@
"outputs": [],
"source": [
"wav_file_name_example = \"spoken_digits/free-spoken-digit-dataset-1.0.9/recordings/6_yweweler_35.wav\"\n",
"display_example(wav_file_name_example)\n"
"display_example(wav_file_name_example)"
]
},
{
@@ -627,7 +627,7 @@
"outputs": [],
"source": [
"wav_file_name_example = \"spoken_digits/free-spoken-digit-dataset-1.0.9/recordings/6_nicolas_8.wav\"\n",
"display_example(wav_file_name_example)\n"
"display_example(wav_file_name_example)"
]
},
{
24 changes: 12 additions & 12 deletions docs/source/tutorials/image.ipynb
@@ -68,7 +68,7 @@
" if len(missing_dependencies) > 0:\n",
" print(\"Missing required dependencies:\")\n",
" print(*missing_dependencies, sep=\", \")\n",
" print(\"\\nPlease install them before running the rest of this notebook.\")\n"
" print(\"\\nPlease install them before running the rest of this notebook.\")"
]
},
{
@@ -91,7 +91,7 @@
"X = mnist.data.astype(\"float32\").to_numpy() # 2D numpy array of image features\n",
"X /= 255.0 # Scale the features to the [0, 1] range\n",
"\n",
"y = mnist.target.astype(\"int64\").to_numpy() # 1D numpy array of the image labels\n"
"y = mnist.target.astype(\"int64\").to_numpy() # 1D numpy array of the image labels"
]
},
{
@@ -134,7 +134,7 @@
" nn.Dropout(0.5),\n",
" nn.Linear(128, 10),\n",
" nn.Softmax(dim=-1),\n",
")\n"
")"
]
},
{
@@ -159,7 +159,7 @@
"source": [
"from skorch import NeuralNetClassifier\n",
"\n",
"model_skorch = NeuralNetClassifier(model)\n"
"model_skorch = NeuralNetClassifier(model)"
]
},
{
@@ -184,7 +184,7 @@
"source": [
"from sklearn.model_selection import cross_val_predict\n",
"\n",
"pred_probs = cross_val_predict(model_skorch, X, y, cv=3, method=\"predict_proba\")\n"
"pred_probs = cross_val_predict(model_skorch, X, y, cv=3, method=\"predict_proba\")"
]
},
{
@@ -209,7 +209,7 @@
"source": [
"from cleanlab.filter import find_label_issues\n",
"\n",
"ranked_label_issues = find_label_issues(y, pred_probs, return_indices_ranked_by=\"self_confidence\")\n"
"ranked_label_issues = find_label_issues(y, pred_probs, return_indices_ranked_by=\"self_confidence\")"
]
},
{
@@ -226,7 +226,7 @@
"outputs": [],
"source": [
"print(f\"Cleanlab found {len(ranked_label_issues)} potential label errors.\")\n",
"print(f\"Here are the indices of the top 15 most likely ones: \\n {ranked_label_issues[:15]}\")\n"
"print(f\"Here are the indices of the top 15 most likely ones: \\n {ranked_label_issues[:15]}\")"
]
},
{
@@ -262,7 +262,7 @@
" plt.title(f\"id: {id} \\n label: {y[id]}\")\n",
" plt.axis(\"off\")\n",
"\n",
" plt.tight_layout(h_pad=2.0)\n"
" plt.tight_layout(h_pad=2.0)"
]
},
{
@@ -287,7 +287,7 @@
"metadata": {},
"outputs": [],
"source": [
"plot_examples(ranked_label_issues[range(15)], 3, 5)\n"
"plot_examples(ranked_label_issues[range(15)], 3, 5)"
]
},
{
@@ -310,7 +310,7 @@
"metadata": {},
"outputs": [],
"source": [
"plot_examples([59915])\n"
"plot_examples([59915])"
]
},
{
@@ -326,7 +326,7 @@
"metadata": {},
"outputs": [],
"source": [
"plot_examples([24798])\n"
"plot_examples([24798])"
]
},
{
@@ -342,7 +342,7 @@
"metadata": {},
"outputs": [],
"source": [
"plot_examples([18598, 1352, 61247], 1, 3)\n"
"plot_examples([18598, 1352, 61247], 1, 3)"
]
},
{
26 changes: 13 additions & 13 deletions docs/source/tutorials/tabular.ipynb
@@ -111,7 +111,7 @@
"\n",
"data = fetch_openml(\"credit-g\") # get the credit data from OpenML\n",
"X_raw = data.data # features (pandas DataFrame)\n",
"y_raw = data.target # labels (pandas Series)\n"
"y_raw = data.target # labels (pandas Series)"
]
},
{
@@ -138,7 +138,7 @@
"X_scaled = X_encoded.copy()\n",
"X_scaled[num_features] = scaler.fit_transform(X_encoded[num_features])\n",
"\n",
"y = y_raw.map({\"bad\": 0, \"good\": 1}) # encode labels as integers\n"
"y = y_raw.map({\"bad\": 0, \"good\": 1}) # encode labels as integers"
]
},
{
@@ -175,7 +175,7 @@
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"clf = LogisticRegression()\n"
"clf = LogisticRegression()"
]
},
{
@@ -196,7 +196,7 @@
"from sklearn.model_selection import cross_val_predict\n",
"\n",
"num_crossval_folds = 3\n",
"pred_probs = cross_val_predict(clf, X_scaled, y, cv=num_crossval_folds, method=\"predict_proba\")\n"
"pred_probs = cross_val_predict(clf, X_scaled, y, cv=num_crossval_folds, method=\"predict_proba\")"
]
},
{
@@ -225,7 +225,7 @@
" labels=y, pred_probs=pred_probs, return_indices_ranked_by=\"self_confidence\"\n",
")\n",
"\n",
"print(f\"Cleanlab found {len(ranked_label_issues)} potential label errors.\")\n"
"print(f\"Cleanlab found {len(ranked_label_issues)} potential label errors.\")"
]
},
{
@@ -241,7 +241,7 @@
"metadata": {},
"outputs": [],
"source": [
"X_raw.iloc[ranked_label_issues].assign(label=y_raw.iloc[ranked_label_issues]).head()\n"
"X_raw.iloc[ranked_label_issues].assign(label=y_raw.iloc[ranked_label_issues]).head()"
]
},
{
@@ -273,7 +273,7 @@
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.25, random_state=SEED)\n"
"X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.25, random_state=SEED)"
]
},
{
@@ -296,7 +296,7 @@
"X_train = X_train.to_numpy()\n",
"y_train = y_train.to_numpy()\n",
"X_test = X_test.to_numpy()\n",
"y_test = y_test.to_numpy()\n"
"y_test = y_test.to_numpy()"
]
},
{
@@ -316,7 +316,7 @@
"\n",
"clf.fit(X_train, y_train)\n",
"acc = clf.score(X_test, y_test)\n",
"print(f\"Test accuracy of original logistic regression: {acc}\")\n"
"print(f\"Test accuracy of original logistic regression: {acc}\")"
]
},
{
@@ -335,7 +335,7 @@
"from cleanlab.classification import LearningWithNoisyLabels\n",
"\n",
"clf = LogisticRegression() # Note we first re-initialize clf\n",
"lnl = LearningWithNoisyLabels(clf) # lnl has same methods/attributes as clf\n"
"lnl = LearningWithNoisyLabels(clf) # lnl has same methods/attributes as clf"
]
},
{
@@ -351,7 +351,7 @@
"metadata": {},
"outputs": [],
"source": [
"_ = lnl.fit(X_train, y_train)\n"
"_ = lnl.fit(X_train, y_train)"
]
},
{
@@ -369,7 +369,7 @@
"source": [
"preds = lnl.predict(X_test)\n",
"acc = accuracy_score(y_test, preds)\n",
"print(f\"Test accuracy of cleanlab's logistic regression: {acc}\")\n"
"print(f\"Test accuracy of cleanlab's logistic regression: {acc}\")"
]
},
{
@@ -399,7 +399,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.9.10"
}
},
"nbformat": 4,
