From 3471a1aeeb6b06e504f9c236d17237e1731ae6a4 Mon Sep 17 00:00:00 2001 From: Jithin James Date: Sat, 13 May 2023 16:56:37 +0530 Subject: [PATCH 1/3] docs: quickstart with pre-commit hooks --- .pre-commit-config.yaml | 15 + examples/data_prep.py | 44 ++ examples/quickstart.ipynb | 860 +++++++++++++++++++++++++++++--------- pyproject.toml | 5 + 4 files changed, 727 insertions(+), 197 deletions(-) create mode 100644 .pre-commit-config.yaml create mode 100644 examples/data_prep.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..aa8978dc9 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,15 @@ +# See https://pre-commit.com for more information +# See https://pre-commit.com/hooks.html for more hooks +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v3.2.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-added-large-files +- repo: https://github.com/psf/black + rev: 22.10.0 + hooks: + - id: black + - id: black-jupyter diff --git a/examples/data_prep.py b/examples/data_prep.py new file mode 100644 index 000000000..df2763104 --- /dev/null +++ b/examples/data_prep.py @@ -0,0 +1,44 @@ +from datasets import concatenate_datasets, load_dataset + + +def format_for_belar(row): + row["context"] = row["selftext"] + row["prompt"] = row["title"] + row["ground_truth"] = row["answers"]["text"] + return row + + +d = load_dataset("eli5") +ds = d["test_eli5"].map(format_for_belar, batched=False) +ds = ds.select_columns(["context", "prompt", "ground_truth"]) + +ds = ds.shuffle(seed=42).select(range(500)) +ds.shape, ds.column_names + +import concurrent.futures as f + +from langchain.llms import OpenAI + +llm = OpenAI() +prompt = """ +{context} +with the above context explain like I'm five: {prompt} +""" + + +def get_answers(row): + qs, cs = row["prompt"], row["context"] + + generated_answers = [] + with f.ThreadPoolExecutor(max_workers=10) as executor: + results = executor.map( + llm, [prompt.format(context=cs[i], prompt=qs[i]) for i in range(len(qs))] + ) + for result in results: + generated_answers.append(result) + + row["generated_answers"] = generated_answers + return row + + +ds = ds.map(get_answers, batched=True, batch_size=10) diff --git a/examples/quickstart.ipynb b/examples/quickstart.ipynb index 6effae29b..f726fcf23 100644 --- a/examples/quickstart.ipynb +++ b/examples/quickstart.ipynb @@ -1,11 +1,28 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "aeb5819b", + "metadata": {}, + "source": [ + "# Quickstart" + ] + }, { "cell_type": "code", - "execution_count": 7, - "id": "992c777a", + "execution_count": 30, + "id": "22c7dd25", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], "source": [ "%load_ext autoreload\n", "%autoreload 2" @@ -13,108 +30,89 @@ }, { "cell_type": "code", - "execution_count": 4, - "id": "5eaf4729", + "execution_count": 2, + "id": "0b5d4d41", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Found cached dataset eli5 (/home/jjmachan/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa)\n" + "Found cached dataset parquet (/home/jjmachan/.cache/huggingface/datasets/explodinggradients___parquet/explodinggradients--eli5-test-217d92ce20e19249/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n" ] }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "a2231863e61c4ffd8d695c8531a48139", - "version_major": 2, - "version_minor": 0 - }, "text/plain": [ - " 0%| | 0/9 [00:00 Date: Sat, 13 May 2023 17:32:14 +0530 Subject: [PATCH 2/3] remove pre-commit hooks --- .pre-commit-config.yaml | 15 --------------- pyproject.toml | 1 - 2 files changed, 16 deletions(-) delete mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml deleted file mode 100644 index aa8978dc9..000000000 --- a/.pre-commit-config.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# See https://pre-commit.com for more information -# See https://pre-commit.com/hooks.html for more hooks -repos: -- repo: https://github.com/pre-commit/pre-commit-hooks - rev: v3.2.0 - hooks: - - id: trailing-whitespace - - id: end-of-file-fixer - - id: check-yaml - - id: check-added-large-files -- repo: https://github.com/psf/black - rev: 22.10.0 - hooks: - - id: black - - id: black-jupyter diff --git a/pyproject.toml b/pyproject.toml index ed76e07b4..7ae0e0218 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,6 @@ dynamic = ["version", "readme"] [project.optional-dependencies] dev = [ - "pre-commit", ] [tool.setuptools.dynamic] From 92b11365f1cce66e2603e5d53efcc61ed931b9fa Mon Sep 17 00:00:00 2001 From: Jithin James Date: Sun, 14 May 2023 10:33:17 +0530 Subject: [PATCH 3/3] fix mistakes --- examples/data_prep.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/examples/data_prep.py b/examples/data_prep.py index df2763104..75305f659 100644 --- a/examples/data_prep.py +++ b/examples/data_prep.py @@ -1,4 +1,7 @@ -from datasets import concatenate_datasets, load_dataset +import concurrent.futures as f + +from datasets import DatasetDict, load_dataset +from langchain.llms import OpenAI def format_for_belar(row): @@ -9,17 +12,15 @@ def format_for_belar(row): d = load_dataset("eli5") +assert isinstance(d, DatasetDict) ds = d["test_eli5"].map(format_for_belar, batched=False) ds = ds.select_columns(["context", "prompt", "ground_truth"]) ds = ds.shuffle(seed=42).select(range(500)) -ds.shape, ds.column_names +print(ds.shape, ds.column_names) -import concurrent.futures as f - -from langchain.llms import OpenAI -llm = OpenAI() +llm = OpenAI() # type: ignore prompt = """ {context} with the above context explain like I'm five: {prompt}