In [59]:
import pandas as pd
import multiprocessing
import torch
from transformers import AutoTokenizer, AutoModel
from datasets import Dataset
from datasets import load_dataset

In [2]:
num_cores = multiprocessing.cpu_count()
num_cores_avail = max(1, num_cores - 1)

# Download data

In [3]:
# Just get the training split
issues_dataset = load_dataset("lewtun/github-issues", split="train")

Repo card metadata block was not found. Setting CardData to empty.


In [4]:
issues_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 3019
})

# Prep data

In [5]:
issues_dataset = issues_dataset.filter(
    lambda x: (x["is_pull_request"] == False and len(x["comments"]) > 0)
)

In [6]:
issues_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 808
})

In [7]:
cols = issues_dataset.column_names
keeper_cols = ["title", "body", "html_url", "comments"]
drop_cols = set(keeper_cols).symmetric_difference(cols)
issues_dataset = issues_dataset.remove_columns(drop_cols)

In [8]:
issues_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 808
})

## 'Explode' data to have 1 row per comment

In [9]:
issues_dataset.set_format("pandas")
df = issues_dataset[:]

In [10]:
df["comments"][0].tolist()

['Cool, I think we can do both :)',
 '@lhoestq now the 2 are implemented.\r\n\r\nPlease note that for the the second protection, finally I have chosen to protect the master branch only from **merge commits** (see update comment above), so no need to disable/re-enable the protection on each release (direct commits, different from merge commits, can be pushed to the remote master branch; and eventually reverted without messing up the repo history).']

In [11]:
comments_df = df.explode("comments", ignore_index=True)
comments_df.head(4)

Unnamed: 0,html_url,title,comments,body
0,https://github.com/huggingface/datasets/issues...,Protect master branch,"Cool, I think we can do both :)",After accidental merge commit (91c55355b634d0d...
1,https://github.com/huggingface/datasets/issues...,Protect master branch,@lhoestq now the 2 are implemented.\r\n\r\nPle...,After accidental merge commit (91c55355b634d0d...
2,https://github.com/huggingface/datasets/issues...,Backwards compatibility broken for cached data...,Hi ! I guess the caching mechanism should have...,## Describe the bug\r\nAfter upgrading to data...
3,https://github.com/huggingface/datasets/issues...,Backwards compatibility broken for cached data...,"If it's easy enough to implement, then yes ple...",## Describe the bug\r\nAfter upgrading to data...


In [12]:
comments_dataset = Dataset.from_pandas(comments_df)

In [13]:
comments_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 2964
})

## Try to do this without pandas

In [14]:
issues_dataset.reset_format()

In [15]:
def batched_explode(examples):
    exploded_data = {"html_url": [], "title": [], "comment": [], "body": []}
    for html_url, title, comments, body in zip(examples["html_url"], examples["title"], examples["comments"], examples["body"]):
        for comment in comments: 
            exploded_data["html_url"].append(html_url)
            exploded_data["title"].append(title)
            exploded_data["comment"].append(comment)
            exploded_data["body"].append(body)
    return exploded_data

In [16]:
comments_dataset = issues_dataset.map(batched_explode, batched=True, remove_columns=["html_url", "title", "comments", "body"])

Map:   0%|          | 0/808 [00:00<?, ? examples/s]

In [17]:
comments_dataset

Dataset({
    features: ['html_url', 'title', 'body', 'comment'],
    num_rows: 2964
})

In [18]:
comments_dataset.set_format("pandas")
comments_dataset[:4]

Unnamed: 0,html_url,title,body,comment
0,https://github.com/huggingface/datasets/issues...,Protect master branch,After accidental merge commit (91c55355b634d0d...,"Cool, I think we can do both :)"
1,https://github.com/huggingface/datasets/issues...,Protect master branch,After accidental merge commit (91c55355b634d0d...,@lhoestq now the 2 are implemented.\r\n\r\nPle...
2,https://github.com/huggingface/datasets/issues...,Backwards compatibility broken for cached data...,## Describe the bug\r\nAfter upgrading to data...,Hi ! I guess the caching mechanism should have...
3,https://github.com/huggingface/datasets/issues...,Backwards compatibility broken for cached data...,## Describe the bug\r\nAfter upgrading to data...,"If it's easy enough to implement, then yes ple..."


In [19]:
comments_dataset.reset_format()

## Filter out short comments

In [20]:
comments_dataset = comments_dataset.map(lambda x: {"comment_length": [len(c.split(' ')) for c in x["comment"]]}, batched=True)

Map:   0%|          | 0/2964 [00:00<?, ? examples/s]

In [21]:
comments_dataset

Dataset({
    features: ['html_url', 'title', 'body', 'comment', 'comment_length'],
    num_rows: 2964
})

In [22]:
comments_dataset[0]

{'html_url': 'https://github.com/huggingface/datasets/issues/2945',
 'title': 'Protect master branch',
 'body': 'After accidental merge commit (91c55355b634d0dc73350a7ddee1a6776dbbdd69) into `datasets` master branch, all commits present in the feature branch were permanently added to `datasets` master branch history, as e.g.:\r\n- 00cc036fea7c7745cfe722360036ed306796a3f2\r\n- 13ae8c98602bbad8197de3b9b425f4c78f582af1\r\n- ...\r\n\r\nI propose to protect our master branch, so that we avoid we can accidentally make this kind of mistakes in the future:\r\n- [x] For Pull Requests using GitHub, allow only squash merging, so that only a single commit per Pull Request is merged into the master branch\r\n  - Currently, simple merge commits are already disabled\r\n  - I propose to disable rebase merging as well\r\n- ~~Protect the master branch from direct pushes (to avoid accidentally pushing of merge commits)~~\r\n  - ~~This protection would reject direct pushes to master branch~~\r\n  - ~~If s

In [23]:
comments_dataset = comments_dataset.filter(lambda x: x["comment_length"] > 15)

Filter:   0%|          | 0/2964 [00:00<?, ? examples/s]

In [24]:
comments_dataset

Dataset({
    features: ['html_url', 'title', 'body', 'comment', 'comment_length'],
    num_rows: 2162
})

## Add a new column with all text concatenated

In [25]:
def concat_text(example):
    return {"text": f"{example['title']} \n {example['body']} \n {example['comment']}"}

def batched_concat_text(examples):
    return {"text": [
        f"{title} \n {body} \n {comment}" for title, comment, body in zip(examples["title"], examples["comment"], examples["body"])
    ]}

In [26]:
comments_dataset.map(concat_text)

Map:   0%|          | 0/2162 [00:00<?, ? examples/s]

Dataset({
    features: ['html_url', 'title', 'body', 'comment', 'comment_length', 'text'],
    num_rows: 2162
})

In [27]:
comments_dataset = comments_dataset.map(batched_concat_text, batched=True)

Map:   0%|          | 0/2162 [00:00<?, ? examples/s]

In [28]:
comments_dataset

Dataset({
    features: ['html_url', 'title', 'body', 'comment', 'comment_length', 'text'],
    num_rows: 2162
})

# Text embeddings

In [29]:
device = torch.device("cuda")

In [30]:
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [31]:
model.to(device)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [32]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]


def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [33]:
comments_dataset["text"][0]

'Protect master branch \n After accidental merge commit (91c55355b634d0dc73350a7ddee1a6776dbbdd69) into `datasets` master branch, all commits present in the feature branch were permanently added to `datasets` master branch history, as e.g.:\r\n- 00cc036fea7c7745cfe722360036ed306796a3f2\r\n- 13ae8c98602bbad8197de3b9b425f4c78f582af1\r\n- ...\r\n\r\nI propose to protect our master branch, so that we avoid we can accidentally make this kind of mistakes in the future:\r\n- [x] For Pull Requests using GitHub, allow only squash merging, so that only a single commit per Pull Request is merged into the master branch\r\n  - Currently, simple merge commits are already disabled\r\n  - I propose to disable rebase merging as well\r\n- ~~Protect the master branch from direct pushes (to avoid accidentally pushing of merge commits)~~\r\n  - ~~This protection would reject direct pushes to master branch~~\r\n  - ~~If so, for each release (when we need to commit directly to the master branch), we should p

In [35]:
with torch.no_grad():
    # Quick test run
    embedding = get_embeddings(comments_dataset["text"][:2])

In [36]:
embedding.shape

torch.Size([2, 768])

In [37]:
# Free up GPU memory
del embedding
torch.cuda.empty_cache()

## Add embeddings to the data

### One at a time

In [38]:
# with torch.no_grad():
#     embeddings_dataset = comments_dataset.map(
#         lambda x: {"embeddings": get_embeddings(x["text"]).detach().cpu().numpy().squeeze()}
#     )
#     torch.cuda.empty_cache()

### Batched

In [39]:
def batched_get_embeddings_wrapper(examples):
    embeddings = [row for row in get_embeddings(examples["text"]).detach().cpu().numpy()]
    # Don't let GPU memory overload 
    torch.cuda.empty_cache()
    return {"embeddings": embeddings}

with torch.no_grad():
    embeddings_dataset = comments_dataset.map(
        # lambda x: {"embeddings": [row for row in get_embeddings(x["text"]).detach().cpu().numpy()]},
        batched_get_embeddings_wrapper,
        batched=True,
        batch_size=64,
    )

Map:   0%|          | 0/2162 [00:00<?, ? examples/s]

# FAISS similarity search

In [44]:
faiss_idx = "embeddings"
embeddings_dataset.add_faiss_index(column=faiss_idx)

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset({
    features: ['html_url', 'title', 'body', 'comment', 'comment_length', 'text', 'embeddings'],
    num_rows: 2162
})

## Run a query

In [48]:
question = "How can I load a dataset offline?"
with torch.no_grad():
    question_embedding = get_embeddings([question]).cpu().detach().numpy()

In [63]:
question_embedding.shape

(1, 768)

In [64]:
k = 5
scores, samples = embeddings_dataset.get_nearest_examples(
    faiss_idx,
    question_embedding,
    k=k
)

In [72]:
samples_df = (
    pd.DataFrame.from_dict(samples)
        .assign(score=scores)
        .sort_values(by="score", ascending=False)
        .reset_index(drop=True)
)

In [80]:
for _, row in samples_df.iterrows():
    print(f"{'*'*4} SCORE: {round(row.score, 3)} {'*'*4}\n")
    print(f"COMMENT: {row.comment}")
    print(f"TITLE: {row.title}")
    print(f"URL: {row.html_url}")
    print("=" * 50)
    print()

**** SCORE: 25.505 ****

COMMENT: Requiring online connection is a deal breaker in some cases unfortunately so it'd be great if offline mode is added similar to how `transformers` loads models offline fine.

@mandubian's second bullet point suggests that there's a workaround allowing you to use your offline (custom?) dataset with `datasets`. Could you please elaborate on how that should look like?
TITLE: Discussion using datasets in offline mode
URL: https://github.com/huggingface/datasets/issues/824

**** SCORE: 24.555 ****

COMMENT: The local dataset builders (csv, text , json and pandas) are now part of the `datasets` package since #1726 :)
You can now use them offline
```python
datasets = load_dataset('text', data_files=data_files)
```

We'll do a new release soon
TITLE: Discussion using datasets in offline mode
URL: https://github.com/huggingface/datasets/issues/824

**** SCORE: 24.149 ****

COMMENT: I opened a PR that allows to reload modules that have already been loaded once ev