This notebook follows the Hugging Face LLM course tutorial page:

https://huggingface.co/learn/llm-course/chapter5/6

# Load dataset

In [1]:
from datasets import load_dataset, Dataset



In [2]:
# Fetch the "train" split of the GitHub issues dataset.
issues_dataset = load_dataset(path="lewtun/github-issues", split="train")
issues_dataset

Repo card metadata block was not found. Setting CardData to empty.


Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 3019
})

In [3]:
# Keep genuine issues only: drop pull requests and rows with no comments.
issues_dataset = issues_dataset.filter(
    function=lambda row: (row['is_pull_request'] == False)
    and (len(row['comments']) > 0)
)
issues_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 808
})

In [4]:
# Keep only the columns used downstream and drop everything else.
columns = issues_dataset.column_names
columns_to_keep = ['title', 'body', 'html_url', 'comments']
# Plain set difference (dataset columns minus the ones to keep). A symmetric
# difference would also pick up any name in columns_to_keep that is missing
# from the dataset, and remove_columns would then be asked to drop a column
# that does not exist.
columns_to_remove = sorted(set(columns) - set(columns_to_keep))
issues_dataset = issues_dataset.remove_columns(columns_to_remove)
issues_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 808
})

In [5]:
# for x in issues_dataset.shuffle(seed=42).select(range(10)):
#     print(x)

In [6]:
# Display the dataset summary again (columns and row count after pruning).
issues_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 808
})

In [7]:
# Convert to pandas.
# to_pandas() hands back the rows as a DataFrame without switching the output
# format of issues_dataset itself, so re-running any earlier cell still sees
# the dataset in its regular format (set_format('pandas') changed it in place).
df = issues_dataset.to_pandas()
df

Unnamed: 0,html_url,title,comments,body
0,https://github.com/huggingface/datasets/issues...,Protect master branch,"[Cool, I think we can do both :), @lhoestq now...",After accidental merge commit (91c55355b634d0d...
1,https://github.com/huggingface/datasets/issues...,Backwards compatibility broken for cached data...,[Hi ! I guess the caching mechanism should hav...,## Describe the bug\r\nAfter upgrading to data...
2,https://github.com/huggingface/datasets/issues...,OSCAR unshuffled_original_ko: NonMatchingSplit...,[I tried `unshuffled_original_da` and it is al...,## Describe the bug\r\n\r\nCannot download OSC...
3,https://github.com/huggingface/datasets/issues...,load_dataset using default cache on Windows ca...,"[Hi @daqieq, thanks for reporting.\r\n\r\nUnfo...",## Describe the bug\r\nStandard process to dow...
4,https://github.com/huggingface/datasets/issues...,to_tf_dataset keeps a reference to the open da...,"[I did some investigation and, as it seems, th...",To reproduce:\r\n```python\r\nimport datasets ...
...,...,...,...,...
803,https://github.com/huggingface/datasets/issues/6,Error when citation is not given in the Datase...,[Yes looks good to me.\r\nNote that we may ref...,The following error is raised when the `citati...
804,https://github.com/huggingface/datasets/issues/5,ValueError when a split is empty,[To fix this I propose to modify only the file...,"When a split is empty either TEST, VALIDATION ..."
805,https://github.com/huggingface/datasets/issues/4,[Feature] Keep the list of labels of a dataset...,[Yes! I see mostly two options for this:\r\n- ...,It would be useful to keep the list of the lab...
806,https://github.com/huggingface/datasets/issues/3,[Feature] More dataset outputs,[Yes!\r\n- pandas will be a one-liner in `arro...,Add the following dataset outputs:\r\n\r\n- Sp...


In [8]:
# One row per comment: unpack each issue's comment list into separate rows,
# renumbering the index from zero.
comments_df = df.explode(column="comments", ignore_index=True)
comments_df.head()

Unnamed: 0,html_url,title,comments,body
0,https://github.com/huggingface/datasets/issues...,Protect master branch,"Cool, I think we can do both :)",After accidental merge commit (91c55355b634d0d...
1,https://github.com/huggingface/datasets/issues...,Protect master branch,@lhoestq now the 2 are implemented.\r\n\r\nPle...,After accidental merge commit (91c55355b634d0d...
2,https://github.com/huggingface/datasets/issues...,Backwards compatibility broken for cached data...,Hi ! I guess the caching mechanism should have...,## Describe the bug\r\nAfter upgrading to data...
3,https://github.com/huggingface/datasets/issues...,Backwards compatibility broken for cached data...,"If it's easy enough to implement, then yes ple...",## Describe the bug\r\nAfter upgrading to data...
4,https://github.com/huggingface/datasets/issues...,Backwards compatibility broken for cached data...,Well it can cause issue with anyone that updat...,## Describe the bug\r\nAfter upgrading to data...


In [9]:
# Wrap the exploded DataFrame in a Hugging Face Dataset again.
comments_dataset = Dataset.from_pandas(df=comments_df)
comments_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 2964
})

In [10]:
# Record each comment's whitespace-separated word count, then keep only the
# comments with more than 15 words.
comments_dataset = (
    comments_dataset
    .map(lambda row: {'comment_length': len(row['comments'].split())})
    .filter(lambda row: row['comment_length'] > 15)
)
comments_dataset

Map:   0%|          | 0/2964 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2964 [00:00<?, ? examples/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length'],
    num_rows: 2175
})

In [11]:
def concat_text(examples):
    """Build the 'text' field from an example's title, body and comment.

    The three strings are joined in that order with ' \\n ' between them.

    Args:
        examples: mapping with string values under 'title', 'body' and
            'comments'.

    Returns:
        A dict with a single key, 'text', holding the joined string.
    """
    pieces = (examples['title'], examples['body'], examples['comments'])
    return {'text': ' \n '.join(pieces)}

# Add the combined 'text' column to every row.
comments_dataset = comments_dataset.map(function=concat_text)
comments_dataset

Map:   0%|          | 0/2175 [00:00<?, ? examples/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length', 'text'],
    num_rows: 2175
})

# Create text embeddings
Use a `sentence-transformers` checkpoint, loaded through the `transformers` `AutoTokenizer` and `AutoModel` classes.

In [12]:
from transformers import AutoTokenizer, AutoModel
import torch

In [13]:
# One checkpoint name drives both the encoder and its tokenizer.
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
model = AutoModel.from_pretrained(model_ckpt)
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [14]:
def cls_pooling(model_output):
    """Return the hidden state at sequence position 0 for every batch item.

    Args:
        model_output: object exposing a `last_hidden_state` attribute that
            supports `[:, 0]` indexing.

    Returns:
        `last_hidden_state[:, 0]`.
    """
    hidden_states = model_output.last_hidden_state
    return hidden_states[:, 0]

In [15]:
def get_embeddings(text_list):
    """Embed text using the notebook-level `tokenizer` and `model`.

    Args:
        text_list: a string or a list of strings, handed to the tokenizer
            unchanged (padded and truncated, returned as PyTorch tensors).

    Returns:
        The result of `cls_pooling` applied to the model output.
    """
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    # Inference only: disable autograd so no computation graph is recorded
    # for each embedded example. Callers that still call .detach() on the
    # result keep working.
    with torch.no_grad():
        model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [37]:
# embedding = get_embeddings(comments_dataset['text'][0])
# embedding.shape

In [17]:
# Cast to float32 before storing: per the original note, FAISS crashes otherwise.
# Each row is embedded as a one-element batch, so take item 0 of the result.
embeddings_dataset = comments_dataset.map(
    lambda row: {
        'embeddings': get_embeddings(row['text']).detach().numpy()[0].astype('float32')
    }
)

Map:   0%|          | 0/2175 [00:00<?, ? examples/s]

# Use FAISS for efficient similarity search

In [30]:
# Attach a FAISS index built over the 'embeddings' column; the call's return
# value is what the cell displays.
embeddings_dataset.add_faiss_index(column='embeddings')

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length', 'text', 'embeddings'],
    num_rows: 2175
})

In [31]:
# Embed the query as a one-element batch and convert it to a NumPy array.
question = "How can I load a dataset offline?"
question_embedding = get_embeddings([question]).detach().cpu().numpy()
question_embedding.shape

(1, 768)

In [None]:
# scores, samples = embeddings_dataset.get_nearest_examples(
#     'embeddings', question_embedding, k=1
# )

### Debug: check embedding dtype and dimension

In [20]:
# dtype of the query vector that will be handed to the nearest-neighbour search.
question_embedding.dtype

dtype('float32')

In [21]:
import numpy as np

# A row read back from the dataset here is a plain Python list, so wrapping it
# in np.array() builds a float64 array from Python floats and says nothing
# about the dtype the column is stored with.
print(type(embeddings_dataset[0]['embeddings']))
print(np.array(embeddings_dataset[0]['embeddings']).dtype)
# The column's declared feature type is the reliable place to read the stored dtype.
print(embeddings_dataset.features['embeddings'])

<class 'list'>
float64


In [22]:
# Length of the first row's stored embedding vector.
expected_dim = len(embeddings_dataset[0]['embeddings'])
expected_dim

768

In [29]:
# NOTE(review): the row comes back as a Python list, so NumPy builds a float64
# array from it; this presumably does not reflect the stored column dtype.
# Confirm against embeddings_dataset.features.
np.array(embeddings_dataset[0]['embeddings']).dtype

dtype('float64')

In [35]:
# Convert all embeddings to float32
def convert_to_float32(example):
    """Replace an example's 'embeddings' value with a float32 NumPy array.

    The previous version called `.tolist()` on the float32 array, which turns
    every element back into a Python float and so discards the float32 dtype
    before the value is returned.

    Args:
        example: mapping with a numeric sequence under 'embeddings'.

    Returns:
        The same mapping, with 'embeddings' now a 1-D `np.float32` array.
    """
    example['embeddings'] = np.array(example['embeddings'], dtype=np.float32)
    return example

# Run the conversion over every row of the dataset.
embeddings_dataset_new = embeddings_dataset.map(function=convert_to_float32)

# Build a FAISS index on the converted column.
# embeddings_dataset_new.drop_index('embeddings')  # Remove old index
embeddings_dataset_new.add_faiss_index('embeddings')

Map:   0%|          | 0/2175 [00:00<?, ? examples/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length', 'text', 'embeddings'],
    num_rows: 2175
})

In [36]:
# NumPy infers float64 when given a list of Python floats, so if this row is
# read back as a plain list (presumably, as with the original dataset), the
# line below reports float64 however the column is stored.
print(np.array(embeddings_dataset_new[0]['embeddings']).dtype)
# The declared feature type shows the dtype the column is actually stored with.
print(embeddings_dataset_new.features['embeddings'])

float64
