In [1]:
import pandas as pd
import json

In [None]:
# Parse the JSON-lines file: one JSON document per line, collected in order.
lines = []

# JSON text is UTF-8; pass the encoding explicitly so the platform's locale
# default cannot mis-decode non-ASCII characters.
with open("comments2", "r", encoding="utf-8") as f:
    for line in f:
        # Skip whitespace-only lines (e.g. a trailing blank line), which
        # json.loads would otherwise reject with a JSONDecodeError.
        if not line.strip():
            continue
        lines.append(json.loads(line))

In [None]:
# Build a DataFrame from the parsed records (rebinds `lines` from list to frame)
lines = pd.DataFrame(data=lines)

In [None]:
# Keep only the two columns used downstream: the comment text and its score
text_columns = ["body_cleaned", "score"]
lines_ = lines[text_columns]

In [None]:
# Drop every row that is missing either the comment text or the score
has_text = lines_["body_cleaned"].notnull()
has_score = lines_["score"].notnull()
lines_ = lines_[has_text & has_score]

In [None]:
# Prefix each comment with its score, so every entry reads
# "score: {score of comment}\n{comment text}".
# `assign` returns a new frame instead of writing a column into a frame that
# was itself produced by indexing another one, which is what triggers pandas'
# SettingWithCopyWarning.
# NOTE(review): if "score" is stored as float (pandas does this when the raw
# column contained nulls), astype(str) renders "5.0" rather than "5" -- confirm
# the intended format.
lines_ = lines_.assign(
    body_cleaned="score: " + lines_["score"].astype(str) + "\n" + lines_["body_cleaned"]
)

In [None]:
# Pull the score-prefixed comment bodies out of the frame as a flat array
body_column = lines_["body_cleaned"]
text = body_column.values

In [None]:
# Wrap the texts in a Hugging Face dataset with a single "text" column
from datasets import Dataset

columns = {"text": text}
dataset = Dataset.from_dict(columns)

In [None]:
from transformers import AutoTokenizer

# Tokenizer is fetched by its Hugging Face checkpoint name
tokenizer_checkpoint = "gmongaras/wizardLM-7B-HF-8bit"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

# Length (in tokens) of every encoded example produced by map_function
max_length = 128

In [None]:
# Per-example tokenization function, applied to the dataset with `Dataset.map`
def map_function(example):
    """Tokenize one example into fixed-length training features.

    Reads the notebook-level ``tokenizer`` and ``max_length``.

    Args:
        example: Mapping with a ``"text"`` entry holding the string to encode.

    Returns:
        dict of three lists, each ``max_length`` long:
            ``input_ids``: the tokenizer output (truncated/padded to
                ``max_length - 1``) followed by one extra pad token.
            ``attention_mask``: 1 for the first ``n + 1`` positions, where
                ``n`` is the number of positions the tokenizer marked as
                attended, and 0 for the rest.
            ``labels``: ``input_ids`` where the mask is 1, and -100 elsewhere
                so those positions are masked out of the loss.

    Raises:
        AssertionError: if either sequence does not end up ``max_length`` long.
    """
    text = example["text"]

    # Encode to max_length - 1 tokens, leaving room for one appended pad token
    encoded = tokenizer(text, max_length=max_length - 1, truncation=True, padding="max_length")

    # Add on a pad token to the end of the input_ids
    input_ids = encoded["input_ids"] + [tokenizer.pad_token_id]

    # Attend to the real tokens plus one more position, because we want the
    # model to stop itself.
    # NOTE(review): putting the ones at the front assumes the tokenizer pads
    # on the right -- confirm tokenizer.padding_side.
    num_attended = sum(encoded["attention_mask"]) + 1
    attention_mask = [1] * num_attended + [0] * (max_length - num_attended)
    assert len(attention_mask) == max_length and len(input_ids) == max_length, \
        "Attention mask or input_ids is not the correct length"

    # The labels are the input ids, with the loss masked (-100) on every
    # position the attention mask excludes. The mask is a plain list, so it
    # is paired positionally with input_ids rather than indexed by key.
    labels = [
        token_id if attended == 1 else -100
        for token_id, attended in zip(input_ids, attention_mask)
    ]

    return {
        "input_ids": input_ids,
        "labels": labels,
        "attention_mask": attention_mask
    }
# Run map_function over the dataset, one example per call
dataset = dataset.map(function=map_function)

                                                                         

In [None]:
# Drop the raw "text" column from the dataset
raw_columns = ["text"]
dataset = dataset.remove_columns(raw_columns)

In [None]:
# Write the dataset to a local directory, then push it to the Hugging Face Hub
local_dataset_dir = "reddit_political_2019"
hub_repo_id = "gmongaras/reddit_political_2019"

dataset.save_to_disk(local_dataset_dir)
dataset.push_to_hub(hub_repo_id)

In [None]:
from datasets import load_from_disk

# Reload the dataset from the local "reddit_political_2019" directory
saved_dataset_dir = "reddit_political_2019"
dataset = load_from_disk(saved_dataset_dir)