# Preprocess Dataset
Load the dataset and prepare it for use with LLMs.

In [2]:
from google.colab import drive
import shutil

# Where the datasets live on Google Drive, and where the local Colab copy goes
source_path = "/content/drive/MyDrive/soen691/datasets"
destination_path = "/content/data"

# Attach Google Drive to the runtime, remounting even if it is already mounted
drive.mount('/content/drive', force_remount=True)

# Mirror the Drive folder locally; dirs_exist_ok allows the destination to exist already
shutil.copytree(src=source_path, dst=destination_path, dirs_exist_ok=True)

print("Files copied successfully!")

Mounted at /content/drive
Files copied successfully!


Read the train and test data from the JSON Lines files

In [3]:
import json

train_file_path = "/content/data/msg-train-merged.jsonl"
test_file_path = "/content/data/msg-test-5000-merged.jsonl"


def _read_jsonl(path):
    """Parse a JSON Lines file into a list of decoded objects.

    Args:
        path: Location of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The context manager closes the handle deterministically; iterating over a
    # bare open(...) left the file open until garbage collection.
    with open(path, 'r', encoding="utf-8") as handle:
        # Whitespace-only lines (e.g. a trailing newline at end of file) are
        # skipped because json.loads would reject them.
        return [json.loads(line) for line in handle if line.strip()]


train_json = _read_jsonl(train_file_path)
print(f"Train data length: {len(train_json)}")
print("\n-----------------\n")

test_json = _read_jsonl(test_file_path)
print(f"Test data length: {len(test_json)}")
print("\n-----------------\n")

Train data length: 117739

-----------------

Test data length: 5000

-----------------



Extract a smaller 500-case sample: the first 500 of the 5000 test cases

In [4]:
# Smaller evaluation sample: the leading 500 records of the loaded test set
test_json_500 = test_json[0:500]
print(
    f"Test data length: {len(test_json_500)}"
)

Test data length: 500


Store both the 500- and 5000-case test sets as dictionaries that map a hash of each record's content to the record itself. The gold messages will be used as the ground truth for evaluation.

In [11]:
from hashlib import sha256

def hash(value: str, counter=0) -> str:
    """Return the first 16 hex characters of the SHA-256 digest of ``value``.

    Args:
        value: Text to digest.
        counter: Disambiguation suffix. When it equals 0 the text is digested
            as-is; otherwise ``"{value}-{counter}"`` is digested instead.

    Returns:
        A 16-character lowercase hexadecimal string.
    """
    # Note: this name shadows the builtin hash() for the rest of the notebook.
    payload = value if counter == 0 else f"{value}-{counter}"
    return sha256(payload.encode()).hexdigest()[:16]

def store_hashed_items(items):
    """Index ``items`` by a content hash, giving every item its own key.

    Each item is serialised with ``json.dumps`` and passed to ``hash``. If the
    resulting key is already taken, the item is re-hashed with counter 1, 2, ...
    until an unused key is found.

    Args:
        items: Iterable of JSON-serialisable objects.

    Returns:
        A dict mapping each key to its item, in input order.
    """
    keyed_items = {}

    for record in items:
        # Serialise once; the same text feeds every retry for this record.
        serialized = json.dumps(record)
        attempt = 0
        digest = hash(serialized)

        # Bump the counter until the key is free
        while digest in keyed_items:
            attempt += 1
            digest = hash(serialized, attempt)

        keyed_items[digest] = record

    return keyed_items

# Key the 500-case subset and the full test set by content hash
test_json_500_hashed = store_hashed_items(test_json_500)
test_json_5000_hashed = store_hashed_items(test_json)

# Peek at the first (hash, record) pair, then report the size of the full mapping
print([*test_json_500_hashed.items()][0])
print(len(test_json_5000_hashed))

('033b0baff52bb483', {'patch': '@@ -191,7 +191,10 @@ public class FindFiles {\n       Snapshot snapshot = snapshotId != null ?\n           ops.current().snapshot(snapshotId) : ops.current().currentSnapshot();\n \n-      CloseableIterable<ManifestEntry> entries = new ManifestGroup(ops, snapshot.manifests())\n+      // snapshot could be null when the table just gets created\n+      Iterable<ManifestFile> manifests = (snapshot != null) ? snapshot.manifests() : CloseableIterable.empty();\n+\n+      CloseableIterable<ManifestEntry> entries = new ManifestGroup(ops, manifests)\n           .filterData(rowFilter)\n           .filterFiles(fileFilter)\n           .filterPartitions(partitionFilter)', 'callgraph': '[FindFiles->[Builder->[asOfTime->[inSnapshot],inPartitions->[inPartitions]]]]', 'summary': 'Collect all data files in the manifest.', 'msg': "If there are no manifests, then entries should be `CloseableIterable.empty()`, not the manifest iterable. That doesn't need to be closeable."})
50

Dump to JSON, save to Drive, and upload to Hugging Face (HF)

In [12]:
import io
import os

def _make_w_io_base(f, mode: str):
    if not isinstance(f, io.IOBase):
        f_dirname = os.path.dirname(f)
        if f_dirname != "":
            os.makedirs(f_dirname, exist_ok=True)
        f = open(f, mode=mode, encoding="utf-8")
    return f


def _make_r_io_base(f, mode: str):
    if not isinstance(f, io.IOBase):
        f = open(f, mode=mode, encoding="utf-8")
    return f


def jdump(obj, f: str, mode="w", indent=4, default=str):
    """Write ``obj`` to ``f`` as JSON (dict/list) or as raw text (str).

    Args:
        obj: A dict or list to serialise with ``json.dump``, or a string to
            write verbatim.
        f: Destination path, or an already open ``io.IOBase`` handle.
        mode: Mode used when ``f`` is a path and has to be opened.
        indent: Indentation passed to ``json.dump``.
        default: Fallback serialiser passed to ``json.dump`` for objects it
            cannot encode natively.

    Raises:
        ValueError: If ``obj`` is not a dict, list, or str.
    """
    # Validate first so an unsupported object never creates or truncates the
    # destination file and never leaves an opened handle behind.
    if not isinstance(obj, (dict, list, str)):
        raise ValueError(f"Unexpected type: {type(obj)}")

    f = _make_w_io_base(f, mode)
    try:
        if isinstance(obj, str):
            f.write(obj)
        else:
            json.dump(obj, f, indent=indent, default=default)
    finally:
        # Close even when serialisation fails. As before, a handle passed in by
        # the caller is closed as well.
        f.close()

# Write both hashed test sets to the datasets folder on Drive
for hashed_items, out_path in (
    (test_json_5000_hashed, "/content/drive/MyDrive/soen691/datasets/test_json_5000_hashed.json"),
    (test_json_500_hashed, "/content/drive/MyDrive/soen691/datasets/test_json_500_hashed.json"),
):
    jdump(hashed_items, out_path)

In [13]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [14]:
from datasets import load_dataset, Dataset, DatasetDict

# Log in to the Hugging Face Hub before the push_to_hub uploads further down.
# Remember to set HF_TOKEN in colab secrets for upload rights
# NOTE(review): the recorded run shows this command prompting for a token
# interactively; whether the HF_TOKEN secret is picked up automatically is not
# visible here — confirm.
!huggingface-cli login

def upload_test_to_huggingface(source: dict, repo_name: str, is_test: bool = False):
    """Push a hash-keyed mapping to a Hugging Face dataset repository.

    Every ``(key, item)`` pair of ``source`` becomes one row with the columns
    ``"hash"`` and ``"value"``.

    Args:
        source: Mapping from hash key to the item stored under it.
        repo_name: Repository id handed to ``push_to_hub``.
        is_test: When true, the rows are wrapped in a ``DatasetDict`` under the
            ``"test"`` key before pushing; otherwise the bare ``Dataset`` is pushed.
    """
    rows = [{"hash": digest, "value": record} for digest, record in source.items()]
    payload = Dataset.from_list(rows)
    if is_test:
        payload = DatasetDict({"test": payload})
    payload.push_to_hub(repo_name)


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `CLI` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `CLI`


In [15]:
# Publish both hashed test sets, each as the "test" split of its own dataset repo
for hashed_items, repo_id in (
    (test_json_5000_hashed, "dbaeka/soen_691_msg_test_5000_hashed"),
    (test_json_500_hashed, "dbaeka/soen_691_msg_test_500_hashed"),
):
    upload_test_to_huggingface(hashed_items, repo_id, True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/455 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/451 [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Save Gold Text to Drive

In [21]:
# Reduce every hashed record to its "msg" field, keeping the same hash keys
test_gold_5000_hashed = {
    digest: record["msg"] for digest, record in test_json_5000_hashed.items()
}
test_gold_500_hashed = {
    digest: record["msg"] for digest, record in test_json_500_hashed.items()
}

# Peek at the first (hash, message) pair
print([*test_gold_500_hashed.items()][0])

('033b0baff52bb483', "If there are no manifests, then entries should be `CloseableIterable.empty()`, not the manifest iterable. That doesn't need to be closeable.")


In [22]:
# Write both gold mappings to the evaluation folder on Drive
for gold_items, out_path in (
    (test_gold_5000_hashed, "/content/drive/MyDrive/soen691/evaluation/test_gold_5000_hashed.json"),
    (test_gold_500_hashed, "/content/drive/MyDrive/soen691/evaluation/test_gold_500_hashed.json"),
):
    jdump(gold_items, out_path)

Upload Train Data to HF

In [None]:
# Convert the loaded training records to a Dataset and push it to the Hub.
# The push stays the cell's last expression so its return value is displayed.
hf_dataset = Dataset.from_list(
    train_json
)
hf_dataset.push_to_hub(
    "dbaeka/soen_691_msg_train"
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/118 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/dbaeka/soen_691_msg_train/commit/b106e53dcd0d8e6aa5ce1f56b650c60357c019bc', commit_message='Upload dataset', commit_description='', oid='b106e53dcd0d8e6aa5ce1f56b650c60357c019bc', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/dbaeka/soen_691_msg_train', endpoint='https://huggingface.co', repo_type='dataset', repo_id='dbaeka/soen_691_msg_train'), pr_revision=None, pr_num=None)