In [35]:
# Import necessary libraries
import os
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import nltk
from nltk.corpus import stopwords
from datasets import load_dataset
from transformers import AutoTokenizer
import torch
from tqdm import tqdm

# Settings
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 12

# Download NLTK data if needed
try:
    nltk.data.find('stopwords')
except LookupError:
    nltk.download('stopwords')
    
print("Libraries imported successfully!")

Libraries imported successfully!


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [36]:
import pickle
import numpy as np

def load_pkl(path):
    with open(path, "rb") as f:
        return pickle.load(f)

def save_pkl(obj, path):
    with open(path, "wb") as f:
        pickle.dump(obj, f)

def sample_one_third(data_dict, seed=42):
    """Randomly sample 1/3 of the dataset."""
    n = len(data_dict["texts"])
    k = n // 3
    np.random.seed(seed)
    idx = np.random.choice(n, k, replace=False)
    
    # subset each key
    new_data = {k: [v[i] for i in idx] for k, v in data_dict.items()}
    return new_data

# paths
root = "../../data/processed/"
files = ["sst2_train.pkl", "sst2_test.pkl", "sst2_val.pkl"]

for f in files:
    data = load_pkl(root + f)
    sampled_data = sample_one_third(data)

    n = len(sampled_data["texts"])
    sampled_data["category"] = ["movies"] * n  # or "movie_review"

    # save sampled version
    out_path = root + f.replace(".pkl", "_one_third.pkl")
    save_pkl(sampled_data, out_path)
    
    print(f"Sampled 1/3 from {f} → saved to {out_path}, size = {n}, category = movies")


Sampled 1/3 from sst2_train.pkl → saved to ../../data/processed/sst2_train_one_third.pkl, size = 18192, category = movies
Sampled 1/3 from sst2_test.pkl → saved to ../../data/processed/sst2_test_one_third.pkl, size = 2274, category = movies
Sampled 1/3 from sst2_val.pkl → saved to ../../data/processed/sst2_val_one_third.pkl, size = 2274, category = movies


In [41]:
from datasets import load_dataset
import numpy as np
import pickle

def save_pkl(obj, path):
    with open(path, "wb") as f:
        pickle.dump(obj, f)

# target sizes from SST-2 sample
SIZES = {
    "train": 18192,
    "internal_val": 2274,
    "val": 2274
}

# load amazon_polarity
dataset = load_dataset("mteb/amazon_polarity")["train"]

print("Total Amazon rows:", len(dataset))

# shuffle indices for reproducibility
rng = np.random.default_rng(seed=42)
indices = rng.permutation(len(dataset))

# sampling function
def build_amazon_split(n, start_idx):
    idx = indices[start_idx:start_idx + n]
    subset = dataset.select(idx.tolist())

    texts = subset["text"]
    labels = subset["label"]
    label_texts = subset["label_text"]

    data = {
        "texts": texts,
        "processed_texts": texts,  # raw text retained for now
        "labels": labels,          # KEEP sentiment label
        "label_texts": label_texts, # KEEP sentiment text label
        "text_lengths": [len(t.split()) for t in texts],
        "word_counts":  [len(t.split()) for t in texts],
        "category": ["online_shopping"] * len(texts)
    }

    return data

root = "../../data/processed/"

# sequential index allocation
start = 0

amazon_train = build_amazon_split(SIZES["train"], start)
save_pkl(amazon_train, root + "amazon_train.pkl")
start += SIZES["train"]

amazon_internal = build_amazon_split(SIZES["internal_val"], start)
save_pkl(amazon_internal, root + "amazon_internal_val.pkl")
start += SIZES["internal_val"]

amazon_val = build_amazon_split(SIZES["val"], start)
save_pkl(amazon_val, root + "amazon_val.pkl")

print("DONE:")
print("amazon_train.pkl:", len(amazon_train["texts"]))
print("amazon_internal_val.pkl:", len(amazon_internal["texts"]))
print("amazon_val.pkl:", len(amazon_val["texts"]))


Total Amazon rows: 3599994
DONE:
amazon_train.pkl: 18192
amazon_internal_val.pkl: 2274
amazon_val.pkl: 2274


In [42]:
import re

def preprocess_text(text, lowercase=True, normalize_whitespace=True):
    """
    Preprocess text for sentiment classification.
    """
    if normalize_whitespace:
        text = ' '.join(text.split())
    
    if lowercase:
        text = text.lower()

    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    
    # Remove URLs
    text = re.sub(r'http\S+|www.\S+', '', text)
    
    # Normalize repeated punctuation
    text = re.sub(r'([!?.]){2,}', r'\1', text)
    
    # Remove extra spaces
    text = ' '.join(text.split())
    
    return text


In [43]:
import pickle

def load_pkl(path):
    with open(path, "rb") as f:
        return pickle.load(f)

def save_pkl(obj, path):
    with open(path, "wb") as f:
        pickle.dump(obj, f)


# Paths
root = "../../data/processed/"
files = {
    "amazon_train.pkl": "amazon_train_preprocessed.pkl",
    "amazon_internal_val.pkl": "amazon_internal_val_preprocessed.pkl",
    "amazon_val.pkl": "amazon_val_preprocessed.pkl"
}

for src, dst in files.items():
    print(f"\nProcessing {src} ...")
    
    data = load_pkl(root + src)
    
    texts = data["texts"]  # original texts
    
    # Apply preprocessing
    processed = [preprocess_text(t) for t in texts]
    
    # Add new field
    data["processed_texts"] = processed
    
    # Save updated version
    save_pkl(data, root + dst)



Processing amazon_train.pkl ...

Processing amazon_internal_val.pkl ...

Processing amazon_val.pkl ...


In [45]:
def load_pkl(path):
    with open(path, "rb") as f:
        return pickle.load(f)

def save_pkl(obj, path):
    with open(path, "wb") as f:
        pickle.dump(obj, f)

# Desired sizes (same as SST-2 one-third)
TARGET_SIZES = {
    "train": 18192,
    "internal_val": 2274,
    "val": 2274
}

root = "../../data/processed/"

# Input Yelp files
files = {
    "train": root + "yelp_train.pkl",
    "internal_val": root + "yelp_test.pkl",
    "val": root + "yelp_val.pkl"
}

# Output Yelp sampled files
out_files = {
    "train": root + "yelp_train_one_third.pkl",
    "internal_val": root + "yelp_internal_val_one_third.pkl",
    "val": root + "yelp_val_one_third.pkl"
}

# Sampling function
def sample_dataset(data, n, seed=42):
    total = len(data["texts"])
    if n > total:
        raise ValueError(f"Requested {n} samples, but dataset only has {total}!")

    rng = np.random.default_rng(seed)
    idx = rng.choice(total, n, replace=False)

    # subset all keys
    sampled = {k: [v[i] for i in idx] for k, v in data.items()}
    sampled["category"] = ["local_business_review"] * n
    return sampled

# Perform sampling
for split in ["train", "internal_val", "val"]:
    print(f"\nProcessing {split} …")

    data = load_pkl(files[split])
    n = TARGET_SIZES[split]

    sampled_data = sample_dataset(data, n)
    save_pkl(sampled_data, out_files[split])

    print(f"Saved {split}: {n} samples → {out_files[split]}")



Processing train …
Saved train: 18192 samples → ../../data/processed/yelp_train_one_third.pkl

Processing internal_val …
Saved internal_val: 2274 samples → ../../data/processed/yelp_internal_val_one_third.pkl

Processing val …
Saved val: 2274 samples → ../../data/processed/yelp_val_one_third.pkl


In [46]:
def load_pkl(path):
    with open(path, "rb") as f:
        return pickle.load(f)

def save_pkl(obj, path):
    with open(path, "wb") as f:
        pickle.dump(obj, f)

root = "../../data/processed/"

files = [
    "yelp_train_one_third.pkl",
    "yelp_internal_val_one_third.pkl",
    "yelp_val_one_third.pkl",
]

for fname in files:
    path = root + fname
    
    data = load_pkl(path)
    
    n = len(data["texts"])
    data["category"] = ["local_business_review"] * n
    
    save_pkl(data, path)
    
    print(f"Added category=local_business_review → {fname} (size={n})")


Added category=local_business_review → yelp_train_one_third.pkl (size=18192)
Added category=local_business_review → yelp_internal_val_one_third.pkl (size=2274)
Added category=local_business_review → yelp_val_one_third.pkl (size=2274)


In [48]:
import pickle
from pathlib import Path

root = Path("../../data/processed/")
out_root = Path("../../data/multi/")
out_root.mkdir(exist_ok=True)

# ---------------------------------------------------
# Helper: load pickle
# ---------------------------------------------------
def load_pkl(path):
    with open(path, "rb") as f:
        return pickle.load(f)

# ---------------------------------------------------
# Helper: save pickle
# ---------------------------------------------------
def save_pkl(obj, path):
    with open(path, "wb") as f:
        pickle.dump(obj, f)
    print(f"Saved → {path} (size={len(obj['texts'])})")

# ---------------------------------------------------
# Helper: normalize structure
# ---------------------------------------------------
def to_list(x, length=None):
    """Convert HuggingFace column / numpy array / scalar to Python list."""
    if isinstance(x, list):
        return x
    try:
        return list(x)
    except TypeError:
        return [x] * length


def normalize(d, domain_name):
    """Ensure all required fields exist and are lists."""

    d = dict(d)
    n = len(d["texts"])

    # Convert all fields into lists
    for key, val in d.items():
        d[key] = to_list(val, length=n)

    # Force category
    d["category"] = [domain_name] * n

    # Optional fields
    d["token_lengths"] = to_list(d.get("token_lengths", [None] * n), length=n)
    d["label_texts"] = to_list(d.get("label_texts", [None] * n), length=n)

    return d

# ---------------------------------------------------
# Merge utility (FORCE MERGE SPECIFIC FIELDS)
# ---------------------------------------------------
def merge_three(d1, d2, d3):

    keys_to_merge = [
        "texts",
        "processed_texts",
        "labels",
        "word_counts",
        "category"
    ]

    out = {}
    for key in keys_to_merge:
        out[key] = d1[key] + d2[key] + d3[key]

    return out


# ---------------------------------------------------
# Load dataset paths
# ---------------------------------------------------
files = {
    "train": {
        "movie": root/"sst2_train_one_third.pkl",
        "amazon": root/"amazon_train.pkl",
        "yelp": root/"yelp_train_one_third.pkl",
    },
    "internal_val": {
        "movie": root/"sst2_internal_val_one_third.pkl",
        "amazon": root/"amazon_internal_val.pkl",
        "yelp": root/"yelp_internal_val_one_third.pkl",
    },
    "val": {
        "movie": root/"sst2_val_one_third.pkl",
        "amazon": root/"amazon_val.pkl",
        "yelp": root/"yelp_val_one_third.pkl",
    }
}

# ---------------------------------------------------
# Build multi-domain datasets
# ---------------------------------------------------
def build_split(split_name):
    print(f"\n=== Building {split_name} split ===")

    movie = normalize(load_pkl(files[split_name]["movie"]), "movie_review")
    amazon = normalize(load_pkl(files[split_name]["amazon"]), "online_shopping")
    yelp = normalize(load_pkl(files[split_name]["yelp"]), "local_business_review")

    merged = merge_three(movie, amazon, yelp)

    save_pkl(merged, out_root / f"multi_{split_name}.pkl")


# Build all splits
build_split("train")
build_split("internal_val")
build_split("val")

print("\nAll multi-domain datasets completed!")



=== Building train split ===
Saved → ..\..\data\multi\multi_train.pkl (size=54576)

=== Building internal_val split ===
Saved → ..\..\data\multi\multi_internal_val.pkl (size=6822)

=== Building val split ===
Saved → ..\..\data\multi\multi_val.pkl (size=6822)

All multi-domain datasets completed!


In [52]:
import pickle
from collections import Counter

def load_pkl(path):
    with open(path, "rb") as f:
        return pickle.load(f)

split_train = out_root/"multi_train.pkl"
split_val   = out_root/"multi_val.pkl"
split_test  = out_root/"multi_test.pkl"

def show_label_dist(name, path):
    data = load_pkl(path)   
    labels = data["labels"]
    counter = Counter(labels)
    total = len(labels)
    
    print(f"\n=== {name} Label Distribution (n={total}) ===")
    for lbl, cnt in sorted(counter.items()):
        print(f"Label {lbl}: {cnt} ({cnt/total:.2%})")

# call functions
show_label_dist("Train", split_train)
show_label_dist("Val", split_val)
show_label_dist("Test", split_test)



=== Train Label Distribution (n=54576) ===
Label 0: 26258 (48.11%)
Label 1: 28318 (51.89%)

=== Val Label Distribution (n=6822) ===
Label 0: 3244 (47.55%)
Label 1: 3578 (52.45%)

=== Test Label Distribution (n=6822) ===
Label 0: 3264 (47.85%)
Label 1: 3558 (52.15%)
