## Setup

### Importing Dependencies

In [1]:
# Data saving and seeding (standard library)
import os
import pickle
import random

# For loading dataset from HuggingFace
from datasets import load_dataset

# For creating one-hot encodings of positive/negative instances
import torch

### Fixing Random Seeds for Reproducibility

In [2]:
seed = 42

# PYTHONHASHSEED is only read when an interpreter starts, so setting it here
# affects subprocesses spawned later but NOT the hash randomisation of the
# kernel that is already running. It is kept for those child processes.
os.environ['PYTHONHASHSEED'] = str(seed)

# Seed the random number generators that can actually be controlled at runtime
random.seed(seed)
torch.manual_seed(seed)

### Making Project Folders to Store Results

In [3]:
# Every *_dir below ends with a trailing slash so that a file name can be
# appended directly (e.g. data_dir + 'train.pickle').
proj_dir = "./QLens/" # Personalize if needed

data_dir = proj_dir + "Datasets/"

model_dir = proj_dir + "Model_Checkpoints/"
lens_dir = proj_dir + "Lens_Checkpoints/"

figures_dir = proj_dir + "Figures/"
# Derived from figures_dir (instead of re-spelling "Figures/") and given the
# same trailing slash as the other directories; without it,
# atten_figures_dir + "plot.png" would produce "Attention_Layerplot.png".
atten_figures_dir = figures_dir + "Attention_Layer/"
mlp_figures_dir = figures_dir + "MLP_Layer/"

# exist_ok makes this cell safe to re-run
for directory in (data_dir, model_dir, lens_dir, figures_dir, atten_figures_dir, mlp_figures_dir):
  os.makedirs(directory, exist_ok = True)

## Dataset Preprocessing

In [4]:
# Fetching the Sentihood dataset (all of its splits) from the HuggingFace Hub
ds = load_dataset(path = "bhavnicksm/sentihood")

In [5]:
# Shuffling dataset with the notebook-wide seed (42, set in the seeding cell above).
# NOTE: `ds` is reassigned here, so re-running this cell without re-running the
# loading cell shuffles the already-shuffled data and yields a different order.
ds = ds.shuffle(seed=seed)

# Deriving input/output pairs for training a Transformer from the Sentihood dataset
X = [] # Each instance will be a String of text whose sentiment is to be classified
y = [] # Each instance will be a one-hot encoded torch Tensor describing positive or negative sentiment

# Counters for positive and negative instances (used for balancing dataset to have an equal number of each)
total_positive_instances = 0
total_negative_instances = 0

for ds_split in ds.values(): # Looping through each dataset split
  # Each column is read once per split; indexing ds_split["col"][i] inside the
  # loop would re-read the whole column on every iteration.
  for text, opinions in zip(ds_split["text"], ds_split["opinions"]):
    if len(opinions) != 1: # Only instances with one target are retained to simplify task to single-intent prediction
      continue

    # Any sentiment other than "Positive" is treated as negative
    if opinions[0]["sentiment"] == "Positive":
      y_instance = torch.tensor([1.0, 0.0]) # float32 one-hot: [positive, negative]
      total_positive_instances += 1
    else:
      y_instance = torch.tensor([0.0, 1.0])
      total_negative_instances += 1

    X.append(text.strip())
    y.append(y_instance)

assert len(X) == len(y) == total_positive_instances + total_negative_instances

print("Reduced Dataset Length:", len(X))
print("Positive instances:", total_positive_instances)
print("Negative instances:", total_negative_instances)

Reduced Dataset Length: 1864
Positive instances: 1329
Negative instances: 535


In [6]:
# Creating balanced training and test input/output lists.
# Instances are taken in (shuffled) order; for each class the first `cutoff`
# instances go to the train split, the next ones up to `max_class_size` go to
# the test split, and any surplus instances of the larger class are dropped.
X_train = []
X_test = []

y_train = []
y_test = []

train_test_ratio = 0.8 # 80:20 train/test split used

positives = 0
negatives = 0

# NOTE: despite its name this is the size of the SMALLER class, i.e. the
# largest number of instances that can be kept per class while staying balanced.
max_class_size = min(total_negative_instances, total_positive_instances)
cutoff = int(train_test_ratio * max_class_size)

# Built once instead of on every iteration; float32 to match the dtype of the labels in y
positive_label = torch.tensor([1.0, 0.0])

k = 0

# Looping until all instances have been allocated to train or test set.
# The two lengths are summed directly; len(X_train + X_test) would build a new
# concatenated list on every iteration.
while len(X_train) + len(X_test) < (2 * max_class_size):
  if torch.equal(y[k], positive_label): # Checking whether instance belongs to positive or negative class
    positives += 1
    seen_in_class = positives
  else:
    negatives += 1
    seen_in_class = negatives

  if seen_in_class <= cutoff:
    X_train.append(X[k])
    y_train.append(y[k])
  elif seen_in_class <= max_class_size:
    X_test.append(X[k])
    y_test.append(y[k])
  # else: surplus instance of the larger class, intentionally skipped

  k += 1

# Each class contributes exactly `cutoff` train and `max_class_size - cutoff` test instances
assert len(X_train) == len(y_train) == 2 * cutoff
assert len(X_test) == len(y_test) == 2 * (max_class_size - cutoff)

print("Train split size:", len(X_train))
print("Test split size:", len(X_test))

Train split size: 856
Test split size: 214


In [7]:
# Persisting the train and test splits to disk as pickled dicts
# (each dict has the inputs under 'X' and the one-hot labels under 'y')

train_path = data_dir + 'train.pickle'
test_path = data_dir + 'test.pickle'

train_dict = {'X': X_train, 'y': y_train}
test_dict = {'X': X_test, 'y': y_test}

for split_path, split_dict in ((train_path, train_dict), (test_path, test_dict)):
    with open(split_path, 'wb') as split_file:
        pickle.dump(split_dict, split_file)