In [3]:
# Mount Google Drive into the Colab runtime at /content/drive. The dataset
# folder used later in this notebook lives under this mount point, so this
# cell has to run before any dataset is loaded or saved.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
import json
import os
import random

For reproducibility, we fix the random seed.

In [21]:
# Seed used by shuffle_dataset so that every run produces the same shuffles.
seed = 42

# Creating New Datasets

We create new datasets either by modifying an existing dataset (reversing, shuffling, or subsampling it) or by combining two datasets.

In [5]:
# Directory on the mounted Drive that holds every dataset file; load_dataset
# and save_dataset join the filename they are given onto this path.
folder = '/content/drive/MyDrive/CS 171/Final Project'

In [8]:
def load_dataset(filename):
  """Load a JSON-lines dataset from the project folder.

  Every non-blank line of the file is parsed as one JSON object, and its
  'text' and 'sentiment' fields are collected.

  Args:
    filename: Name of the dataset file; it is joined onto the module-level
      `folder` path.

  Returns:
    A `(text, labels)` tuple of two parallel lists: the 'text' value and the
    'sentiment' value of each sample, in file order.

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
    KeyError: If a sample lacks the 'text' or 'sentiment' key.
  """
  text = []
  labels = []

  # Explicit encoding so the result does not depend on the platform default.
  with open(os.path.join(folder, filename), 'r', encoding='utf-8') as infile:
    # Iterate over the file object instead of calling readlines(), so the raw
    # lines are never all held in memory next to the parsed samples.
    for line in infile:
      # Tolerate blank lines (e.g. a stray empty line at the end of the file)
      # rather than failing inside json.loads.
      if not line.strip():
        continue
      sample = json.loads(line)
      text.append(sample['text'])
      labels.append(sample['sentiment'])

  return text, labels


def save_dataset(text, labels, filename):
  """Write parallel text/label lists to a JSON-lines file in the project folder.

  One JSON object with the keys 'text' and 'sentiment' is written per line.
  An existing file with the same name is overwritten.

  Args:
    text: Sequence of samples to store under the 'text' key.
    labels: Sequence of labels to store under the 'sentiment' key. Pairing
      uses zip, so it stops at the shorter of `text` and `labels`.
    filename: Name of the output file; it is joined onto the module-level
      `folder` path.
  """
  destination = os.path.join(folder, filename)
  with open(destination, 'w') as outfile:
    for tokens, sentiment in zip(text, labels):
      # One record per line, each terminated by a newline
      json.dump({'text': tokens, 'sentiment': sentiment}, outfile)
      outfile.write('\n')

In [30]:
def reverse_dataset(dataset_filename, output_filename):
  """Reverse the element order of every sample in a dataset and save it.

  Labels are carried over unchanged. After saving, the first reversed sample
  and its label are printed as a quick sanity check.

  Args:
    dataset_filename: Name of the dataset to read via load_dataset.
    output_filename: Name of the file to write via save_dataset.
  """
  samples, sentiments = load_dataset(dataset_filename)

  # Flip each sample in place; every sample must therefore offer .reverse()
  for tokens in samples:
    tokens.reverse()

  save_dataset(samples, sentiments, output_filename)

  # Show the first entry so the result can be eyeballed
  print('Sampling reversed dataset')
  first_sample, first_sentiment = samples[0], sentiments[0]
  print(first_sample, first_sentiment)


def shuffle_dataset(dataset_filename, output_filename):
  """Shuffle the element order inside every sample of a dataset and save it.

  Labels are carried over unchanged. The shuffles are driven by the
  module-level `seed`, so repeated calls on the same input give the same
  output. After saving, the first shuffled sample and its label are printed
  as a quick sanity check.

  Args:
    dataset_filename: Name of the dataset to read via load_dataset.
    output_filename: Name of the file to write via save_dataset.
  """
  # A private generator seeded with `seed` yields the same permutations as
  # random.seed(seed) followed by random.shuffle, but leaves the shared
  # module-level random state untouched for the rest of the notebook.
  rng = random.Random(seed)

  text, labels = load_dataset(dataset_filename)

  # Shuffle the elements of each sample in place
  for text_sample in text:
    rng.shuffle(text_sample)

  save_dataset(text, labels, output_filename)

  # Sample first line to check for consistency
  print('Sampling shuffled dataset')
  print(text[0], labels[0])


def truncate_dataset(ratio, dataset_filename, output_filename):
  """Subsample a dataset by keeping one sample out of every `ratio`, then save it.

  A sample is kept when its index satisfies `index % ratio == 0`; its label
  is kept alongside it. After saving, the first kept sample and the sizes
  before and after are printed as a quick sanity check.

  Args:
    ratio: Spacing between kept samples.
    dataset_filename: Name of the dataset to read via load_dataset.
    output_filename: Name of the file to write via save_dataset.
  """
  text, labels = load_dataset(dataset_filename)

  # Positions that survive the subsampling
  kept = [index for index in range(len(text)) if index % ratio == 0]
  output_text = [text[index] for index in kept]
  output_labels = [labels[index] for index in kept]

  save_dataset(output_text, output_labels, output_filename)

  # Show the first kept entry and how much the dataset shrank
  print('Sampling truncated dataset')
  first_sample, first_label = output_text[0], output_labels[0]
  print(first_sample, first_label)
  print(f'Comparing sizes, Before: {len(text)}, After: {len(output_text)}')


def combine_datasets_2(dataset1_filename, dataset2_filename, output_filename):
  """Merge two datasets by alternating between them position by position.

  The output has as many samples as the shorter input. Even positions
  (0, 2, ...) are filled with the sample at that same position in the first
  dataset, odd positions (1, 3, ...) with the sample at that same position
  in the second dataset. After saving, the first three combined samples are
  printed as a quick sanity check.

  Args:
    dataset1_filename: Name of the dataset that supplies the even positions.
    dataset2_filename: Name of the dataset that supplies the odd positions.
    output_filename: Name of the file to write via save_dataset.
  """
  text1, labels1 = load_dataset(dataset1_filename)
  text2, labels2 = load_dataset(dataset2_filename)

  # Position parity selects the source: 0 -> first dataset, 1 -> second
  sources = ((text1, labels1), (text2, labels2))
  shared_length = min(len(text1), len(text2))

  text, labels = [], []
  for position in range(shared_length):
    source_text, source_labels = sources[position % 2]
    text.append(source_text[position])
    labels.append(source_labels[position])

  save_dataset(text, labels, output_filename)

  # Show the first three entries so the alternation can be eyeballed
  print('Sampling dataset combination')
  for position in range(3):
    print(text[position], labels[position])

Now, we generate the datasets that we want.

In [32]:
# Derive three variants of the cleaned video-game dataset
reverse_dataset(dataset_filename='Video_Games_Clean.json',
                output_filename='Video_Games_Reverse.json')
shuffle_dataset(dataset_filename='Video_Games_Clean.json',
                output_filename='Video_Games_Shuffle.json')
# Keep one sample out of every 500
truncate_dataset(ratio=500,
                 dataset_filename='Video_Games_Clean.json',
                 output_filename='Video_Games_Truncate.json')
# Alternate between the video-game and the financial dataset
combine_datasets_2(dataset1_filename='Video_Games_Clean.json',
                   dataset2_filename='Financial_Clean.json',
                   output_filename='Video_Games_Financial_Combination.json')

Sampling reversed dataset
['great', 'when', 'but', 'of', 'hang', 'get', 'to', 'hard', 'bit', 'game'] 2
Sampling shuffled dataset
['but', 'to', 'hard', 'when', 'hang', 'of', 'great', 'get', 'game', 'bit'] 2
Sampling truncated dataset
['game', 'bit', 'hard', 'to', 'get', 'hang', 'of', 'but', 'when', 'great'] 2
Comparing sizes, Before: 994, After: 496756
Sampling dataset combination
['game', 'bit', 'hard', 'to', 'get', 'hang', 'of', 'but', 'when', 'great'] 2
['technopolis', 'plan', 'to', 'develop', 'in', 'stage', 'area', 'of', 'no', 'le', 'than', '100000', 'square', 'meter', 'in', 'order', 'to', 'host', 'company', 'working', 'in', 'computer', 'technology', 'and', 'telecommunication', 'statement', 'said'] 1
['ok', 'game'] 1
