# Part 1: Prepare Dataset

This Python notebook corresponds directly to Section 4.3 of the final thesis report.

### Load Required Libraries

In [None]:
import os
import json
import sys
import csv
import re
import torch
import random
import pandas as pd
import numpy as np

from random import shuffle
from tqdm import tqdm

### Mount Google Drive

In [None]:
# Mount Google Drive at /content/gdrive; the data and model paths used by this
# notebook live under that mount point. Requires the Google Colab runtime,
# which provides the `google.colab` package.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Project directories on the mounted Google Drive. The data and model
# folders are derived from the project root so the root is spelled only once.
basepath = '/content/gdrive/MyDrive/ljmu-ms-thesis/'
datapath = basepath + 'data/'
modelpath = basepath + 'model/'

### Download Dataset from GitHub

In [None]:
# Download the dataset. Required only for the first run.
# !wget https://github.com/ef2020/SarcasmAmazonReviewsCorpus/raw/master/Ironic.rar -P /content/gdrive/MyDrive/ljmu-ms-thesis/data
# !wget https://github.com/ef2020/SarcasmAmazonReviewsCorpus/raw/master/Regular.rar -P /content/gdrive/MyDrive/ljmu-ms-thesis/data

In [None]:
# Unarchive the data. Required only for the first run.
# os.chdir(datapath)
# !unrar x "/content/gdrive/MyDrive/ljmu-ms-thesis/data/Ironic.rar"
# !unrar x "/content/gdrive/MyDrive/ljmu-ms-thesis/data/Regular.rar"

### Transform Data

In [None]:
# Utility to clean data
def clean_string(text):
    """Normalise a raw review string and cap it at 512 characters.

    The following rewrites are applied in order:

    1. URL-like spans (anything containing ``://``) are replaced by a
       placeholder token ``ent<N>``, where ``N`` is a random integer in
       ``[0, 1000]``. The number is drawn once per call, so every URL in the
       same text receives the same token, and the random generator is
       advanced even when the text contains no URL.
    2. Every ``#`` character is replaced by a space.
    3. ``:name:``-style smiley codes are removed.
    4. Runs of spaces are collapsed to a single space.

    The result is stripped of surrounding whitespace and truncated to its
    first 512 characters.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string, at most 512 characters long.
    """
    url_token = 'ent' + str(random.randint(0, 1000))

    # (pattern, replacement) pairs; order matters because later patterns
    # operate on the output of earlier ones.
    substitutions = (
        (r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', url_token),
        (r'\#', ' '),
        (r'\:\S+\:', ''),
        (r' +', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Slicing is a no-op for strings that are already short enough.
    return text.strip()[:512]

In [None]:
# Load Dataset
def load_dataset(paths=None, random_state=None):
    """Read the review files, clean their text and return a shuffled dataset.

    Every ``.txt`` file in each directory is read with ``cp1252`` encoding,
    the text between ``<REVIEW>`` and ``</REVIEW>`` is extracted and passed
    through ``clean_string``. Reviews found under the ``'regular'`` key are
    labelled ``0``; reviews under any other key are labelled ``1``.

    Args:
        paths: Optional mapping from a category name to the directory that
            holds its review files. Defaults to the ``Regular`` and
            ``Ironic`` folders of the thesis data directory on Google Drive.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. ``None`` (the default) keeps the
            shuffle non-deterministic.

    Returns:
        A dict with two equally long lists: ``'data'`` (cleaned review
        strings) and ``'label'`` (``0`` or ``1``), in shuffled order.

    Raises:
        ValueError: If a ``.txt`` file contains no ``<REVIEW>...</REVIEW>``
            section.
    """
    if paths is None:
        paths = {
            'regular': '/content/gdrive/MyDrive/ljmu-ms-thesis/data/Regular/',
            'ironic': '/content/gdrive/MyDrive/ljmu-ms-thesis/data/Ironic/',
        }

    review_pattern = re.compile(r'<REVIEW>(.*)</REVIEW>')
    dataset = {'data': [], 'label': []}

    for pathname, directory in paths.items():
        label = 0 if pathname == 'regular' else 1
        # os.listdir order is arbitrary; sorting makes the pre-shuffle order
        # stable so that a fixed random_state yields a reproducible result.
        for filename in sorted(os.listdir(directory)):
            if not filename.endswith('.txt'):
                continue
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r', encoding='cp1252') as f:
                # Join lines with a space: '.' in the pattern does not match
                # newlines, and joining without a separator would fuse the
                # last word of one line with the first word of the next.
                # clean_string collapses any resulting repeated spaces.
                single_line = ' '.join(line.strip() for line in f)
            match = review_pattern.search(single_line)
            if match is None:
                raise ValueError(
                    f'No <REVIEW>...</REVIEW> section found in {file_path}'
                )
            dataset['data'].append(clean_string(match.group(1)))
            dataset['label'].append(label)

    # Shuffle the rows so the two classes are interleaved.
    df = pd.DataFrame.from_dict(dataset)
    df = df.sample(frac=1, random_state=random_state)
    return df.to_dict('list')

In [None]:
# read data from dataset, convert it to json and save the file
def process_dataset():
    """Serialise the loaded dataset as JSON lines and save it to disk.

    Each example becomes one JSON object with the keys ``'sentence'`` (the
    lower-cased review text) and ``'label'``. The objects are written one per
    line to ``amazon_data.json`` inside ``datapath``.

    Returns:
        The list of JSON strings that were written, one per example.
    """
    dataset = load_dataset()
    outfile = os.path.join(datapath, 'amazon_data.json')

    content = []
    for index, sentence in enumerate(tqdm(dataset['data'])):
        record = {
            'sentence': sentence.lower(),
            'label': dataset['label'][index],
        }
        content.append(json.dumps(record))

    # Write the content to the output file, one JSON object per line.
    with open(outfile, "w") as f:
        f.writelines(line + '\n' for line in content)
    return content

# Run the pipeline: load and clean the reviews, write amazon_data.json under
# datapath, and keep the returned list of JSON lines in `content`.
content = process_dataset()