In [1]:
import re

def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [6]:
review = ". . . plays like a badly edited , 91-minute trailer ( and ) the director can't seem to get a coherent rhythm going . in fact , it doesn't even seem like she tried! Did I like it? Awful . "

print(clean_str(review))

plays like a badly edited , 91 minute trailer \( and \) the director ca n't seem to get a coherent rhythm going in fact , it does n't even seem like she tried ! did i like it \? awful


In [8]:
import numpy as np

def load_data_and_labels(positive_data_file, negative_data_file):
    """
    Loads MR polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    positive_examples, negative_examples = None, None
    
    # Load data from files
    with open(positive_data_file, "r") as positive_file:
        positive_example = [s.strip() for s in positive_file.readlines()]
    
    with open(negative_data_file, "r") as negative_file:
        negative_example = [s.strip() for s in negative_file.readlines()]
     
    # Split by words
    x_text = [clean_str(sent) for sent in positive_examples + negative_examples]
    
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]