In [1]:
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
import os
import pandas as pd
import numpy as np
from random import shuffle
import pickle
from text2vector import Text2Vector
import re


In [207]:
class Dataset:
    """In-memory dataset with chainable map/batch/repeat/shuffle/pad steps.

    ``data`` holds the data points as a list. The transformation methods never
    reconfigure the receiver; each returns a new ``Dataset`` carrying the
    updated settings (batch size, repeat count, shuffle window), and every
    setting that a step does not change is carried over unchanged.
    Batches are produced lazily by :meth:`get_iterator`.
    """

    def __init__(self, dataset=None, batch_size=1, repeat=1, shuffle_buffer_size=1):
        """Create an empty dataset, or wrap ``dataset``'s data with new settings.

        Args:
            dataset: Optional ``Dataset`` whose ``data`` list is reused (the
                list object is shared, not copied). ``None`` starts empty.
            batch_size: Number of data points in each yielded batch.
            repeat: Number of passes ``get_iterator`` makes over the data.
            shuffle_buffer_size: Width of the window that is shuffled ahead of
                each batch; 1 leaves the order untouched.
        """
        self.data = [] if dataset is None else dataset.data
        self.__batch_size = batch_size
        self.__repeat = repeat
        self.__shuffle_buffer_size = shuffle_buffer_size
        # Not read by any method of this class; kept so instances expose the
        # same attribute set as before.
        self.__iterator = None

    def _with_data(self, data, batch_size=None):
        """Return a new dataset holding ``data`` that inherits this one's settings."""
        effective_batch = self.__batch_size if batch_size is None else batch_size
        new_dataset = Dataset(None, effective_batch, self.__repeat,
                              self.__shuffle_buffer_size)
        new_dataset.data = data
        return new_dataset

    @staticmethod
    def _pad(sequence, length, padded_value):
        """Truncate ``sequence`` to ``length`` items, right-padding if shorter."""
        # A negative multiplier yields [], so over-long sequences are only cut.
        return list(sequence[:length]) + [padded_value] * (length - len(sequence))

    def map(self, foo):
        """Return a new dataset with ``foo`` applied eagerly to every data point."""
        return self._with_data([foo(data_point) for data_point in self.data])

    def batch(self, batch_size):
        """Return a dataset over the same data that yields ``batch_size`` points per batch."""
        return Dataset(self, batch_size, self.__repeat, self.__shuffle_buffer_size)

    def repeat(self, count):
        """Return a dataset over the same data that is iterated ``count`` times."""
        return Dataset(self, self.__batch_size, count, self.__shuffle_buffer_size)

    def shuffle(self, buffer_size):
        """Return a dataset over the same data with a shuffle window of ``buffer_size``."""
        return Dataset(self, self.__batch_size, self.__repeat, buffer_size)

    def padded_batch(self, batch_size, list_lengths, padded_value):
        """Pad/truncate the data points and set the batch size in one step.

        Args:
            batch_size: Batch size of the returned dataset.
            list_lengths: Either an ``int`` (every data point is itself a
                sequence and is padded/truncated to that length) or a ``tuple``
                with one entry per field of a data point: an ``int`` pads or
                truncates that field, ``None`` passes the field through
                unchanged. Fields beyond ``len(list_lengths)`` are dropped.
            padded_value: Value appended to sequences that are too short.

        Returns:
            A new ``Dataset`` containing the padded data points.

        Raises:
            NotImplementedError: If ``list_lengths`` is neither int nor tuple.
        """
        if isinstance(list_lengths, int):
            new_data = [self._pad(item, list_lengths, padded_value)
                        for item in self.data]
        elif isinstance(list_lengths, tuple):
            new_data = [
                tuple(
                    datapoint[idx] if length is None
                    else self._pad(datapoint[idx], length, padded_value)
                    for idx, length in enumerate(list_lengths)
                )
                for datapoint in self.data
            ]
        else:
            raise NotImplementedError(
                'list_lengths must be an int or a tuple, got %r' % type(list_lengths))

        return self._with_data(new_data, batch_size)

    def get_iterator(self):
        """Yield batches (lists of data points) for every configured repeat.

        Only full batches are produced: trailing data points that do not fill
        a batch are skipped on every pass. Before each batch, the window of
        ``shuffle_buffer_size`` points starting at the batch start is shuffled;
        the resulting order is kept for later passes of the same iterator.
        """
        # Work on a private copy: self.data may be the very list object held
        # by the datasets this one was derived from, and shuffling it in place
        # would silently reorder those datasets as well.
        data = list(self.data)
        batch_size = self.__batch_size
        window = self.__shuffle_buffer_size
        last_start = len(data) - batch_size
        for _ in range(self.__repeat):
            for start in range(0, last_start + 1, batch_size):
                if window > 1:  # a window of one element cannot change the order
                    sample = data[start:start + window]
                    shuffle(sample)
                    data[start:start + window] = sample
                yield data[start:start + batch_size]

    @staticmethod
    def from_csv(filename, columns=None):
        """Build a dataset with one tuple per CSV row.

        Args:
            filename: Path handed to ``pandas.read_csv``.
            columns: List of column names to keep, in that order. Any non-list
                value (including the default ``None``) keeps every column.
        """
        df = pd.read_csv(filename)
        selected = columns if isinstance(columns, list) else df.columns
        datas = [list(df[col]) for col in selected]
        new_dataset = Dataset()
        new_dataset.data = list(zip(*datas))
        return new_dataset

    @staticmethod
    def from_tensor_slices(tensors):
        """Build a dataset by zipping the members of the tuple ``tensors``.

        Data point ``i`` is the tuple of the ``i``-th element of every member.
        """
        assert isinstance(tensors, tuple)
        new_dataset = Dataset()
        new_dataset.data = list(zip(*tensors))
        return new_dataset

    @staticmethod
    def from_pickle_file(filename):
        """Load and return whatever object was pickled into ``filename``.

        SECURITY: unpickling can execute arbitrary code; only use this on
        files you created yourself (e.g. via :meth:`save`).
        """
        with open(filename, 'rb') as source:
            return pickle.load(source)

    def save(self, filename):
        """Pickle this dataset (data and settings) to ``filename``."""
        with open(filename, 'wb') as target:
            pickle.dump(self, target)
    
    

In [40]:
# Load the pickled text2vec model used by standadize_datapoint. The file is
# opened in a `with` block so the handle is closed as soon as loading ends.
# NOTE(review): unpickling presumably needs the Text2Vector class imported at
# the top of the notebook; only load pickles from a trusted source.
with open('text2vec.p', 'rb') as model_file:
    text2vec_model = pickle.load(model_file)

# Maps the label strings found in the data to integer class ids.
LABEL_MAPPING = {
        'positive': 0,
        'neutral': 1,
        'negative': 2
    }
def preprocess_text(doc):
    """Normalise raw text before it is vectorised.

    Steps, in order: lower-case the text, delete numeric literals (optional
    sign, digits, optional decimal part), then replace every URL-like
    substring with the literal token ``URL``.

    Args:
        doc: The input string.

    Returns:
        The normalised string.
    """
    number_re = re.compile(r"[+-]?\d+(?:\.\d+)?")
    url_re = re.compile(
            r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')

    lowered = doc.lower()
    # Numbers are stripped first, so digits inside a URL are gone before the
    # URL pattern is applied.
    without_numbers = number_re.sub('', lowered)
    return url_re.sub('URL', without_numbers)
def standadize_datapoint(datapoint):
    """Convert a ``(text, label)`` pair into ``(vector, class_id)``.

    The text is cleaned with ``preprocess_text`` and passed to
    ``text2vec_model.doc_to_vec`` as a one-element list; the first entry of
    the result is kept. The label is translated through ``LABEL_MAPPING``
    (a ``KeyError`` is raised for labels missing from that mapping).
    """
    raw_text, label_name = datapoint
    cleaned_text = preprocess_text(raw_text)
    vectors = text2vec_model.doc_to_vec([cleaned_text])
    return vectors[0], LABEL_MAPPING[label_name]

In [41]:
# Locations of the training and test CSV files.
ENTROPY_PATH = os.path.join('/dataset', 'entropy_2018')
TRAINING_PATH = os.path.join(ENTROPY_PATH, 'training_set.csv')
TEST_PATH = os.path.join(ENTROPY_PATH, 'test_set.csv')

# Read the training CSV, convert every row with standadize_datapoint, and
# store the converted dataset in 'dataset.p' for later cells.
dataset = Dataset.from_csv(TRAINING_PATH).map(standadize_datapoint)
dataset.save('dataset.p')

In [130]:
# Reload the dataset that an earlier cell saved to 'dataset.p'.
dataset = Dataset.from_pickle_file('dataset.p')

In [218]:
# Toy input: a 5x1 array of random integers in [0, 10).
# No seed is set, so the values differ from run to run.
data = np.random.randint(10, size=(5, 1))

In [219]:
# Display the generated array.
data

array([[8],
       [6],
       [9],
       [6],
       [2]])

In [233]:
# from_tensor_slices asserts that its argument is a tuple, hence the 1-tuple;
# each row of `data` becomes a 1-tuple data point.
dataset = Dataset.from_tensor_slices((data,))

In [221]:
# Inspect the list of data points built from the array.
dataset.data

[(array([8]),), (array([6]),), (array([9]),), (array([6]),), (array([2]),)]

In [234]:
# Disabled alternatives kept from experimentation.
# NOTE(review): the class defines `from_csv`; `fromCsv` below would fail if re-enabled.
# dataset = Dataset.from_tensor_slices((data,))
# dataset = Dataset.fromCsv(TRAINING_PATH, ['sentiment', 'sentence'])
# Pad/truncate field 0 of every data point to length 3 using -1; batch size 2.
# This cell rebinds `dataset`, so re-running it works on the already-padded data.
dataset = dataset.padded_batch(2, (3, ), -1)
# Iterate over the data twice.
dataset = dataset.repeat(2)
# dataset = dataset.shuffle(5)
# dataset = dataset.batch(2)

In [235]:
# get_iterator is a generator function: no batch is produced until next() is called.
iterator = dataset.get_iterator()

In [240]:
# Fetch the next batch. Once every full batch of every repeat has been
# consumed this raises StopIteration, which is the state recorded in the output.
next(iterator)

StopIteration: 

In [23]:
# NOTE(review): `x` is not assigned in any cell visible in this notebook;
# this cell depends on leftover kernel state and fails on a fresh kernel.
len(x[0][0])

10

In [25]:
# NOTE(review): `x` is not assigned in any cell visible in this notebook;
# this cell depends on leftover kernel state and fails on a fresh kernel.
len(x[1][0])

22

In [None]:
# Scratch version of the column selection performed by Dataset.from_csv:
# one Python list of values per selected column.
columns = ['sentiment', 'sentence']
df = pd.read_csv(TRAINING_PATH)
if isinstance(columns, list):
    # The original line was missing the ')' that closes list(...), which made
    # the whole cell a SyntaxError.
    datas = [list(df[col]) for col in columns]
else:
    # Converted to lists as well so both branches yield the same element type.
    datas = [list(df[col]) for col in df.columns]

In [241]:
!jupyter nbconvert --to script dataset.ipynb

[NbConvertApp] Converting notebook dataset.ipynb to script
[NbConvertApp] Writing 5348 bytes to dataset.py
