# Preprocessing

In [1]:
import gzip
import pandas as pd
import numpy as np
import os
import pdb
import pickle
import string
import shutil

from collections import defaultdict

## Preprocessing Class

In [2]:
class preprocessing(object):
    """
    Wrap a reviews dataframe and expose each model feature as a NumPy array.

    The wrapped frame is expected to provide the columns ``categoryID``,
    ``rating``, ``reviewText``, ``reviewTime`` and ``helpful`` (the latter
    holding dict-like entries with an ``'outOf'`` key).  Single-valued
    features are returned as ``(n_rows, 1)`` column vectors so that they can
    be concatenated along ``axis=1``.
    """
    def __init__(self, df):
        """Store the dataframe and build the punctuation-to-space table."""
        super().__init__()
        self.df = df
        # Translation table mapping every ASCII punctuation character to a space.
        self.replace_punctuation = str.maketrans(string.punctuation, ' '*len(string.punctuation))

    @staticmethod
    def text_processor(sentence, table):
        """Translate ``sentence`` with ``table``, split on whitespace, lowercase.

        Returns:
            A list of lowercase tokens.
        """
        # ``str.split()`` without arguments never yields blank tokens, so no
        # additional filtering is required.
        return [word.lower() for word in sentence.translate(table).split()]

    def category_id(self):
        """Return the ``categoryID`` column as an ``(n_rows, 1)`` array."""
        return np.asarray(self.df.categoryID.values).reshape(-1, 1)

    def rating(self):
        """Return the ``rating`` column as an ``(n_rows, 1)`` array."""
        return np.asarray(self.df.rating.values).reshape(-1, 1)

    def review_length(self):
        """Return the character count of each ``reviewText`` entry.

        Returns:
            An ``(n_rows, 1)`` float array of ``len(text)`` values.
        """
        # ``Series.as_matrix()`` was removed in pandas 1.0; ``.values`` is
        # available in both old and new releases.
        documents = self.df.reviewText.values
        lengths = np.array([len(text) for text in documents], dtype=float)
        return lengths.reshape(-1, 1)

    def review_time(self):
        """Return the first three integer fields of each ``reviewTime`` string.

        Punctuation is replaced by spaces, the string is split on whitespace
        and the first three tokens are parsed with ``int``; e.g.
        ``'05 20, 2014'`` becomes ``[5, 20, 2014]``.

        Returns:
            An ``(n_rows, 3)`` integer array (``(0, 3)`` for an empty frame).

        Raises:
            IndexError: if a date string has fewer than three tokens.
            ValueError: if one of the first three tokens is not an integer.
        """
        rows = []
        for date in self.df.reviewTime.values:
            tokens = self.text_processor(date, self.replace_punctuation)
            # Explicit indexing keeps every row exactly three fields wide and
            # fails loudly on malformed dates.
            rows.append([int(tokens[0]), int(tokens[1]), int(tokens[2])])
        # The reshape only matters for an empty frame, where it yields (0, 3)
        # instead of (0,) so the result can still be concatenated column-wise.
        return np.array(rows, dtype=int).reshape(-1, 3)

    def out_of(self):
        """Return the ``'outOf'`` entry of every ``helpful`` record.

        Returns:
            An ``(n_rows, 1)`` ``int32`` array.
        """
        votes = np.asarray(
            [entry['outOf'] for entry in self.df.helpful.values],
            dtype=np.int32,
        )
        return votes.reshape(-1, 1)

## Helper Functions

In [3]:
def readGz(f):
    """Yield one evaluated Python object per line of the gzip file ``f``.

    The file is opened in gzip's default binary read mode and is closed when
    the generator is exhausted or closed.
    """
    with gzip.open(f) as handle:
        for line in handle:
            # NOTE(security): ``eval`` executes arbitrary code contained in
            # the file.  Only use on trusted data; if every line is a plain
            # literal, ``ast.literal_eval`` would be a safer replacement.
            yield eval(line)

def parse(path):
    """Yield one evaluated Python object per line of the gzip file at ``path``.

    The file is opened in binary read mode and is closed when the generator
    is exhausted or closed.
    """
    with gzip.open(path, 'rb') as handle:
        for line in handle:
            # NOTE(security): ``eval`` executes arbitrary code contained in
            # the file.  Only use on trusted data; if every line is a plain
            # literal, ``ast.literal_eval`` would be a safer replacement.
            yield eval(line)

def getDF(path):
    """Build a DataFrame with one row per record yielded by ``parse(path)``.

    Rows are keyed by their 0-based position in the file.
    """
    records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records, orient='index')

## Load the Zipped Dataframes

In [4]:
# Both gzipped inputs are expected directly inside the current working directory.
parent_path = ''.join([os.getcwd(), '/'])
zipped_train_path, zipped_test_path = (
    parent_path + archive for archive in ('train.json.gz', 'test_Helpful.json.gz')
)

In [5]:
# Read both gzipped dumps into DataFrames, train first and then test.
print('Loading zipped dataframes...')
train_zip, test_zip = (
    getDF(archive) for archive in (zipped_train_path, zipped_test_path)
)

Loading zipped dataframes...


In [6]:
# Output folders for the processed train and test artifacts.
train_path, test_path = (
    parent_path + folder for folder in ('train_data/', 'test_data/')
)

In [7]:
def folder_setup(path):
    """Create an empty directory at ``path``, deleting any existing tree first.

    Progress messages are printed for the removal (when it happens) and for
    the creation.
    """
    already_there = os.path.exists(path)
    if already_there:
        print('Removing old folder %s' % path)
        shutil.rmtree(path)
    # Whether or not something was removed, the folder is created afresh.
    print('Making new folder %s' % path)
    os.mkdir(path)

In [8]:
# Recreate both output folders from scratch; any existing contents are deleted.
folder_setup(train_path)
# Blank line to separate the two folders' log messages.
print()
folder_setup(test_path)

Removing old folder /Users/cheng-haotai/Documents/Projects_Data/Amazon_Helpful/train_data/
Making new folder /Users/cheng-haotai/Documents/Projects_Data/Amazon_Helpful/train_data/

Removing old folder /Users/cheng-haotai/Documents/Projects_Data/Amazon_Helpful/test_data/
Making new folder /Users/cheng-haotai/Documents/Projects_Data/Amazon_Helpful/test_data/


## Assemble Features

In [9]:
def true_scores(helpful_list):
    """Collect the ``'nHelpful'`` value of every entry into a 1-D NumPy array."""
    labels = [entry['nHelpful'] for entry in helpful_list]
    return np.asarray(labels)

In [10]:
def get_preprocessed_data(df):
    """Assemble the feature matrix for ``df`` and return it as a DataFrame.

    Each feature is extracted through a ``preprocessing`` wrapper and the
    resulting arrays are joined column-wise.
    """
    features = preprocessing(df)
    # Column order: category id, rating, review length, review-time fields, outOf.
    blocks = (
        features.category_id(),
        features.rating(),
        features.review_length(),
        features.review_time(),
        features.out_of(),
    )
    return pd.DataFrame(data=np.concatenate(blocks, axis=1))

In [11]:
# Turn both raw frames into numeric feature tables, train first and then test.
print('Preprocessing zipped datasets...')
train_df, test_df = map(get_preprocessed_data, (train_zip, test_zip))

Preprocessing zipped datasets...


In [12]:
# Show the (rows, columns) shape of each processed table, one per line.
print(train_df.shape, test_df.shape, sep='\n')

(200000, 7)
(14000, 7)


In [13]:
# Get labels: the 'nHelpful' value of each training row's `helpful` entry.
train_labels = true_scores(train_zip.helpful)

In [14]:
# Show the shape of the label array.
print(train_labels.shape)

(200000,)


## Save Dataframes and Labels

In [15]:
# Destinations for the processed features (CSV) and the training labels (pickle).
path_train_df, path_train_labels = (
    train_path + filename
    for filename in ('train_processed.csv', 'train_labels.pickle')
)
path_test_df = test_path + 'test_processed.csv'

In [16]:
# Write the processed feature tables to CSV.
# NOTE(review): index=None presumably suppresses the row-index column (it is
# falsy like index=False) — confirm against the pandas version in use.
train_df.to_csv(path_train_df, index=None)
test_df.to_csv(path_test_df, index=None)

In [17]:
# Serialize the training labels with the highest pickle protocol available
# to this interpreter.
with open(path_train_labels, 'wb') as handle:
    pickle.dump(train_labels, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [18]:
# Read the CSVs back in to check what was written to disk.
train_load, test_load = (
    pd.read_csv(csv_file) for csv_file in (path_train_df, path_test_df)
)

In [19]:
# Preview the first rows of the reloaded test features.
test_load.head()

Unnamed: 0,0,1,2,3,4,5,6
0,0.0,3.0,147.0,7.0,15.0,2011.0,2.0
1,0.0,4.0,144.0,7.0,17.0,2013.0,0.0
2,0.0,5.0,112.0,12.0,8.0,2013.0,1.0
3,0.0,5.0,666.0,11.0,22.0,2012.0,1.0
4,0.0,4.0,190.0,4.0,1.0,2014.0,0.0


In [20]:
# Preview the first rows of the reloaded training features.
train_load.head()

Unnamed: 0,0,1,2,3,4,5,6
0,0.0,3.0,119.0,5.0,20.0,2014.0,0.0
1,0.0,4.0,269.0,2.0,7.0,2013.0,0.0
2,0.0,3.0,144.0,5.0,13.0,2014.0,2.0
3,0.0,4.0,174.0,5.0,25.0,2014.0,0.0
4,0.0,5.0,424.0,7.0,30.0,2013.0,1.0


In [21]:
# Read the pickled labels back from disk.
# NOTE: pickle.load can execute arbitrary code; only load files from a
# trusted source.
with open(path_train_labels, 'rb') as handle:
    labels_load = pickle.load(handle)

In [22]:
# Show the shape of the labels read back from the pickle file.
print(labels_load.shape)

(200000,)
