In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [None]:
from renthop.preprocessing import loaders

In [9]:
# Re-execute the already-imported `loaders` module via the Python 2 builtin
# `reload`, e.g. to pick up edits to its source without restarting the kernel.
reload(loaders)

<module 'renthop_preprocessing.loaders' from 'renthop_preprocessing/loaders.py'>

In [6]:
# Column lists used by the pipelines below, named once so the wiring stays readable.
log_columns = ['price_per_bedroom', 'price', 'price_per_bathroom']
main_columns = [
    'listing_id', 'bathrooms', u'bedrooms', 'latitude', 'longitude', 'price',
    'month', 'day_of_month', 'hour', 'day_of_week', 'price_per_bathroom',
    'price_per_bedroom', 'num_features', 'features_len', 'num_photos',
]
merged_pipelines = ['main', 'features', 'sentiment']
response_columns = ['high', 'medium', 'low']

json_loader = loaders.JSONLoader()
preprocessor = loaders.Preprocessor()

# 'main' pipeline: JSON loader, then the date/time, simple-predictor,
# log-transform and column-selection operations, in that order.
preprocessor.with_pipeline('main').set_loader(json_loader)
(preprocessor
 .add_operation(loaders.DateTimeExtractor())
 .add_operation(loaders.NewSimplePredictors()))
preprocessor.add_operation(loaders.LogTransform(log_columns))
preprocessor.add_operation(loaders.Selector(main_columns))

# The same merger instance is registered as the consumer of the 'main',
# 'features' and 'sentiment' pipelines and joins them on 'listing_id'.
merger = loaders.PandasColumnMerger(merged_pipelines, on='listing_id')
preprocessor.set_consumer(merger)

(preprocessor
 .with_pipeline('features')
 .set_loader(loaders.CSVLoader('data/features_train.csv', 'data/features_test.csv')))
preprocessor.set_consumer(merger)

(preprocessor
 .with_pipeline('sentiment')
 .set_loader(loaders.CSVLoader('data/sentiment_train.csv', 'data/sentiment_test.csv')))
preprocessor.set_consumer(merger)

# 'merged' pipeline: reads from the merger, drops the join key, converts to an
# ndarray and applies scikit-learn's StandardScaler.
(preprocessor
 .with_pipeline('merged')
 .set_loader(merger)
 .add_operation(loaders.ColumnDrop('listing_id')))
(preprocessor
 .add_operation(loaders.ToNdarray())
 .add_operation(preprocessing.StandardScaler()))

# 'response' pipeline: the 'interest_level' field, dummified into three columns.
preprocessor.with_pipeline('response').set_loader(json_loader.select_loader('interest_level'))
(preprocessor
 .add_operation(loaders.Dummifier(output_cols=response_columns))
 .add_operation(loaders.ToNdarray()))

<renthop_preprocessing.loaders.Preprocessor at 0x7ff6b890cf90>

In [7]:
# Run the configured pipelines, then pull out the two results used from here on:
# the 'merged' pipeline output as X and the 'response' pipeline output as y.
data = preprocessor.load_and_transform()
X, y = data['merged'], data['response']

In [8]:
# Sanity-check the dimensions of X and y.
# The bare `print X.shape, y.shape` statement is a syntax error on Python 3;
# formatting the two shapes explicitly yields the same "(rows, cols) (rows, cols)"
# text on both Python 2 and Python 3.
print('%s %s' % (X.shape, y.shape))

(49352, 80) (49352, 3)


In [9]:
# Display the first five rows of X to eyeball the transformed values.
X[:5]

array([[ 0.57401627,  1.3079381 , -0.90071619,  0.94608301, -0.21414523,
         1.1949383 ,  1.06188548,  0.63054681,  0.60392284, -0.67816544,
        -1.48334253, -1.38362455, -1.30596185, -0.16709934, -0.21430261,
        -0.02987223, -0.11925235, -0.12754554, -0.05178646, -0.03795678,
        -0.22519593, -0.05428384, -0.60709953, -0.04004138, -0.05237322,
        -0.10478552, -0.25897929, -1.06695545, -0.04895628, -0.05409577,
        -0.21107602, -0.05178646, -0.13964567, -0.06200286, -0.04054589,
        -0.12116054, -0.75996554, -0.14671995, -0.05352768, -0.52045923,
        -0.04321621, -0.06583799, -0.34155843, -0.11793463, -0.86048996,
        -0.25906837, -0.0515894 , -0.04957621, -0.17023888, -0.08078583,
        -0.02918483, -0.30976784, -0.07319575, -0.99809718, -0.44331497,
        -0.96856022, -0.02701825, -0.02987223, -0.13690613, -0.4111729 ,
        -0.11205543, -0.23620595, -0.8545838 , -0.0608395 , -0.04345089,
        -0.12012326, -0.90233684, -0.12966208, -0.0

Below are the initial experiments, followed by the conversion of the listing `features` into dummy (indicator) columns that are saved to `.csv` files.

In [3]:
# Columns passed to the LogTransform operation of the 'main' pipeline.
log_columns = ['price_per_bedroom', 'price', 'price_per_bathroom']

json_loader = loaders.JSONLoader()
preprocessor = loaders.Preprocessor()

# 'main' pipeline: JSON loader followed by the date/time, simple-predictor and
# log-transform operations.
preprocessor.with_pipeline('main').set_loader(json_loader)
(preprocessor
 .add_operation(loaders.DateTimeExtractor())
 .add_operation(loaders.NewSimplePredictors()))
preprocessor.add_operation(loaders.LogTransform(log_columns))

# 'features' pipeline: the 'features' field of the JSON data run through
# FeaturesDummifier. Kept as the last expression so the cell displays its result.
(preprocessor
 .with_pipeline('features')
 .set_loader(json_loader.select_loader('features'))
 .add_operation(loaders.FeaturesDummifier()))

<renthop_preprocessing.loaders.Preprocessor at 0x7f0e3c76e250>

In [4]:
# Execute the configured pipelines; the cells below read the result by
# pipeline name (data['main'] and data['features']).
data = preprocessor.load_and_transform()

In [25]:
# Cast the listing ids to integers on the loaded 'main' frame.
data['main']['listing_id'] = data['main']['listing_id'].astype('int')

# Column order of the output: the id first, then every dummy column.
columns = ['listing_id'] + list(data['features'].columns)

# Stack the id column next to the dummy columns, then cast the whole table to
# integers in one call.
stacked = np.hstack([data['main'][['listing_id']], data['features']])
features = pd.DataFrame(stacked, columns=columns).astype('int')

In [27]:
# Write the feature table to CSV without the DataFrame index. This is the path
# passed as the first argument of the CSVLoader for the 'features' pipeline.
features.to_csv('data/features_train.csv', index = False)

In [26]:
# Preview the first rows of the feature table (DataFrame.head shows five by default).
features.head()

Unnamed: 0,listing_id,feature_exclusive,feature_virtual_doorman,feature_furnished,feature_lowrise,feature_bike_room,feature_no_pets,feature_terrace,feature_valet,feature_fitness_center,...,feature_super,feature_new_construction,feature_dishwasher,feature_light,feature_central_a/c,feature_reduced_fee,feature_dogs_allowed,feature_high_ceilings,feature_green_building,feature_cats_allowed
0,7211212,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7150865,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,1
2,6887163,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,1
3,6888711,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6934781,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# Re-run the pipelines with test=True and export the result the same way as
# the training feature table.
data = preprocessor.load_and_transform(test=True)

# Cast the listing ids to integers on the loaded 'main' frame.
data['main']['listing_id'] = data['main']['listing_id'].astype('int')

# Column order of the output: the id first, then every dummy column.
columns = ['listing_id'] + list(data['features'].columns)

# Stack the id column next to the dummy columns, then cast the whole table to
# integers in one call.
stacked = np.hstack([data['main'][['listing_id']], data['features']])
features = pd.DataFrame(stacked, columns=columns).astype('int')

# Written without the DataFrame index.
features.to_csv('data/features_test.csv', index=False)