In [1]:
import base64
import datetime
import json
import os
import random
import time

import scipy
from scipy import ndimage
import numpy as np
import pandas as pd
import sklearn
import matplotlib as mpl
import matplotlib.pyplot as plt
from pylab import rcParams
import seaborn as sns
import statsmodels as sm
from pylab import rcParams
from pylab import *
from matplotlib.dates import date2num , DateFormatter
from PIL import Image

# Seed NumPy's global random generator with a fixed value.
np.random.seed(1337)

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Plot styling. Statement order is kept as-is: sns.set() changes matplotlib
# rcParams, so the figure-size override is applied after it.
sns.set(font_scale=1.0)
rcParams['figure.figsize'] = 8, 6
sns.set_style('whitegrid')
sns.set_palette(sns.color_palette('muted'))



In [2]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

# Load the Data

In [3]:
# Read the raw listings export and label its index as `id`.
d = pd.read_json('tmp/raw_data.json')
d.index.name = 'id'
# Keep only the `features` column; the double brackets keep `d` a DataFrame
# so that derived columns can be added to it in later cells.
d = d[['features']]
# Call form of print: identical output on Python 2, and valid syntax on
# Python 3 (the statement form `print len(d)` is a SyntaxError there).
print(len(d))

124011


In [4]:
# Show the dtype of each column currently in `d`.
d.dtypes

features    object
dtype: object

In [5]:
# Preview the first few raw feature lists.
d['features'].head()

id
0       [Elevator, Laundry in Building, Laundry in Uni...
1                   [Pre-War, Dogs Allowed, Cats Allowed]
10                                                     []
100                           [Doorman, Elevator, No Fee]
1000    [Roof Deck, Balcony, Elevator, Laundry in Buil...
Name: features, dtype: object

# Clean Features

In [6]:
def _to_token(label):
    """Lower-case `label` and replace each space with an underscore."""
    return label.lower().replace(' ', '_')


# Normalise every label of every listing; each cell stays a list.
d['features'] = d['features'].apply(
    lambda labels: [_to_token(label) for label in labels])

# Number of Features

In [7]:
# Length of each listing's feature list.
d['num_features'] = d['features'].apply(len)

# Most Frequent Features

In [8]:
# Pool every listing's labels into one flat list (listing order preserved),
# then tally how often each label occurs.
flat_features = []
for listing_features in d['features']:
    flat_features.extend(listing_features)
flat_feature_counts = Counter(flat_features)
flat_feature_counts.most_common(20)

[(u'elevator', 65833),
 (u'cats_allowed', 59194),
 (u'hardwood_floors', 59155),
 (u'dogs_allowed', 55207),
 (u'doorman', 52505),
 (u'dishwasher', 52035),
 (u'laundry_in_building', 47483),
 (u'no_fee', 45450),
 (u'fitness_center', 33420),
 (u'laundry_in_unit', 23752),
 (u'pre-war', 23111),
 (u'roof_deck', 16466),
 (u'outdoor_space', 13414),
 (u'dining_room', 12847),
 (u'high_speed_internet', 10622),
 (u'balcony', 7793),
 (u'swimming_pool', 7153),
 (u'new_construction', 6457),
 (u'terrace', 5707),
 (u'exclusive', 5470)]

In [9]:
def _most_common_names(counts, k):
    """Return the set of the `k` highest-count keys of the Counter `counts`."""
    return {name for name, _count in counts.most_common(k)}


top_10_features = _most_common_names(flat_feature_counts, 10)
top_30_features = _most_common_names(flat_feature_counts, 30)

In [10]:
def _top_hits(listing_features, top_set):
    """Number of distinct labels in `listing_features` that are in `top_set`."""
    return len(top_set.intersection(set(listing_features)))


def _top_coverage(listing_features, top_set, top_size):
    """Share of the top set present in the listing; `top_size` is a float."""
    return _top_hits(listing_features, top_set) / top_size


def _top_ratio(listing_features, top_set):
    """Top-set hits divided by the listing's label count; 0 for an empty list."""
    if len(listing_features) > 0:
        return _top_hits(listing_features, top_set) / (1.0 * len(listing_features))
    return 0


# Columns are created in the same order as before: both coverages, then both ratios.
feature_lists = d['features']
d['top_10_feature_coverage'] = feature_lists.apply(
    _top_coverage, args=(top_10_features, 10.0))
d['top_30_feature_coverage'] = feature_lists.apply(
    _top_coverage, args=(top_30_features, 30.0))
d['top_10_feature_ratio'] = feature_lists.apply(
    _top_ratio, args=(top_10_features,))
d['top_30_feature_ratio'] = feature_lists.apply(
    _top_ratio, args=(top_30_features,))

In [11]:
# Inspect the first ten rows, including the derived count/coverage/ratio columns.
d.head(10)

Unnamed: 0_level_0,features,num_features,top_10_feature_coverage,top_30_feature_coverage,top_10_feature_ratio,top_30_feature_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,"[elevator, laundry_in_building, laundry_in_uni...",6,0.5,0.2,0.833333,1.0
1,"[pre-war, dogs_allowed, cats_allowed]",3,0.2,0.1,0.666667,1.0
10,[],0,0.0,0.0,0.0,0.0
100,"[doorman, elevator, no_fee]",3,0.3,0.1,1.0,1.0
1000,"[roof_deck, balcony, elevator, laundry_in_buil...",10,0.5,0.333333,0.5,1.0
10000,"[doorman, elevator, fitness_center, cats_allow...",5,0.5,0.166667,1.0,1.0
100000,"[common_outdoor_space, cats_allowed, dogs_allo...",14,0.9,0.433333,0.642857,0.928571
100001,"[fireplace, dining_room, doorman, elevator, la...",11,0.9,0.366667,0.818182,1.0
100002,"[hardwood_floors, new_construction]",2,0.1,0.066667,0.5,1.0
100003,"[fireplace, pre-war, laundry_in_building, dish...",5,0.3,0.166667,0.6,1.0


# Word Counts

In [12]:
# One space-separated string of labels per listing, as input for CountVectorizer.
d['features_s'] = d['features'].apply(' '.join)

In [13]:
NUM_COUNT_FEATURES = 200
cv = CountVectorizer(stop_words='english', max_features=NUM_COUNT_FEATURES)
cv.fit(d.features_s)

# Invert term -> column position into column position -> term.
# `.items()` exists on Python 2 and 3; `.iteritems()` is Python 2 only.
voc = {v: k for k, v in cv.vocabulary_.items()}

# Build the count frame on d's own index. With a default 0..N-1 index,
# assigning its columns into `d` aligns by label, and d's `id` index is not in
# positional order (0, 1, 10, 100, ...), so rows would receive the counts of
# other listings. Columns are sized by the fitted vocabulary, which may hold
# fewer than NUM_COUNT_FEATURES terms.
dc = pd.DataFrame(
    cv.transform(d.features_s).toarray(),
    index=d.index,
    columns=['features_count_' + voc[i] for i in range(len(voc))])

# Attach all count columns in one step. Dropping any previous copies first
# keeps the cell idempotent when it is re-run.
d = pd.concat([d.drop(dc.columns, axis=1, errors='ignore'), dc], axis=1)

In [14]:
# Transposed preview: one row per column of `d`, one column per listing.
d.head(20).transpose()

id,0,1,10,100,1000,10000,100000,100001,100002,100003,100004,100005,100006,100007,100008,100009,10001,100010,100011,100012
features,"[elevator, laundry_in_building, laundry_in_uni...","[pre-war, dogs_allowed, cats_allowed]",[],"[doorman, elevator, no_fee]","[roof_deck, balcony, elevator, laundry_in_buil...","[doorman, elevator, fitness_center, cats_allow...","[common_outdoor_space, cats_allowed, dogs_allo...","[fireplace, dining_room, doorman, elevator, la...","[hardwood_floors, new_construction]","[fireplace, pre-war, laundry_in_building, dish...","[laundry_in_building, dishwasher, hardwood_flo...","[dogs_allowed, cats_allowed]","[loft, hardwood_floors]","[hardwood_floors, no_fee]","[cats_allowed, dogs_allowed]","[dining_room, doorman, elevator, garden/patio,...","[laundry_in_unit, private_outdoor_space, no_fee]","[doorman, pre-war]","[pre-war, dogs_allowed, cats_allowed]","[roof_deck, doorman, elevator, pre-war, laundr..."
num_features,6,3,0,3,10,5,14,11,2,5,4,2,2,2,2,10,3,2,3,5
top_10_feature_coverage,0.5,0.2,0,0.3,0.5,0.5,0.9,0.9,0.1,0.3,0.3,0.2,0.1,0.2,0.2,0.7,0.2,0.1,0.2,0.3
top_30_feature_coverage,0.2,0.1,0,0.1,0.333333,0.166667,0.433333,0.366667,0.0666667,0.166667,0.1,0.0666667,0.0666667,0.0666667,0.0666667,0.333333,0.0666667,0.0666667,0.1,0.166667
top_10_feature_ratio,0.833333,0.666667,0,1,0.5,1,0.642857,0.818182,0.5,0.6,0.75,1,0.5,1,1,0.7,0.666667,0.5,0.666667,0.6
top_30_feature_ratio,1,1,0,1,1,1,0.928571,1,1,1,0.75,1,1,1,1,1,0.666667,1,1,1
features_s,elevator laundry_in_building laundry_in_unit d...,pre-war dogs_allowed cats_allowed,,doorman elevator no_fee,roof_deck balcony elevator laundry_in_building...,doorman elevator fitness_center cats_allowed d...,common_outdoor_space cats_allowed dogs_allowed...,fireplace dining_room doorman elevator laundry...,hardwood_floors new_construction,fireplace pre-war laundry_in_building dishwash...,laundry_in_building dishwasher hardwood_floors...,dogs_allowed cats_allowed,loft hardwood_floors,hardwood_floors no_fee,cats_allowed dogs_allowed,dining_room doorman elevator garden/patio pre-...,laundry_in_unit private_outdoor_space no_fee,doorman pre-war,pre-war dogs_allowed cats_allowed,roof_deck doorman elevator pre-war laundry_in_...
features_count_24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
features_count__balconies,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
features_count__chef,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Save the Data

In [15]:
# Remove the list column and its joined-string form from `d` in place.
d.drop(['features', 'features_s'], axis=1, inplace=True)

In [16]:
# Write the remaining columns of `d` to disk as CSV.
d.to_csv('tmp/features_features.csv')