In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Relative paths of the JSON files that are loaded into df_train / df_test below.
FTRAIN = 'data/train.json'
FTEST = 'data/test.json'

In [3]:
# Training listings; the 'features' column (a list of strings per row) is what the cells below analyse.
df_train = pd.read_json(FTRAIN)

In [4]:
# Test listings; their 'features' column is counted together with the training one in extract_features.
df_test = pd.read_json(FTEST)

In [5]:
def get_longest_features(df, num_results = 10):
    """Return the rows of ``df`` whose 'name' strings are the longest.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'name' column of strings (for example the table
        returned by ``extract_features``).
    num_results : int
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``num_results`` rows of ``df``, ordered from the longest
        name to the shortest.
    """
    # Operate on the frame that was passed in. The previous version ignored
    # `df` and read the global `feature_counts`, so it failed (or silently
    # used stale data) whenever that global was missing or different.
    name_lengths = df['name'].apply(len)
    longest_index = name_lengths.sort_values(ascending = False).head(num_results).index
    return df.loc[longest_index, :]

In [6]:
def extract_features(feature_parser = lambda x: [x.lower()], dfs = None):
    """Count how often each parsed feature occurs in the 'features' columns.

    Parameters
    ----------
    feature_parser : callable
        Takes one raw feature string and returns an iterable of parsed
        feature names. The default lower-cases the string and keeps it as
        a single feature.
    dfs : iterable of pandas.DataFrame, optional
        Frames whose 'features' column (an iterable of strings per row) is
        scanned. Defaults to the notebook-level ``[df_train, df_test]``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct parsed feature, with columns 'name' and
        'counts'.
    """
    if dfs is None:
        # Looked up at call time so the default keeps the original behaviour.
        dfs = [df_train, df_test]
    features = {}
    for df in dfs:
        for flist in df['features']:
            for ft in flist:
                for ft_parsed in feature_parser(ft):
                    features[ft_parsed] = features.get(ft_parsed, 0) + 1
    # dict.items() exists on both Python 2 and 3; iteritems() is Python 2 only.
    return pd.DataFrame([[k, v] for k, v in features.items()], columns = ['name', 'counts'])

In [29]:
def subparser(x):
    """Split one raw feature string into candidate feature names.

    The string is lower-cased, hyphens are replaced by spaces and the
    result is stripped. Then:

    * if it starts with '{', the values of every ``<digit> = value;``
      entry are returned, with double quotes removed and whitespace
      stripped;
    * otherwise it is split on the bullet character (U+2022) and each
      chunk is further split on sentence-like separators ('.', '!', ';'
      or whitespace followed by more whitespace) and on '*'.

    The returned pieces may still be empty or carry surrounding
    whitespace; ``parser`` cleans them up.
    """
    x = x.lower().replace('-', ' ').strip()
    # startswith() is safe on an empty string; indexing x[0] raised
    # IndexError for features that are empty after stripping ('' or '-').
    if x.startswith('{'):
        return [y.replace('"', '').strip() for y in re.findall(r'(?<=\d\s=\s)([^;]+);', x)]
    bullet_chunks = x.split(u'\u2022')
    # NOTE: hyphens were already replaced by spaces above, so the
    # '\s+-\s+' alternative can no longer match; a spaced hyphen is still
    # split because the resulting run of spaces matches the first alternative.
    return [z for y in bullet_chunks for z in re.split(r'[\.\s!;]!*\s+|\s+-\s+|\s*\*\s*', y)]

def parser(x):
    """Return the non-empty, whitespace-stripped pieces that subparser yields for ``x``."""
    stripped_pieces = (piece.strip() for piece in subparser(x))
    return [piece for piece in stripped_pieces if piece]

In [30]:
# Build the name/counts table using `parser`, which splits compound feature strings into separate names.
feature_counts = extract_features(parser)

In [31]:
# Number of distinct parsed feature names (the table has one row per name).
len(feature_counts)

2632

In [32]:
# Show the 50 longest parsed feature names; unusually long names are likely
# strings the parser failed to split into individual features.
for name in get_longest_features(feature_counts, 50)['name']:
    # print(...) with a single argument behaves the same on Python 2 and 3
    # and matches the print(...) calls used elsewhere in this notebook.
    print(name)

bike room driveway garage garden fitness facility(fee) spa services pool sauna steam room laundry on each flr valet playroom lounge billiards room rooftop deck common storage wifi access
east and west resident lounges reading room outdoor lounges with river views party room chefs kitchen movie screening room tot spot playroom ping pong
bike room driveway garage water filtration fitness room pool sauna laundry on each flr housekeeping valet playroom lounge rooftop deck common storage
bike room garage garden fitness facility(fee) spa services valet playroom lounge party room rooftop deck private storage wifi access business center
residents only fitness center and aerobic room professionally outfitted with a full complement of strength and cardio training equipment
and stainless steel appliances with a dishwasher well maintained hardwood floors add to the warmth and modern feel of the space.
bike room garage fitness facility laundry room housekeeping valet lounge party room rooftop deck 

In [33]:
# Print the 100 most frequent parsed features as "name, count", most common first.
most_common_features = feature_counts.sort_values(by = 'counts', ascending = False).head(100)
for ind, row in most_common_features.iterrows():
    feature_name, feature_count = row['name'], row['counts']
    print("%s, %d" % (feature_name, feature_count))

elevator, 65835
cats allowed, 59195
hardwood floors, 59157
dogs allowed, 55208
doorman, 52514
dishwasher, 52089
laundry in building, 47483
no fee, 45467
fitness center, 33422
laundry in unit, 23753
pre war, 23125
roof deck, 17470
outdoor space, 13415
dining room, 12848
high speed internet, 10667
balcony, 7793
swimming pool, 7154
new construction, 6457
terrace, 5707
exclusive, 5471
loft, 5285
garden/patio, 4894
prewar, 3433
wheelchair access, 3393
common outdoor space, 3294
hardwood, 2696
simplex, 2327
fireplace, 2315
lowrise, 2026
garage, 1892
reduced fee, 1817
laundry room, 1788
furnished, 1674
multi level, 1592
high ceilings, 1537
private outdoor space, 1451
live in super, 1174
parking space, 1045
publicoutdoor, 1020
renovated, 821
pool, 796
on site laundry, 786
laundry, 729
green building, 606
storage, 499
high ceiling, 494
stainless steel appliances, 467
dryer in unit, 426
washer in unit, 426
concierge, 419
newly renovated, 390
on site garage, 358
washer/dryer, 354
light, 349
patio

Let's select the most frequent features and, for each one, a regex that attempts to detect all variations of that feature.

In [19]:
# Maps a canonical feature name to the pattern used to detect its variations.
# A value is either
#   * a single regex string: a feature name matches if the regex is found, or
#   * a two-element list [include_regex, exclude_regex]: a name matches if the
#     first regex is found and the second one is not.
# Patterns are written for lower-cased feature names.
feature_map = {'elevator': 'elevator',
               'cats allowed': r'(?<!\w)cats?(?!\w)|(?<!\w)(?<!no )pets?(?!\w)',
               'dogs allowed': r'(?<!\w)dogs?(?!\w)|(?<!\w)(?<!no )pets?(?!\w)(?!: cats only)',
               'hardwood floors': 'hardwood',
               'doorman': r'(?<!virtual )doorman',
               # raw string: '\w' in a plain literal is an invalid escape sequence on Python 3
               'dishwasher': r'dishwasher|dw(?!\w)',
               'laundry': r'laundry(?! is on the blo)',
               'no fee': 'no fee',
               'fitness center': r'fitness(?! goals)|gym',
               'pre war': r'pre\s?war',
               'roof deck': 'roof',
               'outdoor space': 'outdoor|garden|patio',
               'dining room': 'dining',
               'high speed internet': r'high.*internet',
               'balcony': r'balcon(y|ies)|private.*terrace',
               'terrace': 'terrace',
               'swimming pool': r'pool(?! table)',
               'new construction': 'new construction',
               'exclusive': r'exclusive( rental)?$',
               'loft': r'(?<!sleep )loft(?! bed)',
               'wheelchair access': 'wheelchair',
               'simplex': 'simplex',
               'fireplace': ['fireplace(?! storage)', 'deco'], # looks for first regex, excluding matches of the second regex
               'lowrise': r'low\s?rise',
               'garage': r'garage|indoor parking',
               'reduced fee': r'(reduced|low) fee',
               'furnished': ['(?<!un)furni', 'deck|inquire|terrace'],
               'multi level': r'multi\s?level|duplex',
               'high ceilings': r'(hig?h|tall) .*ceiling',
               'super': r'(live|site).*super',
               'parking': r'(?<!street )(?<!side )parking(?! available nearby)',
               'renovated': 'renovated',
               'green building': 'green building',
               'storage': 'storage',
               'stainless steel appliances': r'stainless.*(appliance|refrigerator)',
               'concierge': 'concierge',
               'light': r'(?<!\w)(sun)?light(?!\w)',
               'exposed brick': 'exposed brick',
               'eat in kitchen': r'eat.*kitchen',
               'granite kitchen': 'granite kitchen',
               'bike room': r'(?<!citi)(?<!citi )bike',
               'walk in closet': r'walk.*closet',
               'marble bath': r'marble.*bath',
               'valet': 'valet',
               'subway': r'subway|trains?(?!\w)',
               'lounge': 'lounge',
               'short term allowed': 'short term',
               'children\'s playroom': r'(child|kid).*room',
               'no pets': 'no pets',
               'central a/c': r'central a|ac central',
               'luxury building': 'luxur',
               'view': r'(?<!\w)views?(?!\w)|skyline',
               'virtual doorman': 'virtual d',
               'courtyard': 'courtyard',
               'microwave': 'microwave|mw',
               'sauna': 'sauna'}

In [141]:
def search_regex(regexes, df = None):
    """Print every feature name, with its count, that matches a regex spec.

    Parameters
    ----------
    regexes : str or sequence of two str
        Either a single regex (a name matches when the regex is found in
        it), or ``[include, exclude]`` (a name matches when ``include`` is
        found and ``exclude`` is not).
    df : pandas.DataFrame, optional
        Table with 'name' and 'counts' columns to search. Defaults to the
        notebook-level ``feature_counts``.

    Prints one "name, count" line per matching row; returns None.
    """
    if df is None:
        df = feature_counts
    try:
        string_types = basestring  # Python 2: covers both str and unicode
    except NameError:
        string_types = str  # Python 3 has no basestring
    # Compile once instead of re-parsing the pattern for every row.
    if isinstance(regexes, string_types):
        include, exclude = re.compile(regexes), None
    else:
        include, exclude = re.compile(regexes[0]), re.compile(regexes[1])

    def matches(name):
        if include.search(name) is None:
            return False
        return exclude is None or exclude.search(name) is None

    for ind, row in df[df['name'].apply(matches)].iterrows():
        print("%s, %d" % (row['name'], row['counts']))

In [181]:
# List every parsed feature name containing 'terrace', to check what the pattern would capture.
search_regex(r'terrace')

sprawling sunfilled 2br w/private terrace, 1
live in superintendent terraces / balconies, 1
private terrace & scenic roof deck, 1
expansive rooftop lounge with outdoor terrace boasting spectacular views, 1
wrap around terrace, 1
2 private terrace, 1
sprawling 2br w/terrace, 1
xl terrace, 1
two private terrace, 1
private rooftop terrace, 1
huge terrace, 1
oversize terrace, 1
an expansive landscaped terrace and sun deck, 3
massive private terrace, 1
terrace, 5707
landscaped terrace with bbq grill, 2
furnished wrap around terrace, 2
huge rear terrace, 1
large private terrace, 1
expansive 2br w/1000 sq ft terrace, 1
scenic private terrace, 2
huge private terrace, 2
furnished sun terrace water filtration, 1
massive 2br home w/ xl terrace, 1
terraces / balconies, 26
/terrace/ washer dryer/ closet space!!!, 1
private large terrace, 1
2 private terraces, 1
outdoor terrace, 6
outdoor grilling terrace, 3
rooftop terrace, 41
an outdoor entertaining terrace, 1
landscaped terrace complete with an o

In [33]:
# Join, with ' # ' separators, all features of the training listings that have
# at least one feature containing 'elevator/laundry' (compared in lower case).
' # '.join(
    feature
    for feature_list in df_train.loc[
        df_train['features'].apply(
            lambda feats: any(re.search('elevator/laundry', f.lower()) is not None for f in feats)
        ),
        'features'
    ]
    for feature in feature_list
)

u'Elevator # Laundry In Building # ELEVATOR/LAUNDRY/ SO CLOSE TO THE 6 $2450!!'

In [32]:
# Show the training rows that have at least one feature containing
# 'elevator/laundry' (compared in lower case).
df_train[
    df_train['features'].apply(
        lambda feats: any(re.search('elevator/laundry', f.lower()) is not None for f in feats)
    )
]

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address
8633,1.0,1,0,2016-06-22 14:10:24,GREAT LOCATION ON THIS TRUE 1 BEDROOM IN MURRA...,,"[Elevator, Laundry In Building, ELEVATOR/LAUND...",low,40.7128,7201950,-74.0059,a4a468c229a6094d3811489361d08819,[https://photos.renthop.com/2/7201950_2d04c9c0...,2450,
