In [1]:
import base64
import datetime
import json
import os
import random
import re
import time

import scipy
from scipy import ndimage
import numpy as np
import pandas as pd
import sklearn
import matplotlib as mpl
import matplotlib.pyplot as plt
from pylab import rcParams
import seaborn as sns
import statsmodels as sm
from pylab import rcParams
from pylab import *
from matplotlib.dates import date2num , DateFormatter
from PIL import Image

np.random.seed(1337)

%matplotlib inline
sns.set(font_scale=1.0)
rcParams['figure.figsize'] = 8, 6
sns.set_style('whitegrid')
sns.set_palette(sns.color_palette('muted'))
pd.options.display.max_colwidth = 10000



In [2]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

# Load the Data

In [3]:
d = pd.read_json('tmp/raw_data.json')
d.index.name = 'id'
d = d[['description']]
print len(d)

124011


In [4]:
d.dtypes

description    object
dtype: object

# Some Examples

In [5]:
d.head()

Unnamed: 0_level_0,description
id,Unnamed: 1_level_1
0,"Large with awesome terrace--accessible via bedroom and living room. Unique find in the LES.Apartment Features:-Large terrace via bedroom and living room-Hardwood floors-Newly renovated -Granite counter top-Breakfast Bar-Ample counter space and storage-Dishwasher-Great Lighting Neighborhood Features:-A few blocks from Whole Foods-1 block from the J, Z and M subway-All the restaurants and night life the Lower East Side is known for (Hotel Chantel, DL, Pianos)Call/txt/Email James to set up a showing:kagglemanager@renthop.com<br /><br /><br /><br /><br /><br /><p><a website_redacted"
1,"Prime Soho - between Bleecker and Houston - Newly renovated, with stainless steel appliances, hardwood floors, marble bath. Convenient to world class restaurants, shopping, galleries. Subway C,E,B,D,F,M. To schedule a viewing call Andrew today."
10,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy These Following Apartment Features As You Rent Here? Modern Designed Bathroom w/ a Deep Spa Soaking Tub? Room to Room AC/Heat? Real Oak Hardwood Floors? Rain Forest Shower Head? SS steel Appliances w/ Chef Gas Cook Oven & LG Fridge? washer /dryer in the apt? Cable Internet Ready? Granite Counter Top Kitchen w/ lot of cabinet storage spaceIt's Just A Few blocks To L Train<br /><br />Don't miss out!<br /><br />We have several great apartments in the immediate area.<br /><br />For additional information 687-878-2229<p><a website_redacted
100,"New York chic has reached a new level at 101 East 10th Street. Located on a prime corner in the East Village, steps from Union Square and Greenwich Village, it is sure to up the ante for coveted living spaces. 101 East 10th Street offers bespoke finishes, stainless appliances, marble bathrooms, ample closet space, and incredible natural light resulting in a rare genre of urban living.<br /><br />**Landlord is Offering 1 Month Free (12 month lease) - Advertised Rent is the Net Effective** <br /><br />Apartment Features:<br />• Built in A/C<br />• Brand New Renovations<br />• In Unit Laundry<br />• Stainless Steel Appliances<br />• Dishwasher<br />• Marble Bathrooms<br />• Oak Wood Flooring<br /> <br />Building Features:<br />• Part-time Doorman<br />• Virtual Doorman<br />• Brand New Common Area Renovations<br />• Renovated Hallways<br />• Elevator<br />• Unbeatable Union Square Location<br />• Steps to Whole Foods, Trader Joes & Westside Market<br /> <br />Contact leasing to schedule a viewing: kagglemanager@renthop.com"
1000,"Step into this fantastic new Construction in the heart of Williamsburg. This modern brand new apartment is the ultimate expression of modern living. The apartment boasts floor to ceiling windows, eco friendly bamboo flooring flowing from wall to wall and a video intercom. Considerable thought has gone into the kitchen design and details. White lacquer and frosted glass cabinetry and white quartz countertops provide a bright setting for storing and slicing, while a stainless steel appliance package including a wine chiller enhances serving dinner for two or hosting a larger crowd. Bathroom features white glass and grey ceramic tiles, a white custom oak vanity ,an over-sized soaking tub and separate, glass-enclosed steam shower, The apartment also has it's own storage space in the building and a private balcony! The building features a warm common back garden, a serene rooftop deck with views over the surrounding tree lined streetsYour new apartment is just steps away from everything that makes Williamsburg unique: emerging-designer boutiques, bars and chef-run restaurants. The city is just minutes away by L train at Bedford Avenue or Lorimer Street stops. Contact me today to set up your exclusive viewing at kagglemanager@renthop.com or 966-636-7822<br /><br /><br /><br /><br /><br /><p><a website_redacted"


In [6]:
# for desc in d.description.tolist()[:50]:
#     print desc
#     print '=' * 100
d.description.head(20).map(lambda s: sum([(not c.isalnum() and c!=" ") for c in s]))

id
0          49
1          16
10         43
100       105
1000       48
10000       0
100000     63
100001     45
100002     40
100003     96
100004     55
100005     36
100006     50
100007     78
100008      0
100009     39
10001      27
100010      8
100011      4
100012     66
Name: description, dtype: int64

# Basic Count Stats

In [7]:
def count_string_features(s):
    stats = {}
    
    stats['desc_num_breaks'] = s.count('<br />') + s.count('<br/>') + 0.0
    s = s.replace('<br />', ' ')
    s = s.replace('<br/>', ' ')
    
    stats['desc_num_paragraphs'] = s.count('<p>') + 0.0
    s = s.replace('<p>', ' ')
    
    stats['desc_num_nonalpha'] = sum([(not c.isalnum()) for c in s]) + 0.0
    
    stats['desc_num_parentheses'] = s.count('(') + s.count(')') + 0.0
    s = s.replace('(', ' ')
    s = s.replace(')', ' ')
    
    stats['desc_num_bangs'] = s.count('!') + 0.0
    s = s.replace('!', ' ')
    
    stats['desc_num_website_redacted'] = s.count('<a  website_redacted') + 0.0
    s = s.replace('<a  website_redacted', ' ')
    
    stats['desc_num_emails'] = s.count('kagglemanager@renthop.com') + 0.0
    s = s.replace('kagglemanager@renthop.com', ' ')
    
    phone_regex = '\d+(\s*-\s*\d+){1,4}'
    stats['desc_num_phones'] = len(re.findall(phone_regex, s)) + 0.0
    
    nr_of_non_alphanum_characters = sum([(not c.isalnum() and c!=" ") for c in s])
    nr_of_characters = len(s)
    stats['desc_count_non_alphanumeric_characters'] = nr_of_non_alphanum_characters + 0.0
    stats['num_characters'] = nr_of_characters + 0.0
    stats['ratio_non_alphanumeric_characters'] = (
        float(nr_of_non_alphanum_characters) /
        nr_of_characters if nr_of_characters != 0 else 0.0)
    s = re.sub(phone_regex, ' ', s)
    
    s = re.sub('\W', ' ', s)
    s = re.sub('\s+', ' ', s)
    stats['desc_num_words'] = len(s.split(' ')) + 0.0
    
    stats['desc_clean'] = s.lower()
    
    return pd.Series(stats)

d = d.description.apply(count_string_features)

In [8]:
d.dtypes

desc_clean                                 object
desc_count_non_alphanumeric_characters    float64
desc_num_bangs                            float64
desc_num_breaks                           float64
desc_num_emails                           float64
desc_num_nonalpha                         float64
desc_num_paragraphs                       float64
desc_num_parentheses                      float64
desc_num_phones                           float64
desc_num_website_redacted                 float64
desc_num_words                            float64
num_characters                            float64
ratio_non_alphanumeric_characters         float64
dtype: object

In [9]:
d.head().T

id,0,1,10,100,1000
desc_clean,large with awesome terrace accessible via bedroom and living room unique find in the les apartment features large terrace via bedroom and living room hardwood floors newly renovated granite counter top breakfast bar ample counter space and storage dishwasher great lighting neighborhood features a few blocks from whole foods 1 block from the j z and m subway all the restaurants and night life the lower east side is known for hotel chantel dl pianos call txt email james to set up a showing,prime soho between bleecker and houston newly renovated with stainless steel appliances hardwood floors marble bath convenient to world class restaurants shopping galleries subway c e b d f m to schedule a viewing call andrew today,a brand new 3 bedroom 1 5 bath apartmentenjoy these following apartment features as you rent here modern designed bathroom w a deep spa soaking tub room to room ac heat real oak hardwood floors rain forest shower head ss steel appliances w chef gas cook oven lg fridge washer dryer in the apt cable internet ready granite counter top kitchen w lot of cabinet storage spaceit s just a few blocks to l train don t miss out we have several great apartments in the immediate area for additional information,new york chic has reached a new level at 101 east 10th street located on a prime corner in the east village steps from union square and greenwich village it is sure to up the ante for coveted living spaces 101 east 10th street offers bespoke finishes stainless appliances marble bathrooms ample closet space and incredible natural light resulting in a rare genre of urban living landlord is offering 1 month free 12 month lease advertised rent is the net effective apartment features built in a c brand new renovations in unit laundry stainless steel appliances dishwasher marble bathrooms oak wood flooring building features part time doorman virtual doorman brand new common area renovations renovated hallways elevator unbeatable union square location steps to whole foods trader joes westside market contact leasing to schedule a viewing,step into this fantastic new construction in the heart of williamsburg this modern brand new apartment is the ultimate expression of modern living the apartment boasts floor to ceiling windows eco friendly bamboo flooring flowing from wall to wall and a video intercom considerable thought has gone into the kitchen design and details white lacquer and frosted glass cabinetry and white quartz countertops provide a bright setting for storing and slicing while a stainless steel appliance package including a wine chiller enhances serving dinner for two or hosting a larger crowd bathroom features white glass and grey ceramic tiles a white custom oak vanity an over sized soaking tub and separate glass enclosed steam shower the apartment also has it s own storage space in the building and a private balcony the building features a warm common back garden a serene rooftop deck with views over the surrounding tree lined streetsyour new apartment is just steps away from everything that makes williamsburg unique emerging designer boutiques bars and chef run restaurants the city is just minutes away by l train at bedford avenue or lorimer street stops contact me today to set up your exclusive viewing at or
desc_count_non_alphanumeric_characters,23,16,20,35,23
desc_num_bangs,0,0,1,0,1
desc_num_breaks,6,0,6,22,6
desc_num_emails,1,0,0,1,1
desc_num_nonalpha,107,50,118,371,232
desc_num_paragraphs,1,0,1,0,1
desc_num_parentheses,2,0,0,2,0
desc_num_phones,0,0,1,0,1
desc_num_website_redacted,1,0,1,0,1


In [10]:
l = len(d)
d['desc_num_words_q'] = 1.0 * d.desc_num_words.rank() / l

In [11]:
d[['desc_num_words', 'desc_num_words_q']].head(10)

Unnamed: 0_level_0,desc_num_words,desc_num_words_q
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,85.0,0.533352
1,38.0,0.188725
10,92.0,0.585565
100,137.0,0.827156
1000,199.0,0.947718
10000,2.0,0.052991
100000,187.0,0.933502
100001,77.0,0.470232
100002,98.0,0.628267
100003,40.0,0.20001


# Word Counts

In [12]:
NUM_COUNT_FEATURES = 50
cv = CountVectorizer(stop_words='english', max_features=NUM_COUNT_FEATURES)
cv.fit(d.desc_clean)
voc = {v: k for k, v in cv.vocabulary_.iteritems()}
dc = pd.DataFrame(cv.transform(d.desc_clean).todense())
dc.columns = [('desc_count_' + voc[i]) for i in range(NUM_COUNT_FEATURES)]
for c in dc.columns:
    d[c] = dc[c]

# Most Frequent Words

In [13]:
flat_words = [w for desc in d.desc_clean.tolist() for w in desc.split(' ')]
flat_word_counts = Counter(flat_words)
flat_word_counts.most_common(10)

[(u'and', 376446),
 (u'the', 303398),
 (u'a', 272015),
 (u'to', 196570),
 (u'in', 192782),
 (u'with', 175049),
 (u'of', 160566),
 (u'', 139531),
 (u'is', 116454),
 (u'this', 113211)]

In [14]:
#top_100_words = set([wc[0] for wc in flat_word_counts.most_common(100)])
top_500_words = set([wc[0] for wc in flat_word_counts.most_common(500)])
#top_2000_words = set([wc[0] for wc in flat_word_counts.most_common(2000)])

In [15]:
# d['desc_top_100_word_coverage'] = d.desc_clean.apply(
#     lambda ws: len(top_100_words.intersection(set(ws.split(' '))))/100.0)
d['desc_top_500_word_coverage'] = d.desc_clean.apply(
    lambda ws: len(top_500_words.intersection(set(ws.split(' '))))/500.0)
# d['desc_top_2000_word_coverage'] = d.desc_clean.apply(
#     lambda ws: len(top_2000_words.intersection(set(ws.split(' '))))/2000.0)

# d['desc_top_100_word_ratio'] = d.desc_clean.apply(
#     lambda ws:
#         len(top_100_words.intersection(set(ws.split(' '))))/
#         (1.0 * len(ws.split(' ')))
#         if len(ws.split(' ')) > 0 else 0)
d['desc_top_500_word_ratio'] = d.desc_clean.apply(
    lambda ws:
        len(top_500_words.intersection(set(ws.split(' '))))/
        (1.0 * len(ws.split(' ')))
        if len(ws.split(' ')) > 0 else 0)
# d['desc_top_2000_word_ratio'] = d.desc_clean.apply(
#     lambda ws:
#         len(top_2000_words.intersection(set(ws.split(' '))))/
#         (1.0 * len(ws.split(' ')))
#         if len(ws.split(' ')) > 0 else 0)

In [16]:
d.head(10).T

id,0,1,10,100,1000,10000,100000,100001,100002,100003
desc_clean,large with awesome terrace accessible via bedroom and living room unique find in the les apartment features large terrace via bedroom and living room hardwood floors newly renovated granite counter top breakfast bar ample counter space and storage dishwasher great lighting neighborhood features a few blocks from whole foods 1 block from the j z and m subway all the restaurants and night life the lower east side is known for hotel chantel dl pianos call txt email james to set up a showing,prime soho between bleecker and houston newly renovated with stainless steel appliances hardwood floors marble bath convenient to world class restaurants shopping galleries subway c e b d f m to schedule a viewing call andrew today,a brand new 3 bedroom 1 5 bath apartmentenjoy these following apartment features as you rent here modern designed bathroom w a deep spa soaking tub room to room ac heat real oak hardwood floors rain forest shower head ss steel appliances w chef gas cook oven lg fridge washer dryer in the apt cable internet ready granite counter top kitchen w lot of cabinet storage spaceit s just a few blocks to l train don t miss out we have several great apartments in the immediate area for additional information,new york chic has reached a new level at 101 east 10th street located on a prime corner in the east village steps from union square and greenwich village it is sure to up the ante for coveted living spaces 101 east 10th street offers bespoke finishes stainless appliances marble bathrooms ample closet space and incredible natural light resulting in a rare genre of urban living landlord is offering 1 month free 12 month lease advertised rent is the net effective apartment features built in a c brand new renovations in unit laundry stainless steel appliances dishwasher marble bathrooms oak wood flooring building features part time doorman virtual doorman brand new common area renovations renovated hallways elevator unbeatable union square location steps to whole foods trader joes westside market contact leasing to schedule a viewing,step into this fantastic new construction in the heart of williamsburg this modern brand new apartment is the ultimate expression of modern living the apartment boasts floor to ceiling windows eco friendly bamboo flooring flowing from wall to wall and a video intercom considerable thought has gone into the kitchen design and details white lacquer and frosted glass cabinetry and white quartz countertops provide a bright setting for storing and slicing while a stainless steel appliance package including a wine chiller enhances serving dinner for two or hosting a larger crowd bathroom features white glass and grey ceramic tiles a white custom oak vanity an over sized soaking tub and separate glass enclosed steam shower the apartment also has it s own storage space in the building and a private balcony the building features a warm common back garden a serene rooftop deck with views over the surrounding tree lined streetsyour new apartment is just steps away from everything that makes williamsburg unique emerging designer boutiques bars and chef run restaurants the city is just minutes away by l train at bedford avenue or lorimer street stops contact me today to set up your exclusive viewing at or,,take a stroll in central park enjoy the entertainment of time square go shopping on 5th avenue enjoy some of the city s best restaurants or simply stay at home and work out at the state of the art fitness center or relax in the steam room and get a massage the building features floor to ceiling and corner windows granite kitchen countertops cherry cabinetry stainless steel appliances baths with imported italian marble a 24 hour doorman and concierge attended parking valet and available maid service the club includes a cinema room for private screenings business center conference room and an exclusive residential lounge surrounded by a spacious and breathtaking fully landscaped 10 000 square foot sun deck please keep in mind that nyc has one of the most intense and busy real estate market in the country for that reason prices and availability change daily if not hourly that can be very stressful contact me and let my experience help you find the right place for you to call home you can contact me daily 9am 9pm at or 24 7 via email at,in a boutique prewar building with 24 hour doorman this apartment is the embodiment of elegance sophistication and space gorgeously renovated with granite and mosaic kitchen stainless still appliances complete with wine cooler the bathrooms are equally elegant with jacuzzi tubs and double sinks all with windows grand living room dining room 3 very large bedrooms the maid s room with a full bathroom of his own it s a pleasure showing this apartment call lia,why pay other brokers 15 for a great apartment when you can pay 7 5 for the same space with excellent customer service call me now flights up to this super renovated two bedroom offering light filled rooms open plan kitchen living room with stainless steel appliances strip wood flooring and customized closets building amenities live in super pets allowed heat included video intercom hot water includedif this apartment isn t exactly what you are looking for please give me a call i have many no fee low fee apartments all over nyc in our extensive database,spectacular renovated studio in restored townhouse tons of original historic details brand new renovation stainless steel appliances hardwood floors laundry in the building top location close to all your needs to schedule a viewing call text email ___________________________________________________________
desc_count_non_alphanumeric_characters,23,16,20,35,23,0,39,11,18,74
desc_num_bangs,0,0,1,0,1,0,0,0,0,0
desc_num_breaks,6,0,6,22,6,0,6,10,6,6
desc_num_emails,1,0,0,1,1,0,1,0,0,0
desc_num_nonalpha,107,50,118,371,232,8,223,100,128,130
desc_num_paragraphs,1,0,1,0,1,0,1,1,1,1
desc_num_parentheses,2,0,0,2,0,0,0,0,0,0
desc_num_phones,0,0,1,0,1,0,1,1,2,1
desc_num_website_redacted,1,0,1,0,1,0,1,1,1,1


# Save the Data

In [17]:
del d['desc_clean']

In [18]:
d.to_csv('tmp/features_description.csv')