In [57]:
import base64
import datetime
import json
import os
import random
import re
import time

import scipy
from scipy import ndimage
import numpy as np
import pandas as pd
import sklearn
import matplotlib as mpl
import matplotlib.pyplot as plt
from pylab import rcParams
import seaborn as sns
import statsmodels as sm
from pylab import rcParams
from pylab import *
from matplotlib.dates import date2num , DateFormatter
from PIL import Image

np.random.seed(1337)

%matplotlib inline
sns.set(font_scale=1.0)
rcParams['figure.figsize'] = 8, 6
sns.set_style('whitegrid')
sns.set_palette(sns.color_palette('muted'))
pd.options.display.max_colwidth = 10000

In [47]:
from collections import Counter

# Load the Data

In [48]:
d = pd.read_json('tmp/raw_data.json')
d.index.name = 'id'
d = d[['description']]
print len(d)

124011


In [49]:
d.dtypes

description    object
dtype: object

In [55]:
raw_data = pd.read_json('tmp/raw_data.json')

In [58]:
raw_data.description

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

# Some Examples

In [50]:
d.head()

Unnamed: 0_level_0,description
id,Unnamed: 1_level_1
0,Large with awesome terrace--accessible via bed...
1,Prime Soho - between Bleecker and Houston - Ne...
10,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...
100,New York chic has reached a new level ...
1000,Step into this fantastic new Construction in t...


In [51]:
# for desc in d.description.tolist()[:50]:
#     print desc
#     print '=' * 100
d.description.head(20).map(lambda s: sum([(not c.isalnum() and c!=" ") for c in s]))

id
0          49
1          16
10         43
100       105
1000       48
10000       0
100000     63
100001     45
100002     40
100003     96
100004     55
100005     36
100006     50
100007     78
100008      0
100009     39
10001      27
100010      8
100011      4
100012     66
Name: description, dtype: int64

# Basic Count Stats

In [53]:
def count_string_features(s):
    stats = {}
    
    stats['desc_num_breaks'] = s.count('<br />') + s.count('<br/>')
    s = s.replace('<br />', ' ')
    s = s.replace('<br/>', ' ')
    
    stats['desc_num_paragraphs'] = s.count('<p>')
    s = s.replace('<p>', ' ')
    
    stats['desc_num_website_redacted'] = s.count('<a  website_redacted')
    s = s.replace('<a  website_redacted', ' ')
    
    stats['desc_num_emails'] = s.count('kagglemanager@renthop.com')
    s = s.replace('kagglemanager@renthop.com', ' ')
    
    phone_regex = '\d+(\s*-\s*\d+){1,4}'
    stats['desc_num_phones'] = len(re.findall(phone_regex, s))
    
    nr_of_non_alphanum_characters = sum([(not c.isalnum() and c!=" ") for c in s])
    nr_of_characters = len(s)
    stats['count_non_alphanumeric_characters'] = nr_of_non_alphanum_characters
    stats['num_characters'] = nr_of_characters
    stats['ratio_non_alphanumeric_characters'] = float(nr_of_non_alphanum_characters) / nr_of_characters if nr_of_characters != 0 else 0
    s = re.sub(phone_regex, ' ', s)
    
    s = re.sub('\W', ' ', s)
    s = re.sub('\s+', ' ', s)
    stats['desc_num_words'] = len(s.split(' '))
    
    stats['desc_clean'] = s.lower()
    
    return pd.Series(stats)

d = d.description.apply(count_string_features)

In [54]:
d.head()

Unnamed: 0_level_0,count_non_alphanumeric_characters,desc_clean,desc_num_breaks,desc_num_emails,desc_num_paragraphs,desc_num_phones,desc_num_website_redacted,desc_num_words,num_characters,ratio_non_alphanumeric_characters
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,25.0,large with awesome terrace accessible via bedr...,6,1,1,0,1,85,512,0.048828
1,16.0,prime soho between bleecker and houston newly ...,0,0,0,0,0,38,245,0.065306
10,21.0,a brand new 3 bedroom 1 5 bath apartmentenjoy ...,6,0,1,1,1,92,537,0.039106
100,37.0,new york chic has reached a new level at 101 ...,22,1,0,0,0,137,1077,0.034355
1000,24.0,step into this fantastic new construction in t...,6,1,1,1,1,199,1252,0.019169


In [20]:
noise = np.random.randn(len(d))
d['desc_num_words_q100'] = pd.qcut((d.desc_num_words + 0.1*noise), 100, [0.01 * i for i in range(0, 100)])

# Most Frequent Words

In [21]:
flat_words = [w for desc in d.desc_clean.tolist() for w in desc.split(' ')]
flat_word_counts = Counter(flat_words)
flat_word_counts.most_common(10)

[(u'and', 376446),
 (u'the', 303398),
 (u'a', 272015),
 (u'to', 196570),
 (u'in', 192782),
 (u'with', 175049),
 (u'of', 160566),
 (u'', 139531),
 (u'is', 116454),
 (u'this', 113211)]

In [22]:
top_100_words = set([wc[0] for wc in flat_word_counts.most_common(100)])
top_500_words = set([wc[0] for wc in flat_word_counts.most_common(500)])
top_2000_words = set([wc[0] for wc in flat_word_counts.most_common(2000)])

In [23]:
d['desc_top_100_word_coverage'] = d.desc_clean.apply(
    lambda ws: len(top_100_words.intersection(set(ws.split(' '))))/100.0)
d['desc_top_500_word_coverage'] = d.desc_clean.apply(
    lambda ws: len(top_500_words.intersection(set(ws.split(' '))))/500.0)
d['desc_top_2000_word_coverage'] = d.desc_clean.apply(
    lambda ws: len(top_2000_words.intersection(set(ws.split(' '))))/2000.0)

d['desc_top_100_word_ratio'] = d.desc_clean.apply(
    lambda ws:
        len(top_100_words.intersection(set(ws.split(' '))))/
        (1.0 * len(ws.split(' ')))
        if len(ws.split(' ')) > 0 else 0)
d['desc_top_500_word_ratio'] = d.desc_clean.apply(
    lambda ws:
        len(top_500_words.intersection(set(ws.split(' '))))/
        (1.0 * len(ws.split(' ')))
        if len(ws.split(' ')) > 0 else 0)
d['desc_top_2000_word_ratio'] = d.desc_clean.apply(
    lambda ws:
        len(top_2000_words.intersection(set(ws.split(' '))))/
        (1.0 * len(ws.split(' ')))
        if len(ws.split(' ')) > 0 else 0)

In [24]:
d.head(20).T

id,0,1,10,100,1000,10000,100000,100001,100002,100003,100004,100005,100006,100007,100008,100009,10001,100010,100011,100012
desc_clean,large with awesome terrace accessible via bedr...,prime soho between bleecker and houston newly ...,a brand new 3 bedroom 1 5 bath apartmentenjoy ...,new york chic has reached a new level at 101 ...,step into this fantastic new construction in t...,,take a stroll in central park enjoy the enter...,in a boutique prewar building with 24 hour doo...,why pay other brokers 15 for a great apartment...,spectacular renovated studio in restored town...,top top west village location beautiful pre wa...,beautiful 2br apartment with cherry hardwood f...,hardwood floors exposed brick large living spa...,building amenities garage garden fitness room ...,,midtown neighborhood convenient to grand centr...,newly renovated no fee apt in gramercy with wa...,great value for this 3 bedroom in this area ap...,amazing true three bedroom located in the low ...,naturally illuminated trough large windows wit...
desc_num_breaks,6,0,6,22,6,0.0,6,10,6,6,10,6,14,18,0.0,6,0,0,0,6
desc_num_emails,1,0,0,1,1,0.0,1,0,0,0,0,0,0,0,0.0,1,0,0,0,0
desc_num_paragraphs,1,0,1,0,1,0.0,1,1,1,1,1,1,1,1,0.0,1,0,0,0,1
desc_num_phones,0,0,1,0,1,0.0,1,1,2,1,0,0,0,0,0.0,1,0,0,0,0
desc_num_website_redacted,1,0,1,0,1,0.0,1,1,1,1,1,1,1,1,0.0,1,0,0,0,1
desc_num_words,85,38,92,137,199,2.0,187,77,98,40,103,43,43,58,2.0,61,110,39,19,206
desc_num_words_q100,0.53,0.18,0.58,0.82,0.94,0.03,0.93,0.47,0.63,0.19,0.66,0.21,0.21,0.32,0.07,0.34,0.69,0.19,0.1,0.95
desc_top_100_word_coverage,0.29,0.15,0.3,0.41,0.39,0.01,0.45,0.31,0.33,0.23,0.46,0.19,0.24,0.3,0.01,0.2,0.42,0.2,0.1,0.43
desc_top_500_word_coverage,0.106,0.06,0.12,0.138,0.176,0.002,0.198,0.078,0.13,0.062,0.16,0.066,0.08,0.102,0.002,0.078,0.142,0.054,0.032,0.226


# Save the Data

In [26]:
d.to_csv('tmp/features_description.csv')