In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image
import glob
import os

# Preprocessing text of test dataset for Classifier
<b> Note: this dataset does NOT contain the interest_level column (the target variable). </b>

# Load Dataset

In [2]:
# Location of the zipped JSON file holding the test listings.
testDataDir = "dataset/two-sigma-connect-rental-listing-inquiries/test.json.zip"

# Read the listings, parse 'created' as datetimes, and discard the index that
# came with the file in favour of a fresh 0..n-1 positional index.
testData = pd.read_json(testDataDir, convert_dates=['created']).reset_index(drop=True)
testData.head(5)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,manager_id,photos,price,street_address
0,1.0,1,79780be1514f645d7e6be99a3de696c5,2016-06-11 05:29:41,Large with awesome terrace--accessible via bed...,Suffolk Street,"[Elevator, Laundry in Building, Laundry in Uni...",40.7185,7142618,-73.9865,b1b1852c416d78d7765d746cb1b8921f,[https://photos.renthop.com/2/7142618_1c45a2c8...,2950,99 Suffolk Street
1,1.0,2,0,2016-06-24 06:36:34,Prime Soho - between Bleecker and Houston - Ne...,Thompson Street,"[Pre-War, Dogs Allowed, Cats Allowed]",40.7278,7210040,-74.0,d0b5648017832b2427eeb9956d966a14,[https://photos.renthop.com/2/7210040_d824cc71...,2850,176 Thompson Street
2,1.0,0,0,2016-06-17 01:23:39,Spacious studio in Prime Location. Cleanbuildi...,Sullivan Street,"[Pre-War, Dogs Allowed, Cats Allowed]",40.726,7174566,-74.0026,e6472c7237327dd3903b3d6f6a94515a,[https://photos.renthop.com/2/7174566_ba3a35c5...,2295,115 Sullivan Street
3,1.0,2,f9c826104b91d868e69bd25746448c0c,2016-06-21 05:06:02,For immediate access call Bryan.<br /><br />Bo...,Jones Street,"[Hardwood Floors, Dogs Allowed, Cats Allowed]",40.7321,7191391,-74.0028,41735645e0f8f13993c42894023f8e58,[https://photos.renthop.com/2/7191391_8c2f2d49...,2900,23 Jones Street
4,1.0,1,81062936e12ee5fa6cd2b965698e17d5,2016-06-16 07:24:27,Beautiful TRUE 1 bedroom in a luxury building ...,Exchange Place,"[Roof Deck, Doorman, Elevator, Fitness Center,...",40.7054,7171695,-74.0095,a742cf7dd3b2627d83417bc3a1b3ec96,[https://photos.renthop.com/2/7171695_089ffee2...,3254,20 Exchange Place


In [3]:
# Number of photos attached to each listing; a missing or empty photo
# collection counts as 0.
testData["num_photos"] = testData["photos"].apply(lambda photos: len(photos) if photos else 0)

# Text Data

### Feature Extraction

In [4]:
# Size features taken from the raw description, before any text cleaning.
# The word count splits on a single space character, so consecutive spaces
# produce empty tokens that are counted, and an empty description counts as 1.
# Values are passed through str() first, so non-string entries do not raise.
testData['desc_word_count'] = testData['description'].map(
    lambda text: len(str(text).split(" "))
)
# Character length of the description as stored.
testData['desc_char_count'] = testData['description'].str.len()

testData[['desc_word_count', 'desc_char_count']]

Unnamed: 0,desc_word_count,desc_char_count
0,78,587
1,35,245
2,39,268
3,22,146
4,85,564
...,...,...
74654,45,258
74655,111,701
74656,141,866
74657,100,816


In [5]:
def avg_word(sentence):
    """Return the mean length, in characters, of the words in ``sentence``.

    Words are the whitespace-separated tokens produced by ``str.split()``,
    so leading/trailing whitespace and runs of whitespace are ignored.

    Parameters
    ----------
    sentence : str or None
        Text to measure. Non-string values (e.g. ``None`` or a float NaN
        standing in for a missing description) are treated as having no words.

    Returns
    -------
    float
        Average word length, or ``0.0`` when there are no words.
    """
    # A missing description is not a string; without this guard the call to
    # .split() below would raise AttributeError.
    if not isinstance(sentence, str):
        return 0.0

    words = sentence.split()
    if not words:
        # Empty or whitespace-only text: avoid dividing by zero.
        return 0.0

    return sum(len(word) for word in words) / len(words)

# Mean word length of each raw description; avg_word is applied to each value directly.
testData['desc_avg_word_length'] = testData['description'].apply(avg_word)
testData[['desc_avg_word_length']]

Unnamed: 0,desc_avg_word_length
0,6.710526
1,6.205882
2,6.052632
3,6.250000
4,5.783133
...,...
74654,4.478261
74655,5.523364
74656,5.185714
74657,7.170000


In [6]:
def count_non_alpha_num(str):
    """Count the characters of ``str`` that are neither alphabetic nor a space.

    Only the literal space character ``" "`` is exempt, so other whitespace
    (tabs, newlines, carriage returns) is counted, as are digits and
    punctuation.

    NOTE(review): despite the function name, digits ARE counted here because
    the check is ``isalpha()`` rather than ``isalnum()`` - confirm this is
    intended before changing it. The parameter also shadows the built-in
    ``str`` inside this function; it is kept for interface compatibility.
    """
    special_total = 0
    for ch in str:
        if ch.isalpha() or ch == " ":
            continue
        special_total += 1
    return special_total

# Per-description count of characters that are neither letters nor spaces.
testData['desc_special_char_count'] = testData['description'].apply(count_non_alpha_num)
testData[['description', 'desc_special_char_count']]

Unnamed: 0,description,desc_special_char_count
0,Large with awesome terrace--accessible via bed...,50
1,Prime Soho - between Bleecker and Houston - Ne...,16
2,Spacious studio in Prime Location. Cleanbuildi...,7
3,For immediate access call Bryan.<br /><br />Bo...,12
4,Beautiful TRUE 1 bedroom in a luxury building ...,41
...,...,...
74654,BRAND NEW TO MARKET 1BDR \r107TH AND LEXINGTON...,21
74655,Convertible 2BR apartment features a brand new...,85
74656,"Let's get you in to see this $2,400/mo, recent...",36
74657,CooperCooper.com :: Web ID #171357; Access 100...,101


In [7]:
# Number of whitespace-separated tokens consisting only of digits
# (a token such as "2,400" or "1BDR" is therefore not counted).
testData['desc_num_count'] = testData['description'].apply(
    lambda text: sum(1 for token in text.split() if token.isdigit())
)
# Number of whitespace-separated tokens for which str.isupper() is true.
testData['desc_upper_count'] = testData['description'].apply(
    lambda text: sum(1 for token in text.split() if token.isupper())
)
testData[['description', 'desc_num_count', 'desc_upper_count']]

Unnamed: 0,description,desc_num_count,desc_upper_count
0,Large with awesome terrace--accessible via bed...,0,4
1,Prime Soho - between Bleecker and Houston - Ne...,0,1
2,Spacious studio in Prime Location. Cleanbuildi...,0,0
3,For immediate access call Bryan.<br /><br />Bo...,0,0
4,Beautiful TRUE 1 bedroom in a luxury building ...,1,5
...,...,...,...
74654,BRAND NEW TO MARKET 1BDR \r107TH AND LEXINGTON...,1,44
74655,Convertible 2BR apartment features a brand new...,0,1
74656,"Let's get you in to see this $2,400/mo, recent...",0,1
74657,CooperCooper.com :: Web ID #171357; Access 100...,0,16


### Data Preprocessing

#### Convert to lower case and remove punctuation

In [8]:
# Lower-case every whitespace-separated token and re-join with single spaces,
# which also collapses runs of whitespace (including \r and \n).
testData['description'] = testData['description'].apply(
    lambda text: " ".join(word.lower() for word in text.split())
)
# Delete every character that is neither a word character nor whitespace.
# regex=True must be explicit: newer pandas releases default to a literal
# (non-regex) match, which would leave all punctuation in place. The raw
# string keeps \w and \s from being treated as invalid escape sequences.
testData['description'] = testData['description'].str.replace(r'[^\w\s]', '', regex=True)

testData[['description']]

Unnamed: 0,description
0,large with awesome terraceaccessible via bedro...
1,prime soho between bleecker and houston newl...
2,spacious studio in prime location cleanbuildin...
3,for immediate access call bryanbr br bond new ...
4,beautiful true 1 bedroom in a luxury building ...
...,...
74654,brand new to market 1bdr 107th and lexington s...
74655,convertible 2br apartment features a brand new...
74656,lets get you in to see this 2400mo recently re...
74657,coopercoopercom web id 171357 access 1000s of...


#### Remove common and rare words

In [9]:
# Corpus-wide token frequencies, most frequent first.
word_counts = pd.Series(' '.join(testData['description']).split()).value_counts()

# The ten first and ten last entries of the frequency table.
# NOTE(review): the tail of the table is typically full of tokens with equal
# counts, so which ten end up "rare" depends on value_counts' tie ordering.
freq = word_counts.iloc[:10]
rare = word_counts.iloc[-10:]
freq_list = freq.index.tolist()
rare_list = rare.index.tolist()

# Keep only the tokens that appear in neither list.
testData['description'] = testData['description'].apply(
    lambda text: " ".join(
        word for word in text.split()
        if not (word in freq_list or word in rare_list)
    )
)

print(freq_list)
print(rare_list)
testData[['description']]

['and', 'br', 'the', 'a', 'to', 'in', 'with', 'of', 'is', 'this']
['vibing', '2178662', 'bathroombrwhite', 'neocontemporary', 'entertainer', 'onmurray', 'monfribr', '642934', 'bathroomkitchenetchigh', 'convenience__i']


Unnamed: 0,description
0,large awesome terraceaccessible via bedroom li...
1,prime soho between bleecker houston newly reno...
2,spacious studio prime location cleanbuilding h...
3,for immediate access call bryanbr bond new yor...
4,beautiful true 1 bedroom luxury building finan...
...,...
74654,brand new market 1bdr 107th lexington seconds ...
74655,convertible 2br apartment features brand new m...
74656,lets get you see 2400mo recently renovated spa...
74657,coopercoopercom web id 171357 access 1000s uni...


#### Remove stop words

In [10]:
# import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
#display(stopwords.words('english'))

In [11]:
# English stop words from NLTK (the 'stopwords' corpus must already be
# downloaded; see the commented-out nltk.download call in the import cell).
stop_list = stopwords.words('english')
# Membership is tested once per token of every description, so use a set for
# constant-time lookups instead of scanning the list each time.
stop_words = set(stop_list)
testData['description'] = testData['description'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_words)
)

testData[['description']]

Unnamed: 0,description
0,large awesome terraceaccessible via bedroom li...
1,prime soho bleecker houston newly renovated st...
2,spacious studio prime location cleanbuilding h...
3,immediate access call bryanbr bond new york re...
4,beautiful true 1 bedroom luxury building finan...
...,...
74654,brand new market 1bdr 107th lexington seconds ...
74655,convertible 2br apartment features brand new m...
74656,lets get see 2400mo recently renovated spaciou...
74657,coopercoopercom web id 171357 access 1000s uni...


In [12]:
# Preview the first five rows (head() defaults to n=5) with the new text features.
testData.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,...,photos,price,street_address,num_photos,desc_word_count,desc_char_count,desc_avg_word_length,desc_special_char_count,desc_num_count,desc_upper_count
0,1.0,1,79780be1514f645d7e6be99a3de696c5,2016-06-11 05:29:41,large awesome terraceaccessible via bedroom li...,Suffolk Street,"[Elevator, Laundry in Building, Laundry in Uni...",40.7185,7142618,-73.9865,...,[https://photos.renthop.com/2/7142618_1c45a2c8...,2950,99 Suffolk Street,8,78,587,6.710526,50,0,4
1,1.0,2,0,2016-06-24 06:36:34,prime soho bleecker houston newly renovated st...,Thompson Street,"[Pre-War, Dogs Allowed, Cats Allowed]",40.7278,7210040,-74.0,...,[https://photos.renthop.com/2/7210040_d824cc71...,2850,176 Thompson Street,3,35,245,6.205882,16,0,1
2,1.0,0,0,2016-06-17 01:23:39,spacious studio prime location cleanbuilding h...,Sullivan Street,"[Pre-War, Dogs Allowed, Cats Allowed]",40.726,7174566,-74.0026,...,[https://photos.renthop.com/2/7174566_ba3a35c5...,2295,115 Sullivan Street,1,39,268,6.052632,7,0,0
3,1.0,2,f9c826104b91d868e69bd25746448c0c,2016-06-21 05:06:02,immediate access call bryanbr bond new york re...,Jones Street,"[Hardwood Floors, Dogs Allowed, Cats Allowed]",40.7321,7191391,-74.0028,...,[https://photos.renthop.com/2/7191391_8c2f2d49...,2900,23 Jones Street,4,22,146,6.25,12,0,0
4,1.0,1,81062936e12ee5fa6cd2b965698e17d5,2016-06-16 07:24:27,beautiful true 1 bedroom luxury building finan...,Exchange Place,"[Roof Deck, Doorman, Elevator, Fitness Center,...",40.7054,7171695,-74.0095,...,[https://photos.renthop.com/2/7171695_089ffee2...,3254,20 Exchange Place,6,85,564,5.783133,41,1,5


#### Count number of features for each listing

In [13]:
# Length of each listing's 'features' collection.
testData['num_features'] = testData['features'].apply(len)

#### Extract year, month, and day into their own columns

In [14]:
# Break the 'created' timestamp into separate calendar-component columns,
# each read from the matching attribute of the .dt accessor.
for date_part in ('year', 'month', 'day'):
    testData[date_part] = getattr(testData['created'].dt, date_part)

In [15]:
# Display the processed frame with all engineered columns; the notebook
# rendering is truncated to the first/last rows and columns.
testData

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,...,desc_word_count,desc_char_count,desc_avg_word_length,desc_special_char_count,desc_num_count,desc_upper_count,num_features,year,month,day
0,1.0,1,79780be1514f645d7e6be99a3de696c5,2016-06-11 05:29:41,large awesome terraceaccessible via bedroom li...,Suffolk Street,"[Elevator, Laundry in Building, Laundry in Uni...",40.7185,7142618,-73.9865,...,78,587,6.710526,50,0,4,6,2016,6,11
1,1.0,2,0,2016-06-24 06:36:34,prime soho bleecker houston newly renovated st...,Thompson Street,"[Pre-War, Dogs Allowed, Cats Allowed]",40.7278,7210040,-74.0000,...,35,245,6.205882,16,0,1,3,2016,6,24
2,1.0,0,0,2016-06-17 01:23:39,spacious studio prime location cleanbuilding h...,Sullivan Street,"[Pre-War, Dogs Allowed, Cats Allowed]",40.7260,7174566,-74.0026,...,39,268,6.052632,7,0,0,3,2016,6,17
3,1.0,2,f9c826104b91d868e69bd25746448c0c,2016-06-21 05:06:02,immediate access call bryanbr bond new york re...,Jones Street,"[Hardwood Floors, Dogs Allowed, Cats Allowed]",40.7321,7191391,-74.0028,...,22,146,6.250000,12,0,0,3,2016,6,21
4,1.0,1,81062936e12ee5fa6cd2b965698e17d5,2016-06-16 07:24:27,beautiful true 1 bedroom luxury building finan...,Exchange Place,"[Roof Deck, Doorman, Elevator, Fitness Center,...",40.7054,7171695,-74.0095,...,85,564,5.783133,41,1,5,10,2016,6,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74654,1.0,1,bd863d28a6b119ac3bc72d5f27b07f24,2016-04-26 16:09:55,brand new market 1bdr 107th lexington seconds ...,150 EAST 107TH STREET,[],40.7925,6928108,-73.9454,...,45,258,4.478261,21,1,44,0,2016,4,26
74655,1.0,2,9174b75c0cd978eb0e5aa93afbad754b,2016-04-21 05:06:19,convertible 2br apartment features brand new m...,E 33rd St.,"[Doorman, Elevator, Laundry in Building, Dishw...",40.7456,6906674,-73.9797,...,111,701,5.523364,85,0,1,8,2016,4,21
74656,1.0,0,0,2016-04-20 01:31:52,lets get see 2400mo recently renovated spaciou...,Lexington Avenue,"[Dogs Allowed, Cats Allowed]",40.7416,6897967,-73.9829,...,141,866,5.185714,36,0,1,2,2016,4,20
74657,2.0,2,c90c010e5505365676538e64d02aa1e0,2016-04-08 02:26:45,coopercoopercom web id 171357 access 1000s uni...,Park Avenue,"[Doorman, Elevator, Cats Allowed, Dogs Allowed]",40.7485,6842183,-73.9800,...,100,816,7.170000,101,0,16,4,2016,4,8


In [16]:
# Destination for the processed test set, written next to the raw input file.
testDataTextExtract = "dataset/two-sigma-connect-rental-listing-inquiries/testTextExtract.json.zip"

# Serialise the frame to JSON at that path using to_json's default settings.
testData.to_json(path_or_buf=testDataTextExtract)