In [1]:
import numpy as np 
import pandas as pd 
import json
import re

In [2]:
# Load the pre-processed train and test listings, then preview the training rows.
train_df, test_df = (
    pd.read_json(path)
    for path in ('processed_train.json', 'processed_test.json')
)
train_df.head(30)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,...,des_capital,tele_count,web_count,email_count,building_id_exist,date,month,day_of_week,hour,interest_num
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,1466754864000,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],40.7145,7211212,-73.9425,...,0,1,0,0,True,2016-06-24,6,Friday,7,2
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,1465733967000,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",40.7947,7150865,-73.9667,...,0,0,0,0,True,2016-06-12,6,Sunday,12,3
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,1460863601000,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",40.7388,6887163,-74.0018,...,0,0,0,0,True,2016-04-17,4,Sunday,3,1
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,1460946122000,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",40.7539,6888711,-73.9677,...,1,0,0,0,True,2016-04-18,4,Monday,2,3
100013,1.0,4,0,1461807161000,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],40.8241,6934781,-73.9493,...,0,0,0,0,False,2016-04-28,4,Thursday,1,3
100014,2.0,4,38a913e46c94a7f46ddf19b756a9640c,1461039887000,,West 18th Street,[],40.7429,6894514,-74.0028,...,0,0,0,0,True,2016-04-19,4,Tuesday,4,2
100016,1.0,2,3ba49a93260ca5df92fde024cb4ca61f,1461727196000,Stunning unit with a great location and lots o...,West 107th Street,"[prewar, elevator, Dogs Allowed, Cats Allowed,...",40.8012,6930771,-73.966,...,0,2,0,2,True,2016-04-27,4,Wednesday,3,3
100020,2.0,1,0372927bcb6a0949613ef5bf893bbac7,1460527302000,"This huge sunny ,plenty of lights 1 bed/2 bath...",West 21st Street,"[Doorman, Elevator, Pre-War, Terrace, Laundry ...",40.7427,6867392,-73.9957,...,3,0,0,0,True,2016-04-13,4,Wednesday,6,3
100026,1.0,1,a7efbeb58190aa267b4a9121cd0c88c0,1461119795000,<p><a website_redacted,Hamilton Terrace,"[Cats Allowed, Dogs Allowed, Elevator, Laundry...",40.8234,6898799,-73.9457,...,0,0,0,0,True,2016-04-20,4,Wednesday,2,2
100027,2.0,4,0,1459565895000,This is a spacious four bedroom with every bed...,522 E 11th,"[Dishwasher, Hardwood Floors]",40.7278,6814332,-73.9808,...,0,0,1,0,False,2016-04-02,4,Saturday,2,3


In [3]:
def basic_clean(text):
    """Drop every character that is neither a word character nor a space, then lowercase.

    Word characters are those matched by the regex class ``\\w`` (this includes
    the underscore). Newlines, tabs and punctuation are removed outright, not
    replaced by a space.
    """
    return re.sub(r'[^\w ]+', '', text).lower()

In [4]:
from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance; stem_words calls wnl.lemmatize on every token.
wnl = WordNetLemmatizer()

def stem_words(text):
    """Lemmatize each token of ``text`` as a verb and then as a noun.

    The text is first passed through ``basic_clean`` and split on whitespace.
    Every resulting lemma is emitted with a single space in front of it, so a
    non-empty result always starts with a space; input without tokens yields ''.
    """
    tokens = basic_clean(text).split()
    # Verb pass first, noun pass on its result.
    lemmas = [wnl.lemmatize(wnl.lemmatize(token, 'v'), 'n') for token in tokens]
    return ''.join(' ' + lemma for lemma in lemmas)

In [5]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
# English stop-word list held as a set for the membership test in remove_stopwords.
stop_words = set(stopwords.words('english')) 

def remove_stopwords(text):
    """Lemmatize ``text`` via ``stem_words`` and drop tokens found in ``stop_words``.

    Kept tokens are each emitted with one leading space, so a non-empty result
    starts with a space; if nothing survives the filter the result is ''.
    """
    tokens = word_tokenize(stem_words(text))
    return ''.join(' ' + token for token in tokens if token not in stop_words)

In [6]:
def clean_desc(text):
    """Normalise a listing description into a lowercase, stop-word-free string.

    The text goes through ``remove_stopwords``, is lowercased and stripped, has
    slash-terminated words and tag-like ``<word`` fragments replaced by a space,
    and is returned with one leading space.
    """
    normalized = remove_stopwords(text).lower().strip()
    # NOTE(review): remove_stopwords -> stem_words -> basic_clean already removes
    # '<', '/' and '>' before this point, so this substitution appears to never
    # match on real input - confirm whether tag stripping was meant to run first.
    without_markup = re.sub(r'[\w]*/|<\w+ */*>*', ' ', normalized)
    return ' ' + without_markup

In [7]:
def clean_ft(feature):
    """Flatten a list of feature strings into one '|'-prefixed string.

    Each entry is cleaned with ``basic_clean``, stripped and lowercased, and
    written as ``'|' + entry``; e.g. two entries give ``'|first|second'``.
    An empty list yields ''.
    """
    if feature != []:
        return ''.join(
            '|' + basic_clean(entry).strip().lower()
            for entry in feature
        )
    return ''

In [8]:
# Clean the description and features columns of both frames.
# NOTE: the raw columns are overwritten in place, so this cell is not idempotent;
# re-running it would feed the already-cleaned strings back through the cleaners.
train_df['description'], test_df['description'] = (
    frame['description'].apply(clean_desc) for frame in (train_df, test_df)
)
train_df['features'], test_df['features'] = (
    frame['features'].apply(clean_ft) for frame in (train_df, test_df)
)
train_df

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,...,des_capital,tele_count,web_count,email_count,building_id_exist,date,month,day_of_week,hour,interest_num
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,1466754864000,brand new 3 bedroom 15 bath apartmentenjoy fo...,Metropolitan Avenue,,40.7145,7211212,-73.9425,...,0,1,0,0,True,2016-06-24,6,Friday,7,2
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,1465733967000,,Columbus Avenue,|doorman|elevator|fitness center|cats allowed|...,40.7947,7150865,-73.9667,...,0,0,0,0,True,2016-06-12,6,Sunday,12,3
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,1460863601000,top top west village location beautiful prewa...,W 13 Street,|laundry in building|dishwasher|hardwood floor...,40.7388,6887163,-74.0018,...,0,0,0,0,True,2016-04-17,4,Sunday,3,1
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,1460946122000,build amenity garage garden fitness room laun...,East 49th Street,|hardwood floors|no fee,40.7539,6888711,-73.9677,...,1,0,0,0,True,2016-04-18,4,Monday,2,3
100013,1.0,4,0,1461807161000,beautifully renovate 3 bedroom flex 4 bedroom...,West 143rd Street,|prewar,40.8241,6934781,-73.9493,...,0,0,0,0,False,2016-04-28,4,Thursday,1,3
100014,2.0,4,38a913e46c94a7f46ddf19b756a9640c,1461039887000,,West 18th Street,,40.7429,6894514,-74.0028,...,0,0,0,0,True,2016-04-19,4,Tuesday,4,2
100016,1.0,2,3ba49a93260ca5df92fde024cb4ca61f,1461727196000,stun unit great location lot natural light cl...,West 107th Street,|prewar|elevator|dogs allowed|cats allowed|low...,40.8012,6930771,-73.9660,...,0,2,0,2,True,2016-04-27,4,Wednesday,3,3
100020,2.0,1,0372927bcb6a0949613ef5bf893bbac7,1460527302000,huge sunny plenty light 1 bed2 bath offer bra...,West 21st Street,|doorman|elevator|prewar|terrace|laundry in un...,40.7427,6867392,-73.9957,...,3,0,0,0,True,2016-04-13,4,Wednesday,6,3
100026,1.0,1,a7efbeb58190aa267b4a9121cd0c88c0,1461119795000,pa website_redacted,Hamilton Terrace,|cats allowed|dogs allowed|elevator|laundry in...,40.8234,6898799,-73.9457,...,0,0,0,0,True,2016-04-20,4,Wednesday,2,2
100027,2.0,4,0,1459565895000,spacious four bedroom every bedroom able fit ...,522 E 11th,|dishwasher|hardwood floors,40.7278,6814332,-73.9808,...,0,0,1,0,False,2016-04-02,4,Saturday,2,3


In [9]:
from collections import Counter
from operator import itemgetter

def popular_featrues(feature_column, top_n=50):
    """Return the most frequent feature names in a column of '|'-joined strings.

    Args:
        feature_column: iterable of strings in which individual features are
            separated by '|' (a leading '|' is allowed, as is an empty string).
        top_n: maximum number of feature names to return (default 50).

    Returns:
        A list of at most ``top_n`` feature names ordered by descending count;
        names with equal counts keep the order in which they were first seen.
        Empty tokens (from a leading '|', '||' or an empty row) are never
        counted as features.
    """
    # Count tokens row by row instead of gluing every row into one big string:
    # rows stay separate whether or not they start with '|'.
    counter = Counter(
        token
        for text in feature_column
        for token in text.split('|')
        if token
    )
    return [feature for feature, _ in counter.most_common(top_n)]
# Most frequent feature names, derived from the training frame only.
popular_features = popular_featrues(train_df['features'].values)

In [10]:
# For each listing, count how many of its '|'-separated features are in popular_features.
train_df["pop_feature_counts"], test_df["pop_feature_counts"] = (
    frame["features"].apply(
        lambda features: sum(
            feature in popular_features for feature in features.split('|')
        )
    )
    for frame in (train_df, test_df)
)

In [11]:
# Per-manager counts of training listings at each interest level.
# NOTE(review): these rates come from the training labels and are mapped back
# onto train_df afterwards - confirm that this target leakage is acceptable.
managers = train_df.manager_id.value_counts()
manager_df = train_df.groupby(['manager_id','interest_level']).manager_id.count().unstack('interest_level').fillna(0)
# Total listings per manager; taken before the rate columns exist so that only
# the per-level counts are summed.
manager_df['sum'] = manager_df.sum(axis=1)
manager_df['high_rate'] = manager_df['high']/manager_df['sum']
manager_df['medium_rate'] = manager_df['medium']/manager_df['sum']
manager_df['low_rate'] = manager_df['low']/manager_df['sum']

# Minimum listing count for a manager's rates to be used: the median count
# (the author's note records this as 4 on the data the notebook was built with).
ulimit = np.percentile(manager_df['sum'], 50)

# Managers with enough listings whose share of one interest level is dominant.
high_manager = manager_df[(manager_df['sum']>=ulimit) & (manager_df['high_rate']>0.5)].sort_values(by='high_rate',ascending=False)
medium_manager = manager_df[(manager_df['sum']>=ulimit) & (manager_df['medium_rate']>0.6)].sort_values(by='medium_rate',ascending=False)
low_manager = manager_df[(manager_df['sum']>=ulimit) & (manager_df['low_rate']>0.7)].sort_values(by='low_rate',ascending=False)

In [12]:
def manager_type(manager_id):
    """Map a manager id to a category code based on the manager groups.

    Returns:
        1 if the id is in ``high_manager.index``,
        2 if it is in ``medium_manager.index``,
        3 if it is in ``low_manager.index``,
        4 otherwise.
    """
    # Early returns keep one category from overwriting another; the groups are
    # checked in the order high, medium, low.
    if manager_id in high_manager.index:
        return 1
    if manager_id in medium_manager.index:
        return 2
    if manager_id in low_manager.index:
        return 3
    return 4
# Attach the manager category code to both frames.
train_df["manager_type"], test_df["manager_type"] = (
    frame["manager_id"].apply(manager_type) for frame in (train_df, test_df)
)

In [3]:
# Per-building counts of training listings at each interest level.
# NOTE(review): rows displayed with building_id 0 have building_id_exist False;
# they are grouped here like any other building - confirm that this is intended.
buildings = train_df.building_id.value_counts()
building_df = train_df.groupby(['building_id', 'interest_level']).building_id.count().unstack('interest_level').fillna(0)
# Total listings per building; taken before the rate columns exist so that only
# the per-level counts are summed.
building_df['sum'] = building_df.sum(axis=1)
building_df['high_rate'] = building_df['high']/building_df['sum']
building_df['medium_rate'] = building_df['medium']/building_df['sum']
building_df['low_rate'] = building_df['low']/building_df['sum']

# Minimum listing count for a building's rates to be used: the 70th percentile
# (the author's note records this as 3 on the data the notebook was built with).
ulimit = np.percentile(building_df['sum'], 70)

# Buildings with enough listings whose share of one interest level is dominant.
high_building = building_df[(building_df['sum']>=ulimit) & (building_df['high_rate']>0.6)].sort_values(by='high_rate',ascending=False)
medium_building = building_df[(building_df['sum']>=ulimit) & (building_df['medium_rate']>0.6)].sort_values(by='medium_rate',ascending=False)
low_building = building_df[(building_df['sum']>=ulimit) & (building_df['low_rate']>0.7)].sort_values(by='low_rate',ascending=False)

In [4]:
def building_type(building_id):
    """Map a building id to a category code based on the building groups.

    Returns:
        1 if the id is in ``high_building.index``,
        2 if it is in ``medium_building.index``,
        3 if it is in ``low_building.index``,
        4 otherwise.
    """
    # Early returns keep one category from overwriting another; the groups are
    # checked in the order high, medium, low.
    if building_id in high_building.index:
        return 1
    if building_id in medium_building.index:
        return 2
    if building_id in low_building.index:
        return 3
    return 4
# Attach the building category code to both frames.
train_df["building_type"], test_df["building_type"] = (
    frame["building_id"].apply(building_type) for frame in (train_df, test_df)
)

In [15]:
# Write the cleaned train and test frames to JSON files.
train_df.to_json('clean_train.json')
test_df.to_json('clean_test.json')