In [1]:
import base64
import collections
import datetime
import itertools
import json
import os
import pickle
import random
import re
import sys
import time

import matplotlib as mpl
import matplotlib.pyplot as plt
#import nltk
import numpy as np
import pandas as pd
#import PIL
import pylab
import scipy
import seaborn as sns
import sklearn
from sklearn import *
import statsmodels as sm
#import xgboost as xgb

np.random.seed(1337)

%matplotlib inline

sns.set(font_scale=1.0)
mpl.rcParams['figure.figsize'] = 12, 8
sns.set_style('whitegrid')
sns.set_palette(sns.color_palette('muted'))



In [17]:
tr = pd.read_csv('tmp/combined_train.csv', index_col='id')
trl = pd.get_dummies(tr.interest_level)
del tr['interest_level']

In [18]:
NUM_FEATURES = 30

# Univariate Selection

In [4]:
uni_features = {}
for t in ['low', 'medium', 'high']:
    test = sklearn.feature_selection.SelectKBest(
                score_func=sklearn.feature_selection.chi2,
                k=NUM_FEATURES)
    fit = test.fit(tr.fillna(0) + 10000, trl[t])
    uni_features[t] = list(itertools.compress(tr.columns, fit.get_support()))

In [5]:
f = uni_features
print len(set(f['low']) |
          set(f['medium']) |
          set(f['high']))
print len(set(f['low']) &
          set(f['medium']) &
          set(f['high']))

31
28


In [6]:
set(f['low']) | set(f['medium']) | set(f['high'])

{'area_density',
 'bathrooms_per_bedrooms',
 'building_code',
 'cluster_id',
 'cluster_size',
 'created_day_of_month',
 'created_distance_from_peak',
 'created_hour',
 'created_hour_of_dataset',
 'created_minute_of_the_day',
 'desc_count_non_alphanumeric_characters',
 'desc_num_bangs',
 'desc_num_breaks',
 'desc_num_nonalpha',
 'desc_num_words',
 'hours_since_managers_last_listing',
 'manager_code',
 'num_apts_at_manager',
 'num_apts_in_building',
 'num_apts_next_2days',
 'num_apts_next_3hours',
 'num_apts_prev_2days',
 'num_apts_prev_3hours',
 'num_apts_same_2days',
 'num_apts_same_3hours',
 'num_apts_with_approx_same_price',
 'num_apts_with_same_daddr',
 'num_apts_with_same_saddr',
 'num_characters',
 'price',
 'price_per_bedrooms'}

In [7]:
set(f['low']) & set(f['medium']) & set(f['high'])

{'area_density',
 'bathrooms_per_bedrooms',
 'building_code',
 'cluster_id',
 'cluster_size',
 'created_distance_from_peak',
 'created_hour',
 'created_hour_of_dataset',
 'created_minute_of_the_day',
 'desc_count_non_alphanumeric_characters',
 'desc_num_bangs',
 'desc_num_nonalpha',
 'desc_num_words',
 'hours_since_managers_last_listing',
 'manager_code',
 'num_apts_at_manager',
 'num_apts_in_building',
 'num_apts_next_3hours',
 'num_apts_prev_2days',
 'num_apts_prev_3hours',
 'num_apts_same_2days',
 'num_apts_same_3hours',
 'num_apts_with_approx_same_price',
 'num_apts_with_same_daddr',
 'num_apts_with_same_saddr',
 'num_characters',
 'price',
 'price_per_bedrooms'}

# Feature Importance

In [8]:
fim_features = {}
for t in ['low', 'medium', 'high']:
    m = ensemble.ExtraTreesClassifier(
        n_estimators=100,
        max_depth=5,
        max_features=50,
        n_jobs=3)
    m.fit(tr.fillna(0), trl[t])
    fi = zip(m.feature_importances_, tr.columns)
    fi.sort(key=lambda x: x[0], reverse=True)
    fim_features[t] = [p[1] for p in fi[:NUM_FEATURES]]

In [9]:
f = fim_features
print len(set(f['low']) |
          set(f['medium']) |
          set(f['high']))
print len(set(f['low']) &
          set(f['medium']) &
          set(f['high']))

39
20


In [10]:
set(f['low']) | set(f['medium']) | set(f['high'])

{'area_density',
 'bathrooms_per_bedrooms',
 'bedrooms',
 'building_code',
 'building_id_is_zero',
 'cluster_id',
 'cluster_size',
 'created_distance_from_peak',
 'created_hour',
 'created_minute_of_the_day',
 'daddr_has_number',
 'daddr_saddr_same',
 'desc_num_breaks',
 'desc_num_emails',
 'desc_num_nonalpha',
 'desc_num_paragraphs',
 'desc_num_phones',
 'desc_num_website_redacted',
 'desc_num_words',
 'desc_num_words_q',
 'desc_top_500_word_coverage',
 'has_photos',
 'num_apts_at_manager_q',
 'num_apts_in_building',
 'num_apts_next_3hours',
 'num_apts_prev_3hours',
 'num_apts_same_3hours',
 'num_apts_with_approx_same_price',
 'num_apts_with_same_daddr',
 'num_apts_with_same_saddr',
 'num_characters',
 'num_photos',
 'price',
 'price_br_q',
 'price_clust_br_q',
 'price_per_bedrooms',
 'rooms_diff',
 'saddr_has_number',
 'top_10_feature_coverage'}

In [11]:
set(f['low']) & set(f['medium']) & set(f['high'])

{'building_code',
 'building_id_is_zero',
 'cluster_size',
 'created_distance_from_peak',
 'created_hour',
 'created_minute_of_the_day',
 'daddr_saddr_same',
 'desc_num_breaks',
 'desc_num_paragraphs',
 'desc_num_website_redacted',
 'has_photos',
 'num_apts_in_building',
 'num_apts_next_3hours',
 'num_apts_same_3hours',
 'num_apts_with_approx_same_price',
 'price',
 'price_br_q',
 'price_clust_br_q',
 'rooms_diff',
 'saddr_has_number'}

# Select from Model: Linear SVC

In [12]:
sfmsvc_features = {}
for t in ['low', 'medium', 'high']:
    m = sklearn.svm.LinearSVC(C=0.01, penalty='l1', dual=False)
    m.fit(tr.fillna(0), trl[t])
    m = sklearn.feature_selection.SelectFromModel(m, prefit=True)
    sfmsvc_features[t] = list(itertools.compress(tr.columns, m.get_support()))

In [13]:
f = sfmsvc_features
print len(set(f['low']) |
          set(f['medium']) |
          set(f['high']))
print len(set(f['low']) &
          set(f['medium']) &
          set(f['high']))

108
45


# Select from Model: LASSO

In [14]:
set(f['low']) & set(f['medium']) & set(f['high'])

{'area_density',
 'bathrooms_per_bedrooms',
 'bedrooms',
 'cluster_id',
 'cluster_size',
 'created_day_of_month',
 'created_distance_from_peak',
 'created_dow',
 'created_minute',
 'created_minute_of_the_day',
 'daddr_has_number',
 'daddr_has_ordinal',
 'daddr_num_upper_words',
 'daddr_num_words',
 'daddr_saddr_same',
 'desc_count_apartment',
 'desc_count_beautiful',
 'desc_count_bedroom',
 'desc_count_li',
 'desc_count_new',
 'desc_count_non_alphanumeric_characters',
 'desc_count_space',
 'desc_num_bangs',
 'desc_num_nonalpha',
 'desc_num_parentheses',
 'desc_num_phones',
 'desc_num_website_redacted',
 'desc_num_words',
 'distance_cc',
 'features_count_balcony',
 'features_count_high_speed_internet',
 'has_photos',
 'hours_since_managers_last_listing',
 'latitude',
 'longitude',
 'num_apts_in_building',
 'num_apts_next_3hours',
 'num_apts_same_3hours',
 'num_apts_with_same_daddr',
 'num_apts_with_same_saddr',
 'num_features',
 'price',
 'price_br_q',
 'price_clust_br_q',
 'rooms_sum'}

In [15]:
lasso_features = {}
for t in ['low', 'medium', 'high']:
    m = sklearn.linear_model.LassoCV()
    m = sklearn.feature_selection.SelectFromModel(m, threshold=0.05)
    m.fit(tr.fillna(0), trl[t])
    n_features = m.transform(tr.fillna(0)).shape[1]
    while n_features > 30:
        m.threshold += 0.1
        tr_transform = m.transform(tr.fillna(0))
        n_features = tr_transform.shape[1]
    lasso_features[t] = list(itertools.compress(tr.columns, m.get_support()))



In [16]:
f = lasso_features
print len(set(f['low']) |
          set(f['medium']) |
          set(f['high']))
print len(set(f['low']) &
          set(f['medium']) &
          set(f['high']))

0
0


# Recursive Feature Elimination

In [17]:
rfe_features = {}
for t in ['low', 'medium', 'high']:
    m = sklearn.linear_model.LogisticRegression()
    rfe = sklearn.feature_selection.RFE(
            estimator=m,
            n_features_to_select=NUM_FEATURES,
            step=0.03)
    fit = rfe.fit(tr.fillna(0), trl[t])
    rfe_features[t] = list(itertools.compress(tr.columns, fit.get_support()))

In [18]:
f = rfe_features
print len(set(f['low']) |
          set(f['medium']) |
          set(f['high']))
print len(set(f['low']) &
          set(f['medium']) &
          set(f['high']))

54
8


In [19]:
set(f['low']) | set(f['medium']) | set(f['high'])

{'bathrooms',
 'daddr_count_10th',
 'daddr_count_gold',
 'daddr_count_street',
 'daddr_has_number',
 'daddr_has_ordinal',
 'daddr_num_upper_words',
 'daddr_num_words',
 'daddr_saddr_same',
 'desc_count_beautiful',
 'desc_count_center',
 'desc_count_hardwood',
 'desc_count_private',
 'desc_count_space',
 'desc_num_emails',
 'desc_num_paragraphs',
 'desc_num_phones',
 'desc_num_website_redacted',
 'desc_num_words_q',
 'desc_top_500_word_coverage',
 'desc_top_500_word_ratio',
 'distance_cc',
 'features_count_balcony',
 'features_count_cats_allowed',
 'features_count_dogs_allowed',
 'features_count_high_ceilings',
 'features_count_high_speed_internet',
 'features_count_live_in_super',
 'features_count_loft',
 'features_count_lowrise',
 'features_count_no_fee',
 'features_count_parking_space',
 'features_count_simplex',
 'features_count_swimming_pool',
 'has_photos',
 'latitude',
 'longitude',
 'num_apts_at_manager_q',
 'num_features',
 'photos_per_room',
 'price_br_q',
 'price_clust_br_q',

In [None]:
set(f['low']) & set(f['medium']) & set(f['high'])

{'daddr_has_number',
 'daddr_saddr_same',
 'desc_num_website_redacted',
 'desc_num_words_q',
 'has_photos',
 'price_br_q',
 'price_clust_br_q',
 'top_10_feature_coverage'}

# RFE wit Cross Validation

In [19]:
rfecv_features = {}
for t in ['low', 'medium', 'high']:
    m = sklearn.linear_model.LogisticRegression()
    rfe = sklearn.feature_selection.RFECV(
            estimator=m,
            n_jobs=3)
    fit = rfe.fit(tr.fillna(0), trl[t])
    rfecv_features[t] = list(itertools.compress(tr.columns, fit.get_support()))

In [20]:
f = rfecv_features
print len(set(f['low']) |
          set(f['medium']) |
          set(f['high']))
print len(set(f['low']) &
          set(f['medium']) &
          set(f['high']))

199
2


In [21]:
set(tr.columns).difference(set(f['low']) | set(f['medium']) | set(f['high']))

{'area_distance_com',
 'building_id_is_zero',
 'daddr_count_place',
 'daddr_count_west',
 'desc_count_doorman',
 'features_count_dryer_in_unit',
 'features_count_multilevel',
 'features_count_publicoutdoor',
 'saddr_count_25',
 'saddr_count_place',
 'saddr_count_wall',
 'saddr_count_washington'}

In [22]:
set(f['low']) & set(f['medium']) & set(f['high'])

{'has_photos', 'price_clust_br_q'}

In [23]:
for t in ['low', 'medium', 'high']:
    print len(f[t])

113
2
198


In [25]:
len(set(f['low']) & set(f['high']))

112

# Save Selected Features for Later Use

In [26]:
tr = pd.read_csv('tmp/combined_train.csv', index_col='id')
te = pd.read_csv('tmp/combined_test.csv', index_col='id')

In [27]:
features = list(set(f['low']) | set(f['medium']) | set(f['high']))
features.append('interest_level')

In [33]:
f['all'] = features

In [28]:
for t in ['low', 'medium', 'high']:
    f[t].append('interest_level')

In [34]:
for t in ['low', 'medium', 'high', 'all']:
    tr[f[t]].to_csv('tmp/combined_train_selected_{0}.csv'.format(t))
    te[f[t]].to_csv('tmp/combined_test_selected_{0}.csv'.format(t))