## Impute missing values of session attributes based on label

In [8]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split

In [2]:
# Pin NumPy's global RNG so the random draws made by np.random.choice during
# imputation are repeatable from a fresh kernel.
np.random.seed(seed=1)

In [3]:
# Load the two input tables: `data` is the frame whose columns are imputed below,
# `y` supplies the country_destination labels used to condition the imputation.
data, y = [
    pd.read_csv(csv_path)
    for csv_path in ("./data/df_all_inc_sess.csv", "./data/y.csv")
]

In [4]:
# drop last 15 samples as they're caused by a bug (https://www.kaggle.com/c/airbnb-recruiting-new-user-bookings/forums/t/17737/announcement-competition-will-restart-shortly)
#
# Rows [0, 213466) of `data` are the ones handled here; rows from 213466 onward are
# re-attached unchanged after imputation. Of the first 213466 rows, the trailing 15
# are discarded, leaving 213451.
N_LEADING_ROWS = 213466
N_BUGGY_ROWS = 15
N_CLEAN_ROWS = N_LEADING_ROWS - N_BUGGY_ROWS  # 213451

# One positional slice instead of two chained ones. The explicit copy makes X an
# independent frame: it is written to with .loc further down, and writing into a
# slice of `data` is the classic SettingWithCopy hazard.
X = data.iloc[:N_CLEAN_ROWS].copy()
y = y.iloc[:N_CLEAN_ROWS]

In [5]:
# Sanity check: features and labels must have the same number of rows.
# Parenthesised single-argument print produces the same output on Python 2 and
# Python 3, whereas the bare print statement is a SyntaxError on Python 3.
print(X.shape)
print(y.shape)

(213451, 771)
(213451, 1)


In [6]:
# Attach the labels to the features so rows can be selected by destination during
# imputation (the loops below filter on X.country_destination).
# NOTE(review): attribute-style assignment only writes a DataFrame column when X
# already has a 'country_destination' column; otherwise it merely sets an attribute
# on this particular X object. Which case applies depends on the CSV schema -- confirm.
X.country_destination = y.country_destination

In [9]:
# Hold out 30% of the rows as a test set. random_state pins the shuffle so the same
# rows land in the training part on every run.
split_parts = train_test_split(X, y, test_size=0.3, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [11]:
# Attach the training labels to the training features; the train-only imputation
# further down filters X_train on X_train.country_destination.
# NOTE(review): attribute-style assignment only writes a DataFrame column when
# X_train already has a 'country_destination' column; otherwise it merely sets an
# attribute on this X_train object -- confirm which case applies.
X_train.country_destination = y_train.country_destination

## IMPUTE W ALL DATA AVAILABLE (ALTERNATIVE TO THE TRAIN-ONLY SECTION BELOW: BOTH FILL `X` IN PLACE, SO RUN ONLY ONE OF THEM)

In [48]:
# Impute w all data available.
#
# For every column in positions [221, 771) of X and every destination country, each
# missing value is replaced by a random draw from the values observed in that column
# for rows of X with the same country_destination, weighted by their frequency.
#
# NOTE: this cell fills X in place. If it runs before the train-only cell further
# down, that cell finds no missing values left in X, and the CSV written at the end
# of the notebook is then imputed from all rows rather than from the training split.
countries = ['FR', 'NL', 'PT', 'CA', 'DE', 'IT', 'US', 'NDF', 'other', 'AU', 'GB', 'ES']

# index: 221; col_name: action_lookup
# index: 767; col_name: num_sessions_24_hrs
FIRST_IMPUTED_COL = 221
LAST_IMPUTED_COL_EXCLUSIVE = 771
cols_to_impute = list(X.columns[FIRST_IMPUTED_COL:LAST_IMPUTED_COL_EXCLUSIVE])

# The labels do not change while imputing, so build each country's row mask once
# instead of once per (column, country) pair.
country_masks = [(country, X.country_destination == country) for country in countries]

for position, col_name in enumerate(cols_to_impute, 1):
    # The total is taken from the column list itself so the counter can never
    # overshoot (the old hard-coded 768-221 produced "548/547").
    print("Processing %s (%d/%d)" % (col_name, position, len(cols_to_impute)))

    # Country masks are disjoint, so filling one country's rows never changes which
    # rows are missing for another country; one null mask per column is enough.
    is_missing = X[col_name].isnull()

    for country, in_country in country_masks:
        rows_to_fill = in_country & is_missing
        n_to_fill = int(rows_to_fill.sum())
        if n_to_fill == 0:
            continue

        country_value_counts = X.loc[in_country & ~is_missing, col_name].value_counts()
        if len(country_value_counts) == 0:
            # No observed value for this country/column: there is nothing to sample
            # from (np.random.choice would raise), so the NaNs are left in place.
            continue

        country_values = list(country_value_counts.index)
        # Relative frequencies; the total is computed once, not once per value.
        country_value_probas = country_value_counts.values / float(country_value_counts.values.sum())

        X.loc[rows_to_fill, col_name] = np.random.choice(country_values, n_to_fill, p=country_value_probas)


Processing device_type_Blackberry (538/547)
Processing device_type_iPodtouch (539/547)
Processing device_type_Windows Phone (540/547)
Processing device_type_Opera Phone (541/547)
Processing num_sessions_1_hrs (542/547)
Processing num_sessions_2_hrs (543/547)
Processing num_sessions_4_hrs (544/547)
Processing num_sessions_8_hrs (545/547)
Processing num_sessions_16_hrs (546/547)
Processing num_sessions_24_hrs (547/547)
Processing secs_elapsed (548/547)
Processing hit_count (549/547)
Processing avg_secs_elapsed (550/547)


## IMPUTE W TRAIN SET ONLY (DON'T CHEAT / OVERFIT!)

In [12]:
# Impute w train set only.
#
# For every column in positions [221, 771) of X and every destination country, each
# missing value in X is replaced by a random draw from the values observed in that
# column for rows of X_train with the same country_destination, weighted by their
# frequency. Only the sampling distribution is restricted to the training split;
# the rows being filled are all rows of X.
#
# NOTE(review): rows of X that were held out into X_test are still filled according
# to their own true label (the row selection uses X.country_destination), so the
# label still influences the held-out features.
countries = ['FR', 'NL', 'PT', 'CA', 'DE', 'IT', 'US', 'NDF', 'other', 'AU', 'GB', 'ES']

# index: 221; col_name: action_lookup
# index: 767; col_name: num_sessions_24_hrs
FIRST_IMPUTED_COL = 221
LAST_IMPUTED_COL_EXCLUSIVE = 771
cols_to_impute = list(X.columns[FIRST_IMPUTED_COL:LAST_IMPUTED_COL_EXCLUSIVE])

# The labels do not change while imputing, so build the per-country row masks for
# both frames once instead of once per (column, country) pair.
country_masks = [
    (country, X.country_destination == country, X_train.country_destination == country)
    for country in countries
]

for position, col_name in enumerate(cols_to_impute, 1):
    # The total is taken from the column list itself so the counter can never
    # overshoot (the old hard-coded 768-221 produced "x/547" past 547).
    print("Processing %s (%d/%d)" % (col_name, position, len(cols_to_impute)))

    # Country masks are disjoint, so filling one country's rows never changes which
    # rows are missing for another country; one null mask per column is enough.
    is_missing = X[col_name].isnull()
    # X_train is not modified by this loop, so its observed-value mask is stable.
    train_observed = X_train[col_name].notnull()

    for country, in_country, in_country_train in country_masks:
        rows_to_fill = in_country & is_missing
        n_to_fill = int(rows_to_fill.sum())
        if n_to_fill == 0:
            continue

        country_value_counts = X_train.loc[in_country_train & train_observed, col_name].value_counts()
        if len(country_value_counts) == 0:
            # The training split has no observed value for this country/column:
            # there is nothing to sample from (np.random.choice would raise), so the
            # NaNs are left in place.
            continue

        country_values = list(country_value_counts.index)
        # Relative frequencies; the total is computed once, not once per value.
        country_value_probas = country_value_counts.values / float(country_value_counts.values.sum())

        X.loc[rows_to_fill, col_name] = np.random.choice(country_values, n_to_fill, p=country_value_probas)


Processing action_lookup (1/547)
Processing action_search_results (2/547)
Processing action_personalize (3/547)
Processing action_index (4/547)
Processing action_similar_listings (5/547)
Processing action_ajax_refresh_subtotal (6/547)
Processing action_show (7/547)
Processing action_header_userpic (8/547)
Processing action_ask_question (9/547)
Processing action_ (10/547)
Processing action_other_hosting_reviews_first (11/547)
Processing action_hosting_social_proof (12/547)
Processing action_decision_tree (13/547)
Processing action_recent_reservations (14/547)
Processing action_faq_experiment_ids (15/547)
Processing action_multi (16/547)
Processing action_active (17/547)
Processing action_dashboard (18/547)
Processing action_create (19/547)
Processing action_confirm_email (20/547)
Processing action_show_personalize (21/547)
Processing action_verify (22/547)
Processing action_pending (23/547)
Processing action_requested (24/547)
Processing action_concierge (25/547)
Processing action_faq (

In [13]:
# Stack the imputed rows (X) on top of the rows of `data` from position 213466
# onward, which were never part of X, and renumber the index from 0.
# pd.concat is used because DataFrame.append is deprecated and no longer exists in
# pandas 2.x; for two frames with ignore_index=True the result is the same.
def_all_inc_sess_imputed = pd.concat([X, data[213466:]], ignore_index=True)


In [14]:
# Row and column count of the combined frame, as a quick check before it is saved.
def_all_inc_sess_imputed.shape

(275547, 771)

In [15]:
# Persist the combined frame; index=False keeps the renumbered row index out of the CSV.
imputed_csv_path = './data/df_all_inc_sess_imputed_w_train_data_only.csv'
def_all_inc_sess_imputed.to_csv(imputed_csv_path, index=False)

In [62]:
# Save the truncated labels: 213451 rows, i.e. without the last 15 buggy samples,
# matching the number of rows kept in X.
y.to_csv(path_or_buf="./data/y_2.csv", index=False)