# Data exploration - checking some initial thoughts and ideas, seeing what the data looks like

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the competition CSVs into DataFrames.
# The test set is loaded alongside the training set because a later cell
# slices `test` into per-group frames; leaving it unloaded makes that cell
# fail with a NameError on a fresh kernel.
# Paths are relative to the notebook's working directory.
TRAIN_CSV = 'data/training_set_VU_DM_2014.csv'
TEST_CSV = 'data/test_set_VU_DM_2014.csv'

# read_csv opens and closes the file itself when given a path, so no
# explicit open() wrapper is needed.
train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

In [3]:
# Show the column index of the training frame to see which fields exist.
column_index = train.columns
print(column_index)

Index(['srch_id', 'date_time', 'site_id', 'visitor_location_country_id',
       'visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_country_id',
       'prop_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'position', 'price_usd', 'promotion_flag',
       'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
       'srch_adults_count', 'srch_children_count', 'srch_room_count',
       'srch_saturday_night_bool', 'srch_query_affinity_score',
       'orig_destination_distance', 'random_bool', 'comp1_rate', 'comp1_inv',
       'comp1_rate_percent_diff', 'comp2_rate', 'comp2_inv',
       'comp2_rate_percent_diff', 'comp3_rate', 'comp3_inv',
       'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv',
       'comp4_rate_percent_diff', 'comp5_rate', 'comp5_inv',
       'comp5_rate_percent_diff', 'comp6_rate', 'comp6_inv',
       'comp6_rate_percent_diff', 'comp7_rate'

## User > search > search output > selection columns

In [None]:
# Column groups, following the heading: user > search > search output > selection.
# Every group begins with 'srch_id'.

# Visitor-level fields.
user_cols = [
    'srch_id',
    'visitor_location_country_id',
    'visitor_hist_starrating',
    'visitor_hist_adr_usd',
]

# Fields describing the search query itself.
search_cols = [
    'srch_id',
    'date_time',
    'site_id',
    'srch_destination_id',
    'srch_length_of_stay',
    'srch_booking_window',
    'srch_adults_count',
    'srch_children_count',
    'srch_room_count',
    'srch_saturday_night_bool',
    'srch_query_affinity_score',
]

# Fields of each returned result: the comp1..comp8 triples
# (rate, inv, rate_percent_diff, in that order) followed by the
# distance, random flag and property columns.
search_output_cols = (
    ['srch_id']
    + [
        'comp{}_{}'.format(comp_idx, suffix)
        for comp_idx in range(1, 9)
        for suffix in ('rate', 'inv', 'rate_percent_diff')
    ]
    + [
        'orig_destination_distance',
        'random_bool',
        'prop_country_id',
        'prop_id',
        'prop_starrating',
        'prop_review_score',
        'prop_brand_bool',
        'prop_location_score1',
        'prop_location_score2',
        'prop_log_historical_price',
        'price_usd',
        'promotion_flag',
    ]
)

# Click/booking outcome fields plus the displayed position.
selection_cols = [
    'srch_id',
    'click_bool',
    'gross_bookings_usd',
    'booking_bool',
    'position',
]

In [None]:
# Slice the training frame into one sub-frame per column group.
(user_train_df,
 search_train_df,
 search_output_train_df,
 selection_train_df) = (
    train[group]
    for group in (user_cols, search_cols, search_output_cols, selection_cols)
)

# Same split for the test frame; no selection frame is built for it.
# `test` must already exist as a DataFrame containing these columns.
(user_test_df,
 search_test_df,
 search_output_test_df) = (
    test[group]
    for group in (user_cols, search_cols, search_output_cols)
)

## Preprocessing steps

#### Impute missing values, normalise numerical values, check and correct any potential class imbalances

## 1. Impute missing values - handle NaN and one-hot encode the comp*_rate and comp*_inv variables

In [3]:
# for the comp1_rate, comp1_inv must one hot encode
list_of_onehots = ['comp1_rate', 'comp1_inv', 'comp2_rate', 'comp2_inv', \
                   'comp3_rate', 'comp3_inv', 'comp4_rate', 'comp4_inv', \
                   'comp5_rate', 'comp5_inv', 'comp6_rate', 'comp6_inv', \
                   'comp7_rate', 'comp7_inv', 'comp8_rate', 'comp8_inv',]

# first find another value for the NaN to be encoded by, chosen -2 here. 
train[list_of_onehots] = train[list_of_onehots].fillna(-2)

onehots = train[list_of_onehots].copy()

df_with_dummies = pd.get_dummies(onehots,columns=list_of_onehots)
train = train.drop(labels=list_of_onehots,axis=1)
train = pd.concat([train,df_with_dummies], axis=1)

# some null/nan values can be replaced with 0. These are listed below
nan_to_zeros_list = ['visitor_hist_starrating', 'visitor_hist_adr_usd', \
                     'prop_location_score1', 'prop_location_score2', \
                     'promotion_flag', 'srch_query_affinity_score', \
                     'comp1_rate_percent_diff', 'comp2_rate_percent_diff', \
                     'comp3_rate_percent_diff', 'comp4_rate_percent_diff', \
                     'comp5_rate_percent_diff', 'comp6_rate_percent_diff', \
                     'comp7_rate_percent_diff', 'comp8_rate_percent_diff', \
                     'gross_bookings_usd'] 

train[nan_to_zeros_list] = train[nan_to_zeros_list].fillna(0)
# orig_destination_distance has to go to -1 as 0 does not make sense
train.orig_destination_distance = train.orig_destination_distance.fillna(-1)
train.prop_review_score = train.prop_review_score.fillna(-1)