In [70]:
import pandas as pd
import time
import datetime
import lightgbm as lgb

In [160]:
# Load the training set from a CSV file (path is relative to the working directory).
train = pd.read_csv("training_set_VU_DM.csv")

In [161]:
# Load the test set from a CSV file (path is relative to the working directory).
test = pd.read_csv("test_set_VU_DM.csv")

# Data Processing

## Feature engineering part 1:
- date_time
    - convert to unix
    - convert to timestamp format
    - day of week
    - weekend
    - month
- visitor_hist_starrating
    - no_visitor_starrating (when null)
- visitor_hist_adr_usd
    - no_visitor_price (when null)
- prop_starrating
    - not_available_star (when null)
    - no_star (when 0)
- prop_review_score
    - not_available_review (when null)
    - no_review (when 0)
- prop_log_historical_price
    - not_available_price (when 0)
    
...

<ins>Ideas (not implemented)</ins>
- available_competitors (bool)
- available_competitors_count


In [108]:
# Keep a reference to the column index of the training frame as it is at this point.
cols = train.columns

In [109]:
# Display the captured column index.
cols

Index(['srch_id', 'date_time', 'site_id', 'visitor_location_country_id',
       'visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_country_id',
       'prop_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'position', 'price_usd', 'promotion_flag',
       'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
       'srch_adults_count', 'srch_children_count', 'srch_room_count',
       'srch_saturday_night_bool', 'srch_query_affinity_score',
       'orig_destination_distance', 'random_bool', 'comp1_rate', 'comp1_inv',
       'comp1_rate_percent_diff', 'comp2_rate', 'comp2_inv',
       'comp2_rate_percent_diff', 'comp3_rate', 'comp3_inv',
       'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv',
       'comp4_rate_percent_diff', 'comp5_rate', 'comp5_inv',
       'comp5_rate_percent_diff', 'comp6_rate', 'comp6_inv',
       'comp6_rate_percent_diff', 'comp7_rate'

In [162]:
# Parse the raw `date_time` strings into a proper datetime column named
# `datetime` (appended as the last column) and discard the raw column.
train = (
    train
    .assign(datetime=pd.to_datetime(train["date_time"]))
    .drop(columns=["date_time"])
)

In [163]:
# Same conversion for the test set: parsed `datetime` column in, raw
# `date_time` column out.
test = (
    test
    .assign(datetime=pd.to_datetime(test["date_time"]))
    .drop(columns=["date_time"])
)

In [164]:
# Seconds since the Unix epoch, as a float.
# Dividing the offset from 1970-01-01 by a one-second Timedelta does not depend
# on the column's datetime resolution or on the platform's default integer
# width, unlike `astype(int) / 10**9`.
# Assumes the timestamps are timezone-naive - TODO confirm.
train["datetime_unix"] = (train["datetime"] - pd.Timestamp("1970-01-01")) / pd.Timedelta(seconds=1)

In [165]:
# Seconds since the Unix epoch, as a float.
# Dividing the offset from 1970-01-01 by a one-second Timedelta does not depend
# on the column's datetime resolution or on the platform's default integer
# width, unlike `astype(int) / 10**9`.
# Assumes the timestamps are timezone-naive - TODO confirm.
test["datetime_unix"] = (test["datetime"] - pd.Timestamp("1970-01-01")) / pd.Timedelta(seconds=1)

In [166]:
# Day of the week (pandas convention: Monday=0 ... Sunday=6).
# The vectorised `.dt` accessor avoids calling a Python lambda once per row.
train["weekday"] = train["datetime"].dt.dayofweek

In [167]:
# Day of the week (pandas convention: Monday=0 ... Sunday=6).
# The vectorised `.dt` accessor avoids calling a Python lambda once per row.
test["weekday"] = test["datetime"].dt.dayofweek

In [168]:
# 1 when the weekday index is above 4 (i.e. 5 or 6), otherwise 0.
train["weekend"] = train["weekday"].gt(4).astype(int)

In [169]:
# 1 when the weekday index is above 4 (i.e. 5 or 6), otherwise 0.
test["weekend"] = test["weekday"].gt(4).astype(int)

In [170]:
# Calendar month (1-12) of the search timestamp.
# The vectorised `.dt` accessor avoids calling a Python lambda once per row.
train["month"] = train["datetime"].dt.month

In [171]:
# Calendar month (1-12) of the search timestamp.
# The vectorised `.dt` accessor avoids calling a Python lambda once per row.
test["month"] = test["datetime"].dt.month

In [172]:
# The parsed timestamp has been expanded into derived columns; remove it.
train = train.drop(columns=["datetime"])

In [173]:
# The parsed timestamp has been expanded into derived columns; remove it.
test = test.drop(columns=["datetime"])

In [174]:
# Register the new calendar columns as multi-valued categorical features.
# The list is not defined anywhere earlier in this notebook, so start from an
# empty list when it does not exist yet (otherwise a fresh kernel raises
# NameError here).
try:
    multi_categorical_features
except NameError:
    multi_categorical_features = []

# Only add names that are missing, so re-running the cell does not create duplicates.
multi_categorical_features = multi_categorical_features + [
    feature for feature in ("weekday", "month")
    if feature not in multi_categorical_features
]

In [175]:
# Indicator columns (0/1) marking missing or zero values in the training frame.
# Each entry is (new flag column, source column, sentinel); a sentinel of None
# means "flag nulls", any other sentinel means "flag values equal to it".
train_flag_specs = [
    ("no_visitor_starrating", "visitor_hist_starrating", None),
    ("no_visitor_price", "visitor_hist_adr_usd", None),
    ("prop_not_available_star", "prop_starrating", None),
    ("prop_no_star", "prop_starrating", 0),
    ("prop_not_available_review", "prop_review_score", None),
    ("prop_no_review", "prop_review_score", 0),
    ("prop_not_available_price", "prop_log_historical_price", 0),
]
for flag_name, source_name, sentinel in train_flag_specs:
    source_values = train[source_name]
    if sentinel is None:
        flagged = source_values.isna()
    else:
        flagged = source_values.eq(sentinel)
    train[flag_name] = flagged.astype(int)

In [176]:
# Indicator columns (0/1) marking missing or zero values in the test frame.
# Each entry is (new flag column, source column, sentinel); a sentinel of None
# means "flag nulls", any other sentinel means "flag values equal to it".
test_flag_specs = [
    ("no_visitor_starrating", "visitor_hist_starrating", None),
    ("no_visitor_price", "visitor_hist_adr_usd", None),
    ("prop_not_available_star", "prop_starrating", None),
    ("prop_no_star", "prop_starrating", 0),
    ("prop_not_available_review", "prop_review_score", None),
    ("prop_no_review", "prop_review_score", 0),
    ("prop_not_available_price", "prop_log_historical_price", 0),
]
for flag_name, source_name, sentinel in test_flag_specs:
    source_values = test[source_name]
    if sentinel is None:
        flagged = source_values.isna()
    else:
        flagged = source_values.eq(sentinel)
    test[flag_name] = flagged.astype(int)

## Dropping columns:
srch_query_affinity_score, gross_bookings_usd

- at least temporarily:
comp1_rate_percent_diff, comp2_rate_percent_diff, comp3_rate_percent_diff, comp4_rate_percent_diff
comp5_rate_percent_diff, comp6_rate_percent_diff, comp7_rate_percent_diff, comp8_rate_percent_diff


In [177]:
# Remove the affinity score from both frames; `gross_bookings_usd` is dropped
# from the training frame only.
train = train.drop(columns=["srch_query_affinity_score", "gross_bookings_usd"])
test = test.drop(columns=["srch_query_affinity_score"])


In [178]:
# Drop the competitor price-difference columns (comp1 ... comp8) from both frames.
percent_diff_columns = [f"comp{idx}_rate_percent_diff" for idx in range(1, 9)]
train = train.drop(columns=percent_diff_columns)
test = test.drop(columns=percent_diff_columns)

In [183]:
# Create independent copies of both frames.
# A plain assignment (`df_train = train`) only binds a second name to the same
# object, so in-place edits made through one name would show up under the other.
df_train = train.copy()
df_test = test.copy()

In [181]:
# NOTE: `cols` was captured from train.columns before the date_time conversion
# and the column drops, so this displays the original column list rather than
# the columns of the current frames.
cols

Index(['srch_id', 'date_time', 'site_id', 'visitor_location_country_id',
       'visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_country_id',
       'prop_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'position', 'price_usd', 'promotion_flag',
       'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
       'srch_adults_count', 'srch_children_count', 'srch_room_count',
       'srch_saturday_night_bool', 'srch_query_affinity_score',
       'orig_destination_distance', 'random_bool', 'comp1_rate', 'comp1_inv',
       'comp1_rate_percent_diff', 'comp2_rate', 'comp2_inv',
       'comp2_rate_percent_diff', 'comp3_rate', 'comp3_inv',
       'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv',
       'comp4_rate_percent_diff', 'comp5_rate', 'comp5_inv',
       'comp5_rate_percent_diff', 'comp6_rate', 'comp6_inv',
       'comp6_rate_percent_diff', 'comp7_rate'

In [205]:
# Remove the origin-destination distance column from both working frames.
df_train = df_train.drop(columns=["orig_destination_distance"])
df_test = df_test.drop(columns=["orig_destination_distance"])

## Filling in missing values
- comp_rate -> +1 when null because then it is an advantage against the competitor
- comp_inv -> +1 when null because then it is an advantage against the competitor
- user starrating, visitor_hist_adr_usd -> mean
- prop review, location score 2 -> mean

<ins>Ideas (not implemented)</ins>
- orig_destination_distance by looking at other users
- prop review, star = knn/mean region, price
- user rating, usd -> fill in random normal with mean column

In [212]:
# Replace missing competitor rate / inventory values (comp1 ... comp8) with 1
# in both working frames.
competitor_columns = [
    f"comp{idx}_{suffix}"
    for idx in range(1, 9)
    for suffix in ("rate", "inv")
]
for frame in (df_train, df_test):
    for column in competitor_columns:
        frame[column] = frame[column].fillna(1)

In [227]:
# Fill the remaining gaps in these columns with the column mean of the same frame.
# The filled Series is assigned back to the frame: calling
# `fillna(..., inplace=True)` on a column selection is chained in-place
# modification, which pandas deprecates and which leaves the frame unchanged
# when Copy-on-Write is active.
mean_imputed_columns = [
    "visitor_hist_starrating",
    "visitor_hist_adr_usd",
    "prop_review_score",
    "prop_location_score2",
]
for column in mean_imputed_columns:
    df_train[column] = df_train[column].fillna(df_train[column].mean())
    # NOTE(review): the test frame is filled with its own mean (unchanged
    # behaviour); consider reusing the training mean so both sets share one
    # imputation value.
    df_test[column] = df_test[column].fillna(df_test[column].mean())

In [228]:
# Report every training column that still contains nulls, followed by the
# percentage of rows that are null.
null_counts = df_train.isna().sum()
row_total = len(df_train)
for column_name, null_count in null_counts[null_counts > 0].items():
    print(column_name)
    print(null_count / row_total * 100)

## Feature engineering part 2

- prop_location_score1 + prop_location_score2
    - prop_location_score_total
- price_usd - prop_log_historical_price
    - prop_price_diff
- srch_adults_count + srch_children_count
    - srch_count
- srch_count/srch_room_count
    - persons_per_room
- price_usd/srch_count
    - price_per_person
- srch_destination_id != visitor_location_country_id
    - srch_abroad
    

- click_bool, booking_bool
    - target_score (5 booked, 1 clicked)

In [231]:
# Combined location score: sum of the two location score columns.
for frame in (df_train, df_test):
    frame["prop_location_score_total"] = frame["prop_location_score1"].add(frame["prop_location_score2"])

In [233]:
# Difference between the displayed price and the historical price column.
# NOTE(review): the column name suggests `prop_log_historical_price` is a
# logarithm while `price_usd` is a raw amount - confirm the scales are meant to
# be mixed like this.
for frame in (df_train, df_test):
    frame["prop_price_diff"] = frame["price_usd"].sub(frame["prop_log_historical_price"])

In [235]:
# Total party size: adults plus children.
for frame in (df_train, df_test):
    frame["srch_count"] = frame["srch_adults_count"].add(frame["srch_children_count"])

In [238]:
# Average number of people per requested room.
for frame in (df_train, df_test):
    frame["persons_per_room"] = frame["srch_count"].div(frame["srch_room_count"])

In [240]:
# Price divided by the party size (srch_count = adults + children).
# The feature is specified as price_usd / srch_count; adding the two columns
# does not yield a per-person price.
# NOTE(review): rows with srch_count == 0 would produce inf/NaN - confirm none exist.
df_train["price_per_person"] = df_train["price_usd"] / df_train["srch_count"]
df_test["price_per_person"] = df_test["price_usd"] / df_test["srch_count"]

In [246]:
# 1 when the destination id differs from the visitor's country id, 0 when they
# match. The comparison must be `!=`: with `==` the flag is inverted and marks
# the matching rows as "abroad".
# NOTE(review): `srch_destination_id` may be a destination key rather than a
# country id; `prop_country_id` could be the intended column - confirm.
df_train["srch_abroad"] = (df_train["srch_destination_id"] != df_train["visitor_location_country_id"]).astype(int)
df_test["srch_abroad"] = (df_test["srch_destination_id"] != df_test["visitor_location_country_id"]).astype(int)

In [265]:
# Relevance label: 5 for a booked row, 1 for a clicked-but-not-booked row,
# 0 otherwise. The click contribution is masked out on booked rows so that a row
# with both flags set scores 5 rather than 1 + 5 = 6.
# Assumes click_bool and booking_bool only hold 0/1 - TODO confirm.
df_train["target_score"] = (
    5 * df_train["booking_bool"]
    + df_train["click_bool"] * (1 - df_train["booking_bool"])
)

In [269]:
# The raw click / booking flags are folded into target_score; remove them.
df_train = df_train.drop(columns=["click_bool", "booking_bool"])

In [270]:
# Number of columns in the processed training frame.
df_train.shape[1]

58

In [None]:
# Write the processed training frame to CSV, omitting the row index.
df_train.to_csv("cleaned_train.csv", index=False)

In [None]:
# Write the processed test frame to CSV, omitting the row index.
df_test.to_csv("cleaned_test.csv", index=False)