In [5]:
"""Use dask to explore the whole dataset"""

import dask.dataframe as dd 
import numpy as np
import pandas as pd

# Load Data

In [6]:
# Open the three raw competition CSVs as dask dataframes:
# the training set, the test set and the sample submission.
train, test, sample_submission = (
    dd.read_csv(csv_path)
    for csv_path in (
        "../data/Raw/training_set_VU_DM.csv",
        "../data/Raw/test_set_VU_DM.csv",
        "../data/Raw/submission_sample.csv",
    )
)

FileNotFoundError: ignored

# Inspect Train Data

In [None]:
# Show the column labels of the training frame
train.columns

Index(['srch_id', 'date_time', 'site_id', 'visitor_location_country_id',
       'visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_country_id',
       'prop_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'position', 'price_usd', 'promotion_flag',
       'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
       'srch_adults_count', 'srch_children_count', 'srch_room_count',
       'srch_saturday_night_bool', 'srch_query_affinity_score',
       'orig_destination_distance', 'random_bool', 'comp1_rate', 'comp1_inv',
       'comp1_rate_percent_diff', 'comp2_rate', 'comp2_inv',
       'comp2_rate_percent_diff', 'comp3_rate', 'comp3_inv',
       'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv',
       'comp4_rate_percent_diff', 'comp5_rate', 'comp5_inv',
       'comp5_rate_percent_diff', 'comp6_rate', 'comp6_inv',
       'comp6_rate_percent_diff', 'comp7_rate'

In [None]:
# Get the (rows, columns) shape of the training frame.
# NOTE(review): on a dask frame the row entry is presumably still lazy and
# needs an explicit .compute() — confirm against the dask docs.
train_shape = train.shape

In [None]:
# Compute the number of rows.
# The row count is the first entry of the shape tuple and has to be
# evaluated explicitly. Indexing `train` itself with 0 would be treated
# as a lookup of a column named 0, not as the row count.
train_nrows = train_shape[0].compute()

In [None]:
# Report the dimensions as "<number of rows> <number of columns>"
print(f"{train_nrows} {train_shape[-1]}")

4958347 54


In [None]:
# Collect every distinct search ID that occurs in the training data
unique_srch_ids = (
    train["srch_id"]
    .unique()
    .compute()
)

In [None]:
# One entry per distinct search ID, so the length is the number of searches
unique_srch_ids.shape

(199795,)

In [None]:
# Count the missing values in each column of the training data
isnans = (
    train.isna()
    .sum()
    .compute()
)

In [None]:
# Display the per-column missing-value counts
isnans

srch_id                              0
date_time                            0
site_id                              0
visitor_location_country_id          0
visitor_hist_starrating        4706481
visitor_hist_adr_usd           4705359
prop_country_id                      0
prop_id                              0
prop_starrating                      0
prop_review_score                 7364
prop_brand_bool                      0
prop_location_score1                 0
prop_location_score2           1090348
prop_log_historical_price            0
position                             0
price_usd                            0
promotion_flag                       0
srch_destination_id                  0
srch_length_of_stay                  0
srch_booking_window                  0
srch_adults_count                    0
srch_children_count                  0
srch_room_count                      0
srch_saturday_night_bool             0
srch_query_affinity_score      4640941
orig_destination_distance

In [None]:
# TODO: compute the class imbalances (this cell is still empty)

# Inspect Submission Sample

In [None]:
# Does the submission sample coincide with the test data?
# Pull the rows of a single search out of both frames. Both filters use
# the same `srch_id` variable so the two selections cannot drift apart
# when the ID is changed.
srch_id = 1

sample_submission_id_1: pd.DataFrame = sample_submission[
    sample_submission["srch_id"] == srch_id].compute()

test_id_1: pd.DataFrame = test[test["srch_id"] == srch_id].compute()

In [None]:
# Sample-submission rows belonging to the selected search ID
sample_submission_id_1

Unnamed: 0,srch_id,prop_id
0,1,3180
1,1,5543
2,1,14142
3,1,22393
4,1,24194
5,1,28181
6,1,34263
7,1,37567
8,1,50162
9,1,54937


In [None]:
# The search/property ID columns of the test rows for the same search
test_id_1[["srch_id", "prop_id"]]

Unnamed: 0,srch_id,prop_id
0,1,893
1,1,10404
2,1,21315
3,1,27348
4,1,29604
5,1,30184
6,1,44147
7,1,50984
8,1,53341
9,1,56880


In [None]:
# Are the properties in the testing data the same as the properties
# in the sample submission?
# Prints, one per line: the shape of the test rows, the shape of the
# sample-submission rows, and for each test row whether its prop_id also
# occurs in the sample submission.
print(
    test_id_1.shape,
    sample_submission_id_1.shape,
    test_id_1["prop_id"].isin(sample_submission_id_1["prop_id"]),
    sep="\n",
)

(29, 50)
(29, 2)
0     True
1     True
2     True
3     True
4     True
5     True
6     True
7     True
8     True
9     True
10    True
11    True
12    True
13    True
14    True
15    True
16    True
17    True
18    True
19    True
20    True
21    True
22    True
23    True
24    True
25    True
26    True
27    True
28    True
Name: prop_id, dtype: bool


In [None]:
# Put the sample submission and the test data for ID 1 side by side.

# Test rows reduced to the two ID columns, ordered by property ID and
# renamed so they stay distinguishable after the concatenation.
test_id_1_prop_only = (
    test_id_1[["srch_id", "prop_id"]]
    .sort_values(by=["prop_id"])
    .rename(columns={"srch_id": "test_srch_id", "prop_id": "test_prop_id"})
)

# pd.concat(axis=1) pairs rows by index label, not by position. Without
# resetting the index, the sort above would be undone by the alignment,
# so both frames get a fresh 0..n-1 index before being placed next to
# each other. The sample submission keeps its original row order.
submission_sample_vs_test = pd.concat(
    (
        sample_submission_id_1.rename(
            columns={"srch_id": "subm_srch_id", "prop_id": "subm_prop_id"}
        ).reset_index(drop=True),
        test_id_1_prop_only.reset_index(drop=True),
    ),
    axis=1,
)

submission_sample_vs_test

Unnamed: 0,subm_srch_id,subm_prop_id,test_srch_id,test_prop_id
0,1,3180,1,3180
1,1,5543,1,5543
2,1,14142,1,14142
3,1,22393,1,22393
4,1,24194,1,24194
5,1,28181,1,28181
6,1,34263,1,34263
7,1,37567,1,37567
8,1,50162,1,50162
9,1,54937,1,54937
