# Hotel Third Submission

Trained Random Forest Classifier with 1/200 training set.

In [1]:
import pandas as pd
import numpy as np

# For memory efficiency
dtypes = {
    'site_name': np.uint8,
    'posa_continent': np.uint8,
    'user_location_country': np.uint8,
    'user_location_region': np.uint16,
    'user_location_city': np.uint16,
    'orig_destination_distance': np.float16,
    'user_id': np.uint32,
    'is_mobile': np.uint8,
    'is_package': np.uint8,
    'channel': np.uint8,
    'srch_adults_cnt': np.uint8,
    'srch_children_cnt': np.uint8,
    'srch_rm_cnt': np.uint8,
    'srch_destination_id': np.uint16,
    'srch_destination_type_id': np.uint16,
    'is_booking': np.uint8,
    'cnt': np.uint8,
    'hotel_continent': np.uint8,
    'hotel_country': np.uint8,
    'hotel_cluster': np.uint8
}

df_test = pd.read_csv('../dataset/hotel_test.csv', dtype=dtypes, iterator=True, chunksize=10000)
df_dest = pd.read_csv('../dataset/destinations.csv', dtype=np.float32)

In [2]:
def transform(df):
    ### Parse Date & Time
    from datetime import datetime

    def parse_date(d):
        if type(d) is int:
            d = datetime.fromtimestamp(d / 1000000000).isoformat()
        if str(d) == 'nan':
            d = '2013-01-01'

        try:
            return np.datetime64(str(d))
        except:
            return np.datetime64('2013-01-01')


    df['date_time'] = df['date_time'].map(parse_date)
    df['srch_co'] = df['srch_co'].map(parse_date)
    df['srch_ci'] = df['srch_ci'].map(parse_date)

    ### Make new features with time data and others
    df['days'] = (df['srch_co'] - df['srch_ci']).map(lambda d: d.days)
    df['month'] = df['date_time'].map(lambda d: d.month - 1)
    df['hour'] = df['date_time'].map(lambda d: d.hour - 1)

    df['people'] = df['srch_adults_cnt'] + df['srch_children_cnt']

    del df['srch_co']
    del df['srch_ci']
    del df['date_time']
    
    ### Fill empty values in column 'orig_destination_distance'
    means = df[['posa_continent', 'hotel_continent', 'orig_destination_distance']].groupby(['posa_continent', 'hotel_continent']).mean()
    means = means['orig_destination_distance'].reset_index().fillna(2000)

    means['avg_distance'] = means['orig_destination_distance']

    df = pd.merge(df, means, how='left')
    df['orig_destination_distance'] = df['orig_destination_distance'].fillna(df['avg_distance'])

    del means
    del df['avg_distance']
    df = df.fillna(-1)

    ### One-Hot Encoding
    from sklearn.preprocessing import OneHotEncoder as ohe

    categorical_columns = {'site_name': 54, 'posa_continent': 5, 'user_location_country': 240, 'user_location_region': 1028,
                    'channel': 11, 'srch_destination_type_id': 10,
                    'hotel_continent': 7, 'hotel_country': 213, 'hotel_market': 2118, 'month': 12, 'hour': 24}

    for c, v in categorical_columns.items():
        columns = [c + str(i) for i in range(v)]
        one_hot = np.zeros([df.shape[0], v], dtype=np.uint8)
        one_hot[np.arange(df.shape[0]), df[c].values] = 1
        np.put(one_hot, [v], 1)
        df = pd.concat([df, pd.DataFrame(one_hot, columns=columns, dtype=np.uint8)], axis=1)

        del df[c]

    del df['srch_destination_id']
    del df['user_location_city']
    del df['user_id']
    
    return df

### Validation

In [5]:
import pickle
import numpy as np
from tqdm import tqdm

output = open('prediction.txt', 'w')
output.write('id,hotel_cluster\n')

clf = pickle.load(open('hotel_clf_200.pickle', 'rb'))

for d in tqdm(df_test):
    d = pd.merge(d, df_dest, how='left', on='srch_destination_id')
    d = transform(d)
    X = d.drop('id', axis=1)

    proba = clf.predict_proba(X.values)
    for i, p in zip(d['id'], proba):
        top5 = np.argsort(p)[-5:]
        result = [str(r) for r in list(top5)]
        output.write('{},{}\n'.format(i, ' '.join(result)))

output.close()

253it [08:01,  1.77s/it]


### Score: 0.14567