In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist
from collections import Counter
import datetime


In [2]:
# Load the training data chunk by chunk, keeping only booking rows and the
# columns used as features. srch_ci / srch_co are converted from dates to
# "days after the search date"; unparseable or missing dates become 0 days
# because they fall back to the search date itself.
chunk_size = 2000000
j = 0
rows_read = 0
booking_chunks = []

for j, train_chunk in enumerate(pd.read_csv('./input/train.csv', chunksize=chunk_size), start=1):
    rows_read += len(train_chunk)
    train_chunk = train_chunk[train_chunk['is_booking']==1]
    train_chunk = train_chunk.drop(['channel', 'user_location_city', 'user_id', 'is_booking',
                              'orig_destination_distance', 'is_mobile', 'cnt',
                             'user_location_region', 'srch_rm_cnt', 'srch_adults_cnt',
                             'user_location_country', 'srch_destination_id',
                             'srch_children_cnt', 'posa_continent', 'hotel_continent'],
                         axis=1)

    # Compare calendar dates only: normalize() zeroes the time-of-day part.
    search_date = pd.to_datetime(train_chunk['date_time'], errors='coerce').dt.normalize()
    for date_col in ['srch_ci', 'srch_co']:
        stay_date = pd.to_datetime(train_chunk[date_col], errors='coerce').dt.normalize()
        # fillna replaces the former chained assignment, which is not
        # guaranteed to write through to the frame.
        stay_date = stay_date.fillna(search_date)
        train_chunk[date_col] = (stay_date - search_date).dt.days
    train_chunk = train_chunk.drop(['date_time'], axis=1)

    # Collect the chunks and concatenate once after the loop; appending to a
    # growing DataFrame copies everything accumulated so far on each pass
    # (and DataFrame.append no longer exists in pandas >= 2.0).
    booking_chunks.append(train_chunk)
    # Report rows actually read, so the short final chunk is not overstated.
    print('{} rows of train data processed.'.format(rows_read))

train = pd.concat(booking_chunks)
# Split the target off from the feature columns.
hotel_cluster = train['hotel_cluster']
train = train.drop(['hotel_cluster'], axis=1)
train.head()

2000000 rows of train data processed.
4000000 rows of train data processed.
6000000 rows of train data processed.
8000000 rows of train data processed.
10000000 rows of train data processed.
12000000 rows of train data processed.
14000000 rows of train data processed.
16000000 rows of train data processed.
18000000 rows of train data processed.
20000000 rows of train data processed.
22000000 rows of train data processed.
24000000 rows of train data processed.
26000000 rows of train data processed.
28000000 rows of train data processed.
30000000 rows of train data processed.
32000000 rows of train data processed.
34000000 rows of train data processed.
36000000 rows of train data processed.
38000000 rows of train data processed.


Unnamed: 0,site_name,is_package,srch_ci,srch_co,srch_destination_type_id,hotel_country,hotel_market
1,2,1,18,22,1,50,628
20,2,1,50,52,1,50,191
27,30,1,83,91,1,185,185
72,30,0,215,217,1,151,69
79,2,1,50,55,1,50,680


In [3]:
# Load the test data and apply the same feature preparation as for train:
# drop unused columns and turn srch_ci / srch_co into "days after the search
# date". Missing or unparseable dates fall back to the search date (0 days).
test = pd.read_csv('./input/test.csv')
test = test.drop(['channel', 'user_location_city', 'user_id',
                              'orig_destination_distance', 'is_mobile', 
                             'user_location_region', 'srch_rm_cnt', 'srch_adults_cnt',
                             'user_location_country', 'srch_destination_id',
                             'srch_children_cnt', 'posa_continent', 'hotel_continent'],
                         axis=1)

# Compare calendar dates only: normalize() zeroes the time-of-day part.
search_date = pd.to_datetime(test['date_time'], errors='coerce').dt.normalize()
for date_col in ['srch_ci', 'srch_co']:
    stay_date = pd.to_datetime(test[date_col], errors='coerce').dt.normalize()
    # fillna replaces the former chained assignment, which is not guaranteed
    # to write through to the frame.
    stay_date = stay_date.fillna(search_date)
    test[date_col] = (stay_date - search_date).dt.days

test = test.drop(['date_time'], axis=1)
test.head()

Unnamed: 0,id,site_name,is_package,srch_ci,srch_co,srch_destination_type_id,hotel_country,hotel_market
0,0,2,0,259,263,6,204,27
1,1,2,0,231,234,7,204,1540
2,2,2,0,49,50,1,50,699
3,3,2,1,0,2,1,50,628
4,4,2,0,5,6,1,50,538


In [16]:
# Nearest-neighbour prediction: for every test row, find the 10 training
# bookings with the smallest cosine distance and write the (up to) 5 most
# frequent hotel clusters among them to a timestamped submission file.
#
# NOTE(review): every test row is compared against every training row, so a
# full run is very slow; consider running on a subset of `test` first.
now = datetime.datetime.now()
path = 'submission_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
len_test = len(test)
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
X_train = train.values
cluster_labels = hotel_cluster.values
n_neighbors = min(10, len(X_train))

# The context manager closes the file even if the loop is interrupted.
with open(path, 'w') as out:
    out.write('id,hotel_cluster\n')
    for idx, test_row in test.iterrows():
        # First column is the row id; the remaining columns are the features,
        # in the same order as the columns of `train`.
        X_test = [test_row.iloc[1:].values]
        # cdist(..., 'cosine') returns cosine *distance* (1 - similarity),
        # so the closest neighbours are the SMALLEST values, not the largest.
        distances = cdist(X_test, X_train, 'cosine')[0]

        # Pick the n_neighbors smallest distances in linear time, then order
        # them nearest-first so ties in the vote below favour closer rows.
        nearest = np.argpartition(distances, n_neighbors - 1)[:n_neighbors]
        nearest = nearest[np.argsort(distances[nearest])]
        hc = cluster_labels[nearest]

        hc_str = ' '.join([str(ite) for ite, it in Counter(hc).most_common(5)])
        # Write the whole line at once so an interrupt never leaves a
        # half-written record behind.
        out.write(str(test_row.iloc[0]) + ',' + hc_str + '\n')

        if not idx%1000:
            print('%{} of test data processed!'.format((100*idx/len_test)))

print(datetime.datetime.now()-now)

%0.0 of test data processed!
%0.03955316004039169 of test data processed!
%0.07910632008078337 of test data processed!
%0.11865948012117507 of test data processed!
%0.15821264016156675 of test data processed!
%0.19776580020195844 of test data processed!
%0.23731896024235014 of test data processed!
%0.27687212028274183 of test data processed!
%0.3164252803231335 of test data processed!
%0.35597844036352516 of test data processed!
%0.3955316004039169 of test data processed!
%0.43508476044430855 of test data processed!
%0.47463792048470027 of test data processed!
%0.5141910805250919 of test data processed!
%0.5537442405654837 of test data processed!
%0.5932974006058753 of test data processed!
%0.632850560646267 of test data processed!
%0.6724037206866587 of test data processed!
%0.7119568807270503 of test data processed!
%0.7515100407674421 of test data processed!
%0.7910632008078338 of test data processed!
%0.8306163608482254 of test data processed!
%0.8701695208886171 of test data proce

KeyboardInterrupt: 

In [11]:
# Sanity check: (rows, columns) of the prepared test frame.
test.shape

(2528243, 8)

In [12]:
# Sanity check: (rows, columns) of the training feature frame
# (hotel_cluster has already been split off into its own Series).
train.shape

(3000693, 7)

In [13]:
# Spot-check: the label at position 2 of the target array
# (.values indexing is positional, not by index label).
hotel_cluster.values[2]

58