In [1]:
import datetime
import pandas as pd
from pandas import DataFrame
import numpy as np
from sklearn.decomposition import PCA
import random
import ml_metrics as metrics
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split



In [2]:
# load datasets
destinations = pd.read_csv("destinations.csv")
# test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")

In [3]:
# get year and month attributes from date_time
train["date_time"] = pd.to_datetime(train["date_time"])
train["year"] = train["date_time"].dt.year
train["month"] = train["date_time"].dt.month

In [4]:
train.shape

(37670293, 26)

In [5]:
m=train.orig_destination_distance.mean()
train.orig_destination_distance.fillna(m, inplace=True)

In [72]:
# random 100000 samples
unique_users = train.user_id.unique()
print len(unique_users)

1198786


In [104]:
selected_user_id = unique_users[:100000]#random.sample(unique_users,10000)
selected_train = train[train.user_id.isin(selected_user_id)]
# selected_train = train

In [105]:
len(selected_user_id)

100000

In [106]:
selected_train.shape

(3095328, 26)

In [107]:
# train = 2013 | until July 2014; 
# test = After July 2014
train2 = selected_train[((selected_train.year == 2013) | ((selected_train.year == 2014) & (selected_train.month < 8)))]
test2 = selected_train[((selected_train.year == 2014) & (selected_train.month >= 8))]
# train2 = train[((train.year == 2013) | ((train.year == 2014) & (train.month < 8)))]
# test2 = train[((train.year == 2014) & (train.month >= 8))]

In [108]:
size = len(selected_train)
# train2 = selected_train[ : int(size*.80)]
# test2 = selected_train[int(size*.80)+1 : ]
print size
print len(train2)
print len(test2)

3095328
1946999
1148329


In [109]:
# only get those whose booking is true
test2 = test2[test2.is_booking == True]

In [110]:
print "train2 - ", len(train2)
print "test2 - ", len(test2)

train2 -  1946999
test2 -  83003


In [111]:
def map5eval(preds, actual):
    predicted = preds.argsort(axis=1)[:,-np.arange(5)]
    metric = 0.
    for i in range(5):
        metric += np.sum(actual==predicted[:,i])/(i+1)
    metric /= actual.shape[0]
    return metric

### Base Case

In [112]:
most_common_clusters = list(train.hotel_cluster.value_counts().head().index)
predictions = [most_common_clusters for i in range(test2.shape[0])]

In [113]:
most_common_clusters

[91, 41, 48, 64, 65]

In [114]:
target = [[l] for l in test2["hotel_cluster"]]
print "Base case evaluation: ", metrics.mapk(target, predictions, k=5)

Base case evaluation:  0.062750743949


### Generate features from destinations csv

In [115]:
pca = PCA(n_components=3)
dest_small = pca.fit_transform(destinations[["d{0}".format(i + 1) for i in range(149)]])
dest_small = pd.DataFrame(dest_small)
dest_small["srch_destination_id"] = destinations["srch_destination_id"]

In [85]:
# dest_small

### Feature Engineering
- Generate new date features based on date_time, srch_ci, and srch_co.
- Remove non-numeric columns like date_time.
- Add in features from dest_small.
- Replace any missing values with -1.
- Calculate features such as length of stay, check in day, and check out month.

In [116]:
def calc_fast_features(df):
    df.loc[:,'date_time'] = pd.to_datetime(df["date_time"])
    df.loc[:,'srch_ci'] = pd.to_datetime(df["srch_ci"], format='%Y-%m-%d', errors="coerce")
    df.loc[:,'srch_co'] = pd.to_datetime(df["srch_co"], format='%Y-%m-%d', errors="coerce")
    
    props = {}
    for prop in ["month", "day", "hour", "minute", "dayofweek", "quarter"]:
        props[prop] = getattr(df["date_time"].dt, prop)
    
    carryover = [p for p in df.columns if p not in ["date_time", "srch_ci", "srch_co"]]
    for prop in carryover:
        props[prop] = df[prop]
    
    date_props = ["month", "day", "dayofweek", "quarter"]
    for prop in date_props:
        props["ci_{0}".format(prop)] = getattr(df["srch_ci"].dt, prop)
        props["co_{0}".format(prop)] = getattr(df["srch_co"].dt, prop)
    props["stay_span"] = (df["srch_co"] - df["srch_ci"]).astype('timedelta64[h]')
    
    ret = pd.DataFrame(props)
    
    ret = ret.join(dest_small, on="srch_destination_id", how='left', rsuffix="dest")
    ret = ret.drop("srch_destination_iddest", axis=1)
    return ret

df = calc_fast_features(train2)
df.fillna(-1, inplace=True)

### Random Forest

In [117]:
predictors = [c for c in df.columns if c not in ["hotel_cluster"]]
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier

# clf = RandomForestClassifier(n_estimators=10, min_weight_fraction_leaf=0.1)
clf = RandomForestClassifier(n_estimators=31,max_depth=10,random_state=123)
scores = model_selection.cross_val_score(clf, df[predictors], df['hotel_cluster'], cv=5)
print "Random Forest: ", scores

KeyboardInterrupt: 

In [None]:
clf.fit(df[predictors], df['hotel_cluster'])

In [None]:
importance = clf.feature_importances_
indices=np.argsort(importance)[::-1][:10]

In [None]:
plt.barh(range(10), importance[indices],color='r')
plt.yticks(range(10),df[predictors].columns[indices])
plt.xlabel('Feature Importance')
plt.show()

In [None]:
clf.classes_

In [None]:
dict_cluster = {}
for (k,v) in enumerate(clf.classes_):
    dict_cluster[k] = v
# dict_cluster

In [93]:
t2 = calc_fast_features(test2)
t2.fillna(-1, inplace=True)

In [94]:
testPredictors = [c for c in t2.columns if c not in ["hotel_cluster"]]

In [95]:
y_pred=clf.predict_proba(t2[testPredictors])
#take largest 5 probablities' indexes
a=y_pred.argsort(axis=1)[:,-5:]

In [96]:
y_pred

array([[  1.17694195e-03,   1.88233923e-01,   1.10046794e-04, ...,
          2.53586090e-04,   1.79008064e-03,   5.34785793e-04],
       [  2.24716957e-04,   0.00000000e+00,   1.98771986e-02, ...,
          1.28368553e-02,   4.37507742e-03,   1.82909628e-02],
       [  1.67497931e-03,   0.00000000e+00,   1.35284319e-02, ...,
          1.37780671e-02,   1.97351447e-03,   1.13258244e-02],
       ..., 
       [  2.70393272e-03,   2.10150257e-04,   1.76694302e-02, ...,
          3.98134805e-03,   2.14341200e-03,   1.26736963e-02],
       [  1.08729928e-02,   2.06035711e-03,   6.74487076e-03, ...,
          6.16262912e-03,   1.31184117e-02,   6.34572580e-03],
       [  4.74224008e-03,   4.62709968e-04,   2.36678415e-03, ...,
          3.95677628e-02,   6.70697351e-02,   3.69446932e-03]])

In [97]:
a

array([[88, 79, 24, 45,  1],
       [58, 36, 81, 57, 46],
       [82, 42, 13, 63, 46],
       ..., 
       [61, 11, 29, 82, 46],
       [41, 18, 42, 48, 91],
       [69, 98, 41, 70, 56]])

In [98]:
#take the corresonding cluster of the 5 top indices
b = []
for i in a.flatten():
    b.append(dict_cluster.get(i))

In [99]:
cluster_pred = np.array(b).reshape(a.shape)

In [100]:
cluster_pred

array([[88, 79, 24, 45,  1],
       [58, 36, 81, 57, 46],
       [82, 42, 13, 63, 46],
       ..., 
       [61, 11, 29, 82, 46],
       [41, 18, 42, 48, 91],
       [69, 98, 41, 70, 56]])

In [101]:
target = [[l] for l in t2["hotel_cluster"]]

In [102]:
print "Random Forest score: ",metrics.mapk(target,cluster_pred,k=5)

Random Forest score:  0.132733372804


In [103]:
map5 = map5eval(y_pred, t2['hotel_cluster'])
map5

0.10550621669626999

### Binary Classifiers

In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from itertools import chain

all_probs = []
unique_clusters = df["hotel_cluster"].unique()
for cluster in unique_clusters:
    df["target"] = 1
    df["target"][df["hotel_cluster"] != cluster] = 0
#     print df["target"]
    predictors = [col for col in df if col not in ['hotel_cluster', "target"]]
    probs = []
    kf = KFold(n_splits=2)
    cv = kf.split(df["target"])
    clf = RandomForestClassifier(n_estimators=10, min_weight_fraction_leaf=0.1)
    for i, (tr, te) in enumerate(cv):
        clf.fit(df[predictors].iloc[tr], df["target"].iloc[tr])
        preds = clf.predict_proba(df[predictors].iloc[te])
        probs.append([p[1] for p in preds])
    full_probs = chain.from_iterable(probs)
    all_probs.append(list(full_probs))

prediction_frame = pd.DataFrame(all_probs).T
prediction_frame.columns = unique_clusters
def find_top_5(row):
    return list(row.nlargest(5).index)

preds = []
for index, row in prediction_frame.iterrows():
    preds.append(find_top_5(row))

print "test2222", test2.iloc[:,"hotel_cluster"]
print metrics.mapk([[l] for l in test2.iloc["hotel_cluster"]], preds, k=5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


test2222

ValueError: Location based indexing can only have [integer, integer slice (START point is INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types

### SGD Classifier

In [42]:
from sklearn import linear_model

clf = linear_model.SGDClassifier(loss='log', n_jobs=-1, alpha=0.0000025, verbose=0)
sgdScores = model_selection.cross_val_score(clf, df[predictors], df['hotel_cluster'], cv=3)
print "SGD Classifier: ", sgdScores



 SGD Classifier:  [ 0.01247255  0.00478047  0.00810516]


In [43]:
# clf.fit(df[predictors], df['hotel_cluster'])
clf.partial_fit(df[predictors], df['hotel_cluster'], classes=np.arange(100))

SGDClassifier(alpha=2.5e-06, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=None, n_iter=None,
       n_jobs=-1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [44]:
#predictors

In [45]:
clf.classes_

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

In [46]:
dict_cluster = {}
for (k,v) in enumerate(clf.classes_):
    dict_cluster[k] = v

In [118]:
dict_cluster

{0: 0,
 1: 1,
 2: 2,
 3: 3,
 4: 4,
 5: 5,
 6: 6,
 7: 7,
 8: 8,
 9: 9,
 10: 10,
 11: 11,
 12: 12,
 13: 13,
 14: 14,
 15: 15,
 16: 16,
 17: 17,
 18: 18,
 19: 19,
 20: 20,
 21: 21,
 22: 22,
 23: 23,
 24: 24,
 25: 25,
 26: 26,
 27: 27,
 28: 28,
 29: 29,
 30: 30,
 31: 31,
 32: 32,
 33: 33,
 34: 34,
 35: 35,
 36: 36,
 37: 37,
 38: 38,
 39: 39,
 40: 40,
 41: 41,
 42: 42,
 43: 43,
 44: 44,
 45: 45,
 46: 46,
 47: 47,
 48: 48,
 49: 49,
 50: 50,
 51: 51,
 52: 52,
 53: 53,
 54: 54,
 55: 55,
 56: 56,
 57: 57,
 58: 58,
 59: 59,
 60: 60,
 61: 61,
 62: 62,
 63: 63,
 64: 64,
 65: 65,
 66: 66,
 67: 67,
 68: 68,
 69: 69,
 70: 70,
 71: 71,
 72: 72,
 73: 73,
 74: 74,
 75: 75,
 76: 76,
 77: 77,
 78: 78,
 79: 79,
 80: 80,
 81: 81,
 82: 82,
 83: 83,
 84: 84,
 85: 85,
 86: 86,
 87: 87,
 88: 88,
 89: 89,
 90: 90,
 91: 91,
 92: 92,
 93: 93,
 94: 94,
 95: 95,
 96: 96,
 97: 97,
 98: 98,
 99: 99}

In [47]:
y_pred=clf.predict_proba(t2[testPredictors])
#take largest 5 probablities' indexes
a=y_pred.argsort(axis=1)[:,-5:]

  np.exp(prob, prob)
  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))


In [48]:
y_pred

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.04761905,  0.04761905, ...,  0.04761905,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [49]:
a

array([[89, 39, 65, 92, 95],
       [39, 65, 91, 92, 95],
       [61, 22, 64, 86, 44],
       ..., 
       [27, 26, 35, 99, 64],
       [27, 26, 35, 99, 64],
       [27, 26, 35, 99, 64]])

In [50]:
#take the corresonding cluster of the 5 top indices
b = []
for i in a.flatten():
    b.append(dict_cluster.get(i))

In [51]:
cluster_pred = np.array(b).reshape(a.shape)

In [52]:
cluster_pred

array([[89, 39, 65, 92, 95],
       [39, 65, 91, 92, 95],
       [61, 22, 64, 86, 44],
       ..., 
       [27, 26, 35, 99, 64],
       [27, 26, 35, 99, 64],
       [27, 26, 35, 99, 64]])

In [53]:
target = [[l] for l in t2["hotel_cluster"]]

In [54]:
print "Stochastic Gradient Descent (SGD) score: ",metrics.mapk(target,cluster_pred,k=5)

Stochastic Gradient Descent (SGD) score:  0.014351687389


In [55]:
map5 = map5eval(y_pred, t2['hotel_cluster'])
map5

0.01965660153937241

### Naive Bayes

In [56]:
from sklearn.naive_bayes import BernoulliNB

clf = BernoulliNB(alpha=1.0)
nbScores = model_selection.cross_val_score(clf, df[predictors], df['hotel_cluster'], cv=3)
print "Naive Bayes: ", nbScores

Naive Bayes:  [ 0.0389246   0.04291191  0.04171348]


In [57]:
# clf.fit(df[predictors], df['hotel_cluster'])
clf.partial_fit(df[predictors], df['hotel_cluster'], classes=np.arange(100))

dict_cluster = {}
for (k,v) in enumerate(clf.classes_):
    dict_cluster[k] = v

y_pred=clf.predict_proba(t2[testPredictors])
#take largest 5 probablities' indexes
a=y_pred.argsort(axis=1)[:,-5:]

print "nb_a:", a 

#take the corresonding cluster of the 5 top indices
b = []
for i in a.flatten():
    b.append(dict_cluster.get(i))

cluster_pred = np.array(b).reshape(a.shape)
target = [[l] for l in t2["hotel_cluster"]]
print "Naive Bayes score: ",metrics.mapk(target,cluster_pred,k=5)

nb_a: [[88 79 24 45  1]
 [59 82 42 48 91]
 [16 50 42 48 91]
 ..., 
 [42 59 82 91 48]
 [18 28 42 48 91]
 [28 59 82 48 91]]
Naive Bayes score:  0.0614288533649


In [58]:
map5 = map5eval(y_pred, t2['hotel_cluster'])
map5

0.045707519242155124

### Improvements
- Aggregate `hotel_cluster` based on `srch_destination_id`
> find the most popular hotel clusters for each destination to predict that a user who searches for a destination is going to one of the most popular hotel clusters for that destination
- Group training data by `search_destination_id` and `hotel_cluster`
- Iterate each group
> - Assign 1 point to each hotel cluster where `is_booking` is True.
> - Assign .15 points to each hotel cluster where `is_booking` is False.
> - Assign the score to the `srch_destination_id` / `hotel_cluster` combination in a dictionary.

#### Output:
We'll have a dictionary where each key is a `srch_destination_id`. Each value in the dictionary will be another dictionary, containing `hotel_clusters` as keys with scores as values


In [59]:
def make_key(items):
    return "_".join([str(i) for i in items])

match_cols = ["srch_destination_id"]
cluster_cols = match_cols + ['hotel_cluster']
groups = train2.groupby(cluster_cols)
top_clusters = {}
for name, group in groups:
    clicks = len(group.is_booking[group.is_booking == False])
    bookings = len(group.is_booking[group.is_booking == True])
    
    score = bookings + .15 * clicks
    
    clus_name = make_key(name[:len(match_cols)])
    if clus_name not in top_clusters:
        top_clusters[clus_name] = {}
    top_clusters[clus_name][name[-1]] = score

In [60]:
top_clusters

{'44857': {61: 0.15},
 '35542': {42: 2.3, 47: 0.3},
 '11542': {91: 2.3},
 '11547': {30: 0.15, 50: 0.3, 80: 1.75, 83: 0.15},
 '8090': {29: 1.15, 38: 0.3, 46: 0.3, 64: 0.6, 82: 1.45},
 '5980': {42: 0.3, 48: 0.3, 82: 0.15},
 '51264': {48: 0.15},
 '5986': {3: 0.15, 48: 0.15},
 '19392': {53: 0.3},
 '19390': {8: 0.15},
 '22261': {20: 0.3,
  30: 0.8999999999999999,
  38: 1.15,
  43: 1.3,
  44: 0.3,
  53: 0.15,
  61: 0.3},
 '22260': {2: 2.3, 8: 0.15, 9: 0.15, 37: 0.3},
 '22266': {3: 0.15, 44: 0.15, 53: 0.15, 61: 0.15, 78: 0.15},
 '22268': {5: 0.15, 14: 0.15, 20: 0.15, 57: 0.15, 60: 0.15, 85: 0.3},
 '25062': {91: 1.3},
 '25061': {8: 0.15,
  18: 0.3,
  68: 0.3,
  70: 0.44999999999999996,
  89: 0.15,
  95: 0.44999999999999996,
  98: 2.9},
 '25066': {40: 0.15, 47: 0.15},
 '25067': {48: 2.0, 60: 0.44999999999999996},
 '25064': {4: 0.15,
  17: 0.15,
  18: 0.75,
  40: 0.3,
  44: 0.15,
  47: 0.15,
  48: 0.15,
  55: 0.15,
  72: 0.15,
  83: 0.44999999999999996,
  91: 0.15},
 '25065': {5: 0.15, 33: 1.15,

### Find top 5 clusters for each `srch_destination_id`
- Loop through each key in top_clusters.
- Find the top 5 clusters for that key.
- Assign the top 5 clusters to a new dictionary, cluster_dict

In [61]:
import operator

cluster_dict = {}
for n in top_clusters:
    tc = top_clusters[n]
    top = [l[0] for l in sorted(tc.items(), key=operator.itemgetter(1), reverse=True)[:5]]
    cluster_dict[n] = top

In [62]:
cluster_dict

{'35542': [42, 47],
 '21094': [89, 20, 78],
 '11542': [91],
 '11547': [80, 50, 83, 30],
 '5980': [48, 42, 82],
 '51264': [48],
 '5986': [48, 3],
 '19392': [53],
 '19390': [8],
 '22261': [43, 38, 30, 44, 20],
 '22260': [2, 37, 8, 9],
 '22266': [3, 44, 53, 78, 61],
 '22268': [85, 5, 14, 20, 57],
 '25062': [91],
 '25061': [98, 70, 95, 68, 18],
 '25066': [40, 47],
 '25067': [48, 60],
 '25064': [18, 83, 40, 4, 44],
 '25065': [33, 51, 83, 5, 48],
 '19714': [78, 3, 35, 82, 85],
 '27376': [16, 89],
 '27377': [48, 43, 30, 47],
 '27378': [77, 33],
 '46133': [42],
 '6798': [28, 32, 50, 39, 43],
 '46130': [63, 58, 3, 61, 30],
 '25289': [89],
 '25284': [60],
 '25285': [48, 33],
 '25282': [7, 72, 62, 32, 8],
 '25280': [27],
 '36702': [3],
 '51843': [73, 18],
 '16252': [91],
 '273': [89, 44, 93],
 '51847': [28],
 '16708': [16, 51, 47],
 '12019': [65, 78, 34, 75, 63],
 '12018': [50, 32, 42, 10, 7],
 '60095': [35],
 '12015': [76, 91, 32, 47, 83],
 '12014': [91, 4, 48, 55, 23],
 '12017': [77, 91, 72, 7,

### Make predictions based on destination
- Iterate each test data
- Extract `srch_destination_id` for the row
- Find top clusters for that `srch_destination_id`
- Append top clusters to preds

#### Output
`preds` - a list of lists containing predictions

In [63]:
preds = []
for index, row in test2.iterrows():
    key = make_key([row[m] for m in match_cols])
    if key in cluster_dict:
        preds.append(cluster_dict[key])
    else:
        preds.append([])

#### Calculate error

In [64]:
metrics.mapk([[l] for l in test2["hotel_cluster"]], preds, k=5)

0.23547069271758436

### Finding matching users
Finding users in training set that matches in testing set
- Split the training data into groups based on the match columns.
- Loop through the testing data.
- Create an index based on the match columns.
- Get any matches between the testing data and the training data using the groups.

In [65]:
match_cols = ['user_location_country', 'user_location_region', 'user_location_city', 'hotel_market', 'orig_destination_distance']

groups = train2.groupby(match_cols)
    
def generate_exact_matches(row, match_cols):
    index = tuple([row[t] for t in match_cols])
    try:
        group = groups.get_group(index)
    except Exception:
        return []
    clus = list(set(group.hotel_cluster))
    return clus

exact_matches = []
for i in range(test2.shape[0]):
    exact_matches.append(generate_exact_matches(test2.iloc[i], match_cols))

### Combining predictions

In [66]:
def f5(seq, idfun=None): 
    if idfun is None:
        def idfun(x): return x
    seen = {}
    result = []
    for item in seq:
        marker = idfun(item)
        if marker in seen: continue
        seen[marker] = 1
        result.append(item)
    return result
    
full_preds = [f5(exact_matches[p] + preds[p] + most_common_clusters)[:5] for p in range(len(preds))]
metrics.mapk([[l] for l in test2["hotel_cluster"]], full_preds, k=5)

0.27520820998618517