In [1]:
import pandas as pd
import os
import numpy as np
import itertools
import lightgbm as lgb
from tqdm.notebook import tqdm

# Register tqdm with pandas so DataFrame/Series objects gain `progress_apply`
# (used at the end of the notebook to build the submission with a progress bar).
tqdm.pandas()
# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)

  from pandas import Panel


# Read and Preprocess

In [2]:
# Directory that holds the competition files read below (sample_submission.csv).
# Set the CPP_DATASET_DIR environment variable to run the notebook on another
# machine; the original author's local path is kept as the default so existing
# setups keep working unchanged.
dsdir = os.environ.get(
    'CPP_DATASET_DIR',
    'C:\\Users\\Daniel\\Downloads\\3_Plus_1\\RecommendationEngine\\coupon-purchase-prediction\\dataset',
)

In [3]:
# Training rows are read and then shuffled with a fixed seed (random_state=0) so the
# row order is reproducible; the index is rebuilt so it runs 0..n-1 again.
train = pd.read_csv('CPP_REPRO_coupon_list_train.csv')
train = train.sample(frac=1, random_state=0).reset_index(drop=True)

test = pd.read_csv('CPP_REPRO_coupon_list_test.csv')

# Submission template; its PURCHASED_COUPONS column is filled in at the end of the notebook.
sample_submission_path = os.path.join(dsdir, 'sample_submission.csv')
submission = pd.read_csv(sample_submission_path)

In [4]:
# Remove the same set of columns from both frames, in place, so train and test
# keep an identical feature layout.
columns_to_drop = ['DISPFROM', 'DISPEND', 'VALIDFROM', 'VALIDEND', 'REG_DATE', 'WITHDRAW_DATE']
for frame in (train, test):
    frame.drop(columns=columns_to_drop, inplace=True)

In [5]:
# Drop the two holiday-related USABLE_DATE_* columns from both frames, in place.
holiday_columns = ['USABLE_DATE_HOLIDAY', 'USABLE_DATE_BEFORE_HOLIDAY']
for frame in (train, test):
    frame.drop(columns=holiday_columns, inplace=True)

In [6]:
# Boolean feature: True where the coupon's prefecture name equals the user's.
for frame in (train, test):
    coupon_pref = frame['PREF_NAME_COUPON']
    user_pref = frame['PREF_NAME_USER']
    frame['SAME_PREF'] = coupon_pref == user_pref

In [7]:
def haversine(lat1, lat2, lon1, lon2):
    """Return the great-circle (haversine) distance in kilometres.

    Note the argument order: both latitudes come first, then both longitudes
    (``lat1, lat2, lon1, lon2``), not the more common ``lat1, lon1, lat2, lon2``.

    Parameters
    ----------
    lat1, lat2 : float, numpy array or pandas Series
        Latitudes of the first and second point, in decimal degrees.
    lon1, lon2 : float, numpy array or pandas Series
        Longitudes of the first and second point, in decimal degrees.

    Returns
    -------
    Distance in kilometres, computed element-wise for array-like inputs.
    NaN coordinates propagate to a NaN distance.
    """
    R = 6372.8 # kilometres (3959.87433 miles)

    dlat_rad = np.radians(lat2 - lat1)
    dlon_rad = np.radians(lon2 - lon1)
    lat1_rad = np.radians(lat1)
    lat2_rad = np.radians(lat2)

    a = np.sin(dlat_rad / 2) ** 2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon_rad / 2) ** 2
    # Floating-point rounding can push `a` marginally above 1 for (near-)antipodal
    # points, which would make arcsin return NaN; cap it at 1. np.minimum keeps
    # genuine NaN inputs as NaN and leaves every in-range value untouched.
    a = np.minimum(a, 1.0)
    hdist = 2 * R * np.arcsin(np.sqrt(a))

    return hdist

In [8]:
# Distance features between coupon and user locations, computed column-wise
# (vectorised) for speed and applied identically to train and test.
for frame in (train, test):
    coupon_lat, user_lat = frame['LATITUDE_COUPON'], frame['LATITUDE_USER']
    coupon_lon, user_lon = frame['LONGITUDE_COUPON'], frame['LONGITUDE_USER']

    # haversine takes both latitudes first, then both longitudes.
    frame['HAVERSINE_DIST'] = haversine(coupon_lat, user_lat, coupon_lon, user_lon)
    frame['LATITUDE_DELTA'] = np.abs(coupon_lat - user_lat)
    frame['LONGITUDE_DELTA'] = np.abs(coupon_lon - user_lon)

In [9]:
# Identifier columns are not model features; TARGET is the label and exists only in train.
id_columns = ['USER_ID_hash', 'COUPON_ID_hash']
x_train = train.drop(columns=id_columns + ['TARGET'])
y_train = train['TARGET'].values.reshape(-1)
x_test = test.drop(columns=id_columns)

# Columns stored with object dtype are the ones treated as categorical below.
categoricals = [column for column, dtype in x_train.dtypes.items() if dtype == 'object']
categoricals

['CAPSULE_TEXT',
 'GENRE_NAME',
 'LARGE_AREA_NAME',
 'PREF_NAME_COUPON',
 'SMALL_AREA_NAME',
 'SEX_ID',
 'PREF_NAME_USER']

In [10]:
# Convert the object-dtype columns to pandas 'category' dtype in both feature frames.
# Each frame is converted on its own, so category sets are derived per frame.
for features in (x_train, x_test):
    features[categoricals] = features[categoricals].astype('category')

In [11]:
# Give x_test the same column set and order as x_train (join='left' keeps
# x_train's columns). NOTE(review): a column missing from x_test would be
# created and filled with NaN rather than raising — confirm that is acceptable.
x_train, x_test = x_train.align(x_test, join='left', axis=1)
# Preview the first rows of the final training feature matrix.
x_train.head()

Unnamed: 0,CAPSULE_TEXT,GENRE_NAME,PRICE_RATE,CATALOG_PRICE,DISCOUNT_PRICE,DISPPERIOD,VALIDPERIOD,USABLE_DATE_MON,USABLE_DATE_TUE,USABLE_DATE_WED,USABLE_DATE_THU,USABLE_DATE_FRI,USABLE_DATE_SAT,USABLE_DATE_SUN,LARGE_AREA_NAME,PREF_NAME_COUPON,SMALL_AREA_NAME,LATITUDE_COUPON,LONGITUDE_COUPON,SEX_ID,AGE,PREF_NAME_USER,LATITUDE_USER,LONGITUDE_USER,SAME_PREF,HAVERSINE_DIST,LATITUDE_DELTA,LONGITUDE_DELTA
0,Food,Food,50,5260,2630,3,179,1.0,1.0,1.0,1.0,1.0,1.0,1.0,Kansai,Hyogo,Hyogo,34.691279,135.183025,f,57,Kochi,33.559705,133.53108,False,197.417359,1.131574,1.651945
1,Food,Food,50,4500,2250,3,151,1.0,1.0,1.0,1.0,1.0,1.0,1.0,Kansai,Kyoto,Kyoto,35.021004,135.755608,f,26,Saitama,35.857428,139.648933,False,364.841385,0.836424,3.893325
2,Relaxation,Relaxation,58,5800,2400,4,178,1.0,1.0,1.0,1.0,1.0,1.0,1.0,Kanto,Kanagawa,Yokohama,35.447753,139.642514,f,39,Tokyo,35.689521,139.691704,False,27.256747,0.241768,0.04919
3,Food,Food,50,2000,1000,4,119,1.0,1.0,1.0,1.0,1.0,1.0,1.0,Kanto,Kanagawa,Yokohama,35.447753,139.642514,f,64,Nagasaki,32.744839,129.873756,False,948.145794,2.702914,9.768758
4,Japanese hotel,Hotel and Japanese hotel,52,13650,6480,3,149,1.0,1.0,1.0,1.0,1.0,2.0,1.0,Kyushu-Okinawa,Nagasaki,Nagasaki,32.744839,129.873756,m,34,Tokyo,35.689521,139.691704,False,960.011172,2.944682,9.817948


In [12]:
# Wrap the feature frame and label vector in LightGBM's Dataset container,
# which is what lgb.train consumes. The category-dtype columns are presumably
# picked up as categorical features automatically — TODO confirm for the
# LightGBM version in use.
train_data = lgb.Dataset(x_train, label=y_train)

# Training

In [13]:
# Hyper-parameters handed to lgb.train. Key names follow the scikit-learn wrapper
# spelling (n_jobs, random_state, subsample, ...).
# NOTE(review): 'class_weight' and 'importance_type' look like LGBMClassifier
# constructor arguments rather than core LightGBM parameters — confirm whether
# lgb.train uses or simply ignores them.
model_params = dict(
    boosting_type='gbdt',
    class_weight=None,
    colsample_bytree=1.0,
    importance_type='split',
    learning_rate=0.1,
    max_depth=-1,
    min_child_samples=20,
    min_child_weight=0.001,
    min_split_gain=0.0,
    num_iterations=100,
    n_jobs=-1,
    num_leaves=31,
    objective='binary',
    random_state=0,
    reg_alpha=0.0,
    reg_lambda=0.0,
    subsample=1.0,
    subsample_for_bin=200000,
    subsample_freq=0,
    metric='binary_logloss',
)

In [14]:
# Fit the booster with the native LightGBM training API on the full training
# Dataset; model_params sets 'num_iterations' to 100 and no validation set is passed.
gbm_model = lgb.train(model_params, train_data)



In [15]:
# Persist the trained booster, then reload it from disk so every prediction
# below comes from the saved artefact.
model_path = 'CPP_REPRO_LGBM.mdl'
gbm_model.save_model(model_path)
gbm_model = lgb.Booster(model_file=model_path)

In [16]:
# Cut the test features into three consecutive chunks for chunk-wise prediction;
# the last chunk absorbs the remainder when the row count is not divisible by 3.
third = len(x_test) // 3
A = x_test.iloc[:third]
B = x_test.iloc[third:third * 2]
C = x_test.iloc[third * 2:]

In [17]:
# Score each chunk in order. raw_score=True requests raw model scores; they are
# only used for ranking coupons per user further down.
y_predA, y_predB, y_predC = (
    gbm_model.predict(chunk, raw_score=True) for chunk in (A, B, C)
)

In [18]:
# Stitch the chunk scores back together in the original row order of x_test.
y_pred = []
for chunk_scores in (y_predA, y_predB, y_predC):
    y_pred.extend(chunk_scores.tolist())

In [19]:
# Pair every (user, coupon) row of the test set with its model score.
pair_columns = ['USER_ID_hash', 'COUPON_ID_hash']
sub = test.loc[:, pair_columns].copy()
sub['TARGET'] = y_pred

In [20]:
# Group the scored (user, coupon) rows by user once, so get_top10 can fetch a
# user's candidate rows with grouped.get_group(...).
grouped = sub.groupby('USER_ID_hash')

In [21]:
def get_top10(row):
    """Return the user's ten highest-scored coupon ids as one space-separated string.

    `row` must expose a `USER_ID_hash` attribute (e.g. a row of the submission
    frame). The user's rows are looked up in the module-level `grouped` object
    and ordered by `TARGET`, highest first; fewer than ten ids are returned
    when the user has fewer candidate rows.
    """
    user_rows = grouped.get_group(row.USER_ID_hash)
    ranked = user_rows.sort_values(by=['TARGET'], ascending=False)
    top_ids = ranked.head(10).COUPON_ID_hash.values
    return ' '.join(str(coupon_id) for coupon_id in top_ids)

In [22]:
# Build one space-separated string of top coupon ids per submission row
# (row-wise apply with a tqdm progress bar), then write the submission file.
top10_per_user = submission.progress_apply(get_top10, axis=1)
submission['PURCHASED_COUPONS'] = top10_per_user
submission.to_csv('sub_CPP_REPRO_LGBM.csv', index=False)
submission

HBox(children=(FloatProgress(value=0.0, max=22873.0), HTML(value='')))




Unnamed: 0,USER_ID_hash,PURCHASED_COUPONS
0,0000b53e182165208887ba65c079fc21,c60dbd64087f40d46d539a96947d0e87 c988d799bc7db...
1,00035b86e6884589ec8d28fbf2fe7757,fc5f052a1bd97696fbcab35d8d974b73 d506a61810346...
2,0005b1068d5f2b8f2a7c978fcfe1ca06,46da51ba6dd20c514c2802f79a4e94b2 c988d799bc7db...
3,000cc06982785a19e2a2fdb40b1c9d59,79de77aa8c36fdf17cb3366e2084e353 e3e9027e1b87d...
4,0013518e41c416cd6a181d277dd8ca0b,c988d799bc7db9254fe865ee6cf2d4ff c60dbd64087f4...
...,...,...
22868,fff1a623187cefd7a594e338709b0f40,51da52d5516033bea13972588b671184 c988d799bc7db...
22869,fff4a076cfda6ff9dbe85e1cb678791b,79de77aa8c36fdf17cb3366e2084e353 e3e9027e1b87d...
22870,fff970d2014c3e10a77e38d540239017,46da51ba6dd20c514c2802f79a4e94b2 09aeb1dad89fe...
22871,fffafc024e264d5d539813444cf61199,63eeb16b672d0b8554acb63e88035ec8 c60dbd64087f4...
