In [104]:
import pandas as pd
import os
import numpy as np
import itertools
import lightgbm as lgb
from scipy.spatial.distance import euclidean
import joblib
from tqdm.notebook import tqdm

# Notebook-wide setup.
# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
# Register tqdm's pandas integration; the submission cell relies on `progress_apply`.
tqdm.pandas()

  from pandas import Panel


# Read and Preprocess

In [105]:
# Directory that holds the competition files (sample_submission.csv is read from it).
# Set the CPP_DATASET_DIR environment variable to run the notebook on another machine;
# when it is unset the original hard-coded location is used, so existing runs are unchanged.
dsdir = os.environ.get(
    'CPP_DATASET_DIR',
    'C:\\Users\\Daniel\\Downloads\\3_Plus_1\\RecommendationEngine\\coupon-purchase-prediction\\dataset',
)

In [106]:
# Training pairs: load, then shuffle the rows reproducibly (fixed seed) and renumber the index.
train = pd.read_csv('CPP_REPRO_coupon_list_train.csv')
train = train.sample(frac=1, random_state=0).reset_index(drop=True)

# Test pairs keep their on-disk order; predictions are later attached positionally.
test = pd.read_csv('CPP_REPRO_coupon_list_test.csv')

# Submission template, read from the dataset directory.
submission_path = os.path.join(dsdir, 'sample_submission.csv')
submission = pd.read_csv(submission_path)

In [107]:
# Remove the same set of columns from both frames, in place, so train and test
# keep an identical layout for the steps that follow.
unused_columns = ['DISPFROM', 'DISPEND', 'VALIDFROM', 'VALIDEND', 'REG_DATE', 'WITHDRAW_DATE']
for frame in (train, test):
    frame.drop(columns=unused_columns, inplace=True)

In [108]:
# Boolean feature: True where the coupon's prefecture equals the user's prefecture.
train['SAME_PREF'] = train['PREF_NAME_COUPON'].eq(train['PREF_NAME_USER'])
test['SAME_PREF'] = test['PREF_NAME_COUPON'].eq(test['PREF_NAME_USER'])

In [109]:
def haversine(lat1, lat2, lon1, lon2, radius=6372.8):
    """Great-circle distance between two points given in decimal degrees.

    Works element-wise on scalars, NumPy arrays and pandas Series.

    NOTE: the argument order is (lat1, lat2, lon1, lon2) -- both latitudes first,
    then both longitudes -- not the more common (lat1, lon1, lat2, lon2).

    Parameters
    ----------
    lat1, lat2 : latitude of the first / second point, in degrees.
    lon1, lon2 : longitude of the first / second point, in degrees.
    radius : sphere radius; the result is expressed in the same unit.
        Defaults to 6372.8 (kilometres; use 3959.87433 for miles).

    Returns
    -------
    Distance along the sphere's surface, same shape/type as the broadcast inputs.
    NaN inputs propagate to NaN outputs.
    """
    dlat_rad = np.radians(lat2 - lat1)
    dlon_rad = np.radians(lon2 - lon1)
    lat1_rad = np.radians(lat1)
    lat2_rad = np.radians(lat2)

    a = np.sin(dlat_rad / 2) ** 2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon_rad / 2) ** 2
    # Mathematically 0 <= a <= 1, but rounding can push it a hair past 1 for
    # (near-)antipodal points, which would make arcsin return NaN. Clamp it.
    a = np.clip(a, 0.0, 1.0)

    return 2 * radius * np.arcsin(np.sqrt(a))

In [110]:
# Coupon-to-user distance features, computed column-wise (vectorised) for speed.
# The same three columns are added to both frames, in the same order.
for frame in (train, test):
    coupon_lat, user_lat = frame['LATITUDE_COUPON'], frame['LATITUDE_USER']
    coupon_lon, user_lon = frame['LONGITUDE_COUPON'], frame['LONGITUDE_USER']

    # haversine takes both latitudes first, then both longitudes.
    frame['HAVERSINE_DIST'] = haversine(coupon_lat, user_lat, coupon_lon, user_lon)
    frame['LATITUDE_DELTA'] = coupon_lat - user_lat
    frame['LONGITUDE_DELTA'] = coupon_lon - user_lon

In [111]:
# Feature matrices: everything except the identifier columns (and, for train, the label).
id_columns = ['USER_ID_hash', 'COUPON_ID_hash']
x_train = train.drop(columns=id_columns + ['TARGET'])
y_train = train['TARGET'].values.reshape(-1)  # flat 1-D label vector
x_test = test.drop(columns=id_columns)

# Columns still stored as object dtype; these are the ones treated as categorical below.
categoricals = [name for name, dtype in x_train.dtypes.items() if dtype == 'object']
categoricals

['CAPSULE_TEXT',
 'GENRE_NAME',
 'LARGE_AREA_NAME',
 'PREF_NAME_COUPON',
 'SMALL_AREA_NAME',
 'SEX_ID',
 'PREF_NAME_USER']

In [112]:
# Convert the object-dtype feature columns to pandas 'category' dtype in both matrices.
for features in (x_train, x_test):
    features[categoricals] = features[categoricals].astype('category')

In [113]:
# Put x_test on exactly x_train's columns, in x_train's order: join='left' keeps the
# left frame's column labels, and axis=1 restricts the alignment to columns.
aligned_frames = x_train.align(x_test, join='left', axis=1)
x_train, x_test = aligned_frames

In [114]:
# Wrap the training matrix and labels for LightGBM's native API, naming every
# feature explicitly and flagging which of them are categorical.
feature_names = x_train.columns.values.tolist()
train_data = lgb.Dataset(
    data=x_train,
    label=y_train,
    feature_name=feature_names,
    categorical_feature=categoricals,
)

# Training

In [115]:
# Parameters for the native `lgb.train` call below.
#
# The names follow the scikit-learn-style aliases that LightGBM also accepts in its
# native API (e.g. `n_estimators`, `colsample_bytree`, `random_state`).
#
# Three keys that only exist as arguments of the scikit-learn wrapper were removed:
# `silent`, `class_weight` and `importance_type`. `lgb.train` does not use them, and
# `silent` triggered the "Please use silent argument of the Dataset constructor"
# warning on every run.
model_params = {
    # Booster / objective
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',

    # Ensemble size and step
    'n_estimators': 100,
    'learning_rate': 0.1,

    # Tree shape
    'num_leaves': 31,
    'max_depth': -1,  # -1 = no depth limit
    'min_child_samples': 20,
    'min_child_weight': 0.001,
    'min_split_gain': 0.0,

    # Row / column sampling (all disabled: use every row and every feature)
    'subsample': 1.0,
    'subsample_freq': 0,
    'colsample_bytree': 1.0,
    'subsample_for_bin': 200000,

    # Regularisation
    'reg_alpha': 0.0,
    'reg_lambda': 0.0,

    # Runtime
    'n_jobs': -1,
    'random_state': 0,
}

In [116]:
# Fit the booster on the full training set with the parameters defined above.
gbm_model = lgb.train(params=model_params, train_set=train_data)

Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


In [117]:
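# Optional: uncomment the first line to save the trained booster to disk, or the
# second line to reload a previously saved booster instead of retraining.
# gbm_model.save_model('CPP_REPRO_LGBM.mdl')
# gbm_model = lgb.Booster(model_file='CPP_REPRO_LGBM.mdl')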

<lightgbm.basic.Booster at 0x16f20765048>

In [118]:
# Cut the test matrix into three consecutive row chunks so prediction runs piecewise.
# The first two chunks have len(x_test) // 3 rows each; the last takes the remainder.
third = len(x_test) // 3
A = x_test.iloc[:third]
B = x_test.iloc[third:third * 2]
C = x_test.iloc[third * 2:]

In [119]:
# Score the three chunks in order. raw_score=True requests the model's raw scores;
# they are only used for ranking coupons per user further down.
chunk_scores = [gbm_model.predict(chunk, raw_score=True) for chunk in (A, B, C)]
y_predA, y_predB, y_predC = chunk_scores

In [120]:
# Stitch the per-chunk scores back into one flat list, in the original test-row order.
y_pred = [*y_predA.tolist(), *y_predB.tolist(), *y_predC.tolist()]

In [121]:
# One row per (user, coupon) test pair with its model score attached positionally.
pair_ids = test[['USER_ID_hash','COUPON_ID_hash']]
sub = pair_ids.assign(TARGET=y_pred)

In [122]:
# Group the scored pairs by user once, so each user's rows can be looked up by key.
grouped = sub.groupby(by='USER_ID_hash')

In [123]:
def get_top10(row):
    """Return the user's ten highest-scoring coupon hashes as one space-separated string.

    `row` is a row of the submission frame; only its `USER_ID_hash` is read. The
    user's scored pairs are looked up in the module-level `grouped` object, so
    `get_group` raises KeyError for a user that has no rows there. Users with
    fewer than ten pairs yield a correspondingly shorter string.
    """
    user_pairs = grouped.get_group(row.USER_ID_hash)
    ranked = user_pairs.sort_values(by=['TARGET'],ascending=False)
    best_coupons = ranked.head(10).COUPON_ID_hash.values
    return ' '.join(str(coupon) for coupon in best_coupons)

In [124]:
# Build each user's recommendation string row by row (with a progress bar),
# store it in the submission template, write the file and show the result.
top10_per_user = submission.progress_apply(get_top10, axis=1)
submission['PURCHASED_COUPONS'] = top10_per_user
submission.to_csv('sub_CPP_REPRO_LGBM.csv', index=False)
submission

HBox(children=(FloatProgress(value=0.0, max=22873.0), HTML(value='')))




Unnamed: 0,USER_ID_hash,PURCHASED_COUPONS
0,0000b53e182165208887ba65c079fc21,c1812b1c062f096ce5e58cfd6ba86d62 9fe88dabce140...
1,00035b86e6884589ec8d28fbf2fe7757,fc5f052a1bd97696fbcab35d8d974b73 262572324a598...
2,0005b1068d5f2b8f2a7c978fcfe1ca06,c1812b1c062f096ce5e58cfd6ba86d62 9fe88dabce140...
3,000cc06982785a19e2a2fdb40b1c9d59,79de77aa8c36fdf17cb3366e2084e353 784c1314b9f64...
4,0013518e41c416cd6a181d277dd8ca0b,c988d799bc7db9254fe865ee6cf2d4ff 0c01530659756...
...,...,...
22868,fff1a623187cefd7a594e338709b0f40,51da52d5516033bea13972588b671184 c988d799bc7db...
22869,fff4a076cfda6ff9dbe85e1cb678791b,79de77aa8c36fdf17cb3366e2084e353 8c470d8651dbc...
22870,fff970d2014c3e10a77e38d540239017,46da51ba6dd20c514c2802f79a4e94b2 f453a31322bc6...
22871,fffafc024e264d5d539813444cf61199,63eeb16b672d0b8554acb63e88035ec8 c1812b1c062f0...
