In [None]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
from math import log10
from math import pow

In [None]:
# Load the raw tables: user list, train/test coupon lists, purchase details
# and the visit log.
ulist = pd.read_csv('data/user_list.csv')
cpltr = pd.read_csv('data/coupon_list_train.csv')
cplte = pd.read_csv('data/coupon_list_test.csv')
cpdtr = pd.read_csv('data/coupon_detail_train.csv')
cpvtr = pd.read_csv('data/coupon_visit_train.csv')
# Keep only the visits whose PURCHASE_FLG is 0 and rename the coupon key to
# COUPON_ID_hash, the column the later merges join on.
not_purchased = cpvtr.PURCHASE_FLG == 0
cpvtr = cpvtr.loc[not_purchased, ['VIEW_COUPON_ID_hash', 'USER_ID_hash']]
cpvtr = cpvtr.rename(columns={'VIEW_COUPON_ID_hash': 'COUPON_ID_hash'})

In [None]:
# Fix validity period: a missing VALIDPERIOD becomes 0, and every value that
# is positive after the +1 shift is collapsed to 1.
for coupons in (cpltr, cplte):
    # NaN -> -1 -> 0 after the shift; any original value v becomes v + 1.
    shifted = coupons.VALIDPERIOD.fillna(-1) + 1
    # Keep entries that are <= 0 untouched, replace everything else with 1.
    coupons['VALIDPERIOD'] = shifted.where(shifted <= 0, 1)

In [None]:
# Compute sums for usable dates.
# Columns 11..19 are addressed by position in both coupon lists (presumably
# the nine USABLE_DATE_* flags -- confirm against the CSV header).
# Missing entries become 0 and anything above 1 is capped at 1, then the nine
# flags are summed per coupon into USABLE_DATE_sum.
# The original capped only the training list: the test-list statement was
# missing its "= 1" and therefore did nothing. It also relied on
# DataFrame.icol, which no longer exists in pandas.
for coupons in (cpltr, cplte):
    usable_cols = coupons.columns[11:20]
    coupons[usable_cols] = coupons[usable_cols].fillna(0).clip(upper=1)
    coupons['USABLE_DATE_sum'] = coupons[usable_cols].sum(axis=1)

In [None]:
# Create training set: purchase details inner-joined with the coupon
# attributes, reduced to the columns used as features (plus the two id keys).
train_columns = [
    "COUPON_ID_hash",
    "USER_ID_hash",
    "GENRE_NAME",
    "DISCOUNT_PRICE",
    "DISPPERIOD",
    "large_area_name",
    "small_area_name",
    "VALIDPERIOD",
    "USABLE_DATE_sum",
]
train = cpdtr.merge(cpltr, how='inner', on='COUPON_ID_hash', sort=True)[train_columns]

In [None]:
# Create test set: tag every test coupon with a placeholder user so the rows
# can be told apart after they are stacked with the training rows, then keep
# the same column layout as the training set.
test_columns = [
    "COUPON_ID_hash",
    "USER_ID_hash",
    "GENRE_NAME",
    "DISCOUNT_PRICE",
    "DISPPERIOD",
    "large_area_name",
    "small_area_name",
    "VALIDPERIOD",
    "USABLE_DATE_sum",
]
cplte['USER_ID_hash'] = 'dummyuser'
cplte = cplte.loc[:, test_columns]

In [None]:
# Stack the test coupons under the training rows so that both pass through
# the same feature engineering and dummy encoding below.
# Note: re-running this cell appends the test rows again.
train = pd.concat([train, cplte], axis=0)

In [None]:
# Create views set: non-purchase visits inner-joined with the coupon
# attributes, using the same column layout as the training set.
view_columns = [
    "COUPON_ID_hash",
    "USER_ID_hash",
    "GENRE_NAME",
    "DISCOUNT_PRICE",
    "DISPPERIOD",
    "large_area_name",
    "small_area_name",
    "VALIDPERIOD",
    "USABLE_DATE_sum",
]
trainv = cpvtr.merge(cpltr, how='inner', on='COUPON_ID_hash', sort=True)[view_columns]

In [None]:
# Fill NAs with 1 in both frames (unnecessary, there are no NA values).
train, trainv = train.fillna(1), trainv.fillna(1)

In [None]:
# Feature engineering on `train` (which still contains the test rows);
# `trainv` is not transformed here.
# Non-positive prices are replaced by a tiny positive value so log10 is defined.
non_positive = train.DISCOUNT_PRICE <= 0
train.loc[non_positive, 'DISCOUNT_PRICE'] = 0.00001
# Price feature is 1 / log10(price). NOTE: log10(1) == 0, so a price of
# exactly 1 raises ZeroDivisionError here.
train['DISCOUNT_PRICE'] = train.DISCOUNT_PRICE.map(lambda price: 1. / log10(price))
# Display period is capped at 7 and scaled by 7; the usable-date sum is scaled by 9.
train['DISPPERIOD'] = train.DISPPERIOD.clip(upper=7) / 7
train['USABLE_DATE_sum'] = train.USABLE_DATE_sum / 9

In [None]:
# Convert the categorical columns of both frames to dummy (one-hot) columns.
categorical = ['GENRE_NAME', 'large_area_name', 'small_area_name', 'VALIDPERIOD']
train = pd.get_dummies(train, columns=categorical)
trainv = pd.get_dummies(trainv, columns=categorical)

In [None]:
# Separate test and train again: rows tagged with the placeholder user are the
# test coupons (the user column is dropped from them); the rest stay in train.
is_test = train.USER_ID_hash == 'dummyuser'
test = train.loc[is_test].drop('USER_ID_hash', axis=1)
train = train.loc[~is_test]

In [None]:
# Numeric attributes set to 1 before cosine, for purchases AND views.
# The original second stanza assigned DISPPERIOD and USABLE_DATE_sum on
# `train` again (a no-op repeat) instead of `trainv`, so the view rows kept
# their raw, unscaled values for those two columns. Both frames are now
# treated the same way.
# NOTE(review): this changes the aggregated user profiles and therefore the
# final scores -- confirm the previous behaviour was not intentional.
numeric_cols = ['DISCOUNT_PRICE', 'DISPPERIOD', 'USABLE_DATE_sum']
for frame in (train, trainv):
    for col in numeric_cols:
        frame[col] = 1

In [None]:
# Discount view weight: every column from position 2 onwards (i.e. everything
# after the first two columns) of the views frame is multiplied by VIEW_WEIGHT.
# Done as one vectorised assignment; the original looped over
# DataFrame.icol, which no longer exists in pandas.
# Note: re-running this cell applies the weight again.
VIEW_WEIGHT = 0.005
view_feature_cols = trainv.columns[2:]
trainv[view_feature_cols] = trainv[view_feature_cols] * VIEW_WEIGHT

In [None]:
# Stack the purchase rows and the down-weighted view rows into one frame.
ct = pd.concat([train, trainv], axis=0)

In [None]:
# Aggregate coupon details to get user characteristics: drop the coupon key,
# then sum every remaining column per user.
ct = ct.drop('COUPON_ID_hash', axis=1)
grouped = ct.groupby(['USER_ID_hash'])
uchar = grouped.sum()

In [None]:
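# Debugging aid: uncomment to list the position and name of every uchar
# column (the diagonal weights in WM / WF are applied in this column order).
#for i, c in enumerate(uchar.columns):
#    print(i, c)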

In [None]:
# Weight matrices: diagonal matrices holding one weight per feature column.
# WF is the matrix applied to users whose SEX_ID is 'f'; WM is used otherwise.
# Block sizes, in column order:
#   discount price, disp period, usable date sum, genre, large area,
#   small area, validperiod
group_sizes = [1, 1, 1, 13, 9, 55, 2]
WM = np.diag(np.repeat([1.25, 1.25, 0.35, 2.0, 1.0, 4.5, 0.625], group_sizes))
WF = np.diag(np.repeat([0.75, 1.50, 0.25, 1.75, 1.0, 4.5, 0.625], group_sizes))

In [None]:
# Get sexes: look up SEX_ID for every user present in uchar, in uchar's row
# order, so the result can be used as a row mask on the score matrix.
# reset_index() turns the USER_ID_hash index into an ordinary column. The
# original instead added a USER_ID_hash column next to an index of the same
# name, which current pandas rejects in merge(on=...) as an ambiguous key.
uchar_sex = uchar.reset_index()
sexes = pd.merge(uchar_sex, ulist, how='inner', on='USER_ID_hash')
sexes = sexes[['USER_ID_hash', 'SEX_ID']]

In [None]:
# Compute scores: weighted dot product between every user profile (rows of
# uchar) and every test coupon (rows of test), once per weight matrix.
# DataFrame.as_matrix() was removed from pandas; .values is the equivalent
# that exists in old and new versions. The float cast keeps the matrices
# numeric even when the dummy columns are stored as booleans.
# Note: the `del` makes this cell fail if it is re-run on the same kernel.
del test['COUPON_ID_hash']
user_features = uchar.values.astype(float)
coupon_features = test.values.astype(float).T
scoremxm = np.dot(np.dot(user_features, WM), coupon_features)
scoremxf = np.dot(np.dot(user_features, WF), coupon_features)
score = DataFrame(scoremxm, index=uchar.index, columns=cplte.COUPON_ID_hash)
scoref = DataFrame(scoremxf, index=uchar.index, columns=cplte.COUPON_ID_hash)
# Rows of users whose SEX_ID is 'f' take the scores computed with WF.
# The mask is positional, so `sexes` must have one row per uchar row.
is_female = (sexes.SEX_ID == 'f').values
score.loc[is_female] = scoref.loc[is_female]

In [None]:
# get recommended coupons
def get_recommended_coupons(row):
    """Return the ten best-scoring coupon ids of one user as a single string.

    row: Series of scores indexed by coupon id (string labels).
    Returns the index labels of the ten largest values, best first, joined by
    single spaces; fewer labels if the row has fewer than ten entries.

    Uses sort_values() because the in-place Series.sort() no longer exists in
    pandas; as a side benefit the input row is no longer mutated.
    """
    ranked = row.sort_values(ascending=False)
    return " ".join(ranked.index[:10])
# One recommendation string per user (row of the score matrix).
rec = score[:].apply(get_recommended_coupons, axis=1)

In [None]:
# Join recommendations with the sample submission's user list.
ssub = pd.read_csv('data/sample_submission.csv')
del ssub['PURCHASED_COUPONS']
# Build the right-hand frame from plain arrays so it has a default index.
# The original copied the index into a USER_ID_hash column; when that index
# is itself named USER_ID_hash, current pandas rejects merge(on=...) because
# the key is both an index level and a column label.
recf = DataFrame({'PURCHASED_COUPONS': rec.values,
                  'USER_ID_hash': rec.index.values})
sub = pd.merge(ssub, recf, how='outer', on='USER_ID_hash')
# Users without a recommendation get an empty string.
sub = sub.fillna("")

In [None]:
# Write the recommendations to the submission CSV (without the row index).
submission_path = 'submission/mod_cos_sim_py.csv'
sub.to_csv(submission_path, index=False)