In [6]:
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import pickle
import random
import numpy as np
import datetime
import time
import torch

In [2]:
historical_transactions = pd.read_csv("dataset/historical_transactions.csv")

In [8]:
history = {}

In [9]:
pbar = tqdm(total=len(historical_transactions))
for index, row in historical_transactions.iterrows():
    key = (row['card_id'], row['merchant_id'])
    if key not in history:
        history[key] = (None, None)
    purchase_date = datetime.datetime.strptime(row['purchase_date'], '%Y-%m-%d %H:%M:%S')
    if history[key][0] is None or purchase_date < history[key][0]:
        history[key] = (purchase_date, history[key][0])
    elif history[key][1] is None or purchase_date < history[key][1]:
        history[key] = (history[key][0], purchase_date)
    pbar.update(1)
pbar.close()

In [118]:
print 'Processing...'
data = [(d[0], d[1], (time.mktime(history[d][1].timetuple())-time.mktime(history[d][0].timetuple()))/1000000.0) for d in history if history[d][1] is not None]
random.shuffle(data)
train, test = data[:-1000], data[-1000:]

Processing...


In [121]:
dim = 64
class embedding(torch.nn.Module):
    def __init__(self, user_size, item_size):
        super(embedding, self).__init__()
        self.embeddings = torch.nn.Embedding(user_size, dim)
        self.embeddings2 = torch.nn.Embedding(item_size, dim)
        self.linear1 = torch.nn.Linear(dim*2, 32)
        self.linear2 = torch.nn.Linear(32, 1)
    def forward(self, inputs, item):
        inputs = self.embeddings(inputs)
        item = self.embeddings2(item)
        inputs = torch.cat((inputs, item), -1)
        inputs = torch.nn.functional.relu(self.linear1(inputs))
        out = self.linear2(inputs)
        return out

In [122]:
maps, map_i, count, count2 = {}, {}, 0, 0
for i in range(len(data)):
    if data[i][0] not in maps:
        maps[data[i][0]] = count
        count += 1
    if data[i][1] not in map_i:
        map_i[data[i][1]] = count2
        count2 += 1
model = embedding(count, count2)
loss_function = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

In [125]:
prev_MSE = 10**20
B = 1000
model = model.cuda()
while True:
    model.train()
    idx = 0
    while idx < len(train):
        batch = train[idx:idx+B]
        optimizer.zero_grad()
        x = [maps[u] for u, i, r in batch]
        x = torch.Tensor([x]).long()
        x = torch.autograd.Variable(x).cuda()
        x2 = [map_i[i] for u, i, r in batch]
        x2 = torch.Tensor([x2]).long()
        x2 = torch.autograd.Variable(x2).cuda()
        r = torch.Tensor([r for u, i, r in batch]).cuda()
        #r = torch.autograd.Variable(r)
        y = model(x, x2)
        loss = loss_function(y.view(-1), r)
        loss.backward()
        optimizer.step()
        idx += B
        
    model.eval()
    MSE = 0
    for u, i, r in test:
        x = maps[u]
        x = torch.Tensor([x]).long()
        x = torch.autograd.Variable(x).cuda()
        x2 = map_i[i]
        x2 = torch.Tensor([x2]).long()
        x2 = torch.autograd.Variable(x2).cuda()
        #r = torch.autograd.Variable(r)
        y = model(x, x2) 
        MSE += (y.view(-1).cpu().data - r)**2
    MSE = float(MSE) / len(test)
    print 'MSE: ', MSE
    if MSE > prev_MSE:
        break
    prev_MSE = MSE

MSE:  32.9577265625
MSE:  32.959703125


# Loyalty

In [52]:
trainset = pd.read_csv('dataset/train.csv')

In [103]:
dim = 64
class loyalty(torch.nn.Module):
    def __init__(self, emb):
        super(loyalty, self).__init__()
        self.embeddings = emb
        self.linear1 = torch.nn.Linear(dim+3, 32)
        self.linear2 = torch.nn.Linear(32, 1)
    def forward(self, inputs, features):
        inputs = self.embeddings(inputs)
#        print inputs.shape, features.shape
        inputs = torch.cat((inputs, features), 2)
        inputs = torch.nn.functional.relu(self.linear1(inputs))
        out = self.linear2(inputs)
        return out

In [104]:
model2 = loyalty(model.embeddings)
loss_function = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model2.parameters(), lr=0.001)

In [97]:
print 'Processing...'
data = [(d['card_id'], [d['feature_1'], d['feature_2'], d['feature_3']], d['target']) for d in trainset.to_dict(orient='records')]
random.shuffle(data)
train, test = data[:-1000], data[-1000:]

Processing...


In [113]:
prev_MSE = 10**20
B = 1000
model2 = model2.cuda()
while True:
    model2.train()
    idx = 0
    while idx < len(train):
        batch = train[idx:idx+B]
        optimizer.zero_grad()
        x = [maps[u] for u, fts, r in batch if u in maps]
        x = torch.Tensor([x]).long()
        x = torch.autograd.Variable(x).cuda()
        ft = [fts for u, fts, r in batch if u in maps]
        ft = torch.Tensor([ft])
        ft = torch.autograd.Variable(ft).cuda()
        r = torch.Tensor([r for u, fts, r in batch if u in maps]).cuda()
        #r = torch.autograd.Variable(r)
        y = model2(x, ft)
        loss = loss_function(y.view(-1), r)
        loss.backward()
        optimizer.step()
        idx += B
        
    model2.eval()
    MSE = 0
    for u, fts, r in test:
        if u not in maps:
            continue
        x = maps[u]
        x = torch.Tensor([[x]]).long()
        x = torch.autograd.Variable(x).cuda()
        ft = torch.Tensor([[fts]])
        ft = torch.autograd.Variable(ft).cuda()
        #r = torch.autograd.Variable(r)
        y = model2(x, ft) 
        MSE += (y.view(-1).cpu().data - r)**2
    MSE = float(MSE) / len(test)
    print 'RMSE: ', np.sqrt(MSE)
    if MSE > prev_MSE:
        break
    prev_MSE = MSE

RMSE:  3.8272846016301165
RMSE:  3.827471117624534


# Submission

In [111]:
testset, pred = pd.read_csv('dataset/test.csv'), []
for idx,row in enumerate(testset.to_dict(orient='records')):
    model2.eval()
    u, ft = row['card_id'], [row['feature_1'], row['feature_2'], row['feature_3']]
    if u not in maps:
        pred.append(0)
        continue
    x = maps[u]
    x = torch.Tensor([[x]]).long()
    x = torch.autograd.Variable(x).cuda()
    ft = torch.Tensor([[fts]])
    ft = torch.autograd.Variable(ft).cuda()
    #r = torch.autograd.Variable(r)
    y = model2(x, ft) 
    y = float(y.view(-1).cpu().data)
    pred.append(y)
df_sub = pd.DataFrame({"card_id":testset["card_id"].values})
df_sub["target"] = pred
df_sub.to_csv("submission.csv", index=False)

In [114]:
np.mean(pred)

-0.3762698570282051