In [1]:
import pandas as pd

In [2]:
# Load the raw inputs; both CSV files are read from the current working directory.
products, transactions = (
    pd.read_csv(csv_name) for csv_name in ("products.csv", "transactions.csv")
)

Для пользователя - [id_пользователя, кол-во купленных продуктов, показатель перезаказов]
Показатель перезаказов — это доля покупок пользователя, помеченных как повторные (сумма `reordered`, делённая на количество купленных позиций)

In [3]:
# Per-user aggregates: number of purchase rows and sum of the `reordered` flag.
users_embeddings = (
    transactions
    .groupby("user_id")
    .agg(n_bought=("product_id", "count"), reordered=("reordered", "sum"))
    .reset_index()
)

In [4]:
# Turn the per-user reorder sum into a share of that user's purchase rows.
users_embeddings["reordered"] = users_embeddings["reordered"].div(users_embeddings["n_bought"])

Для продуктов - [id продукта, процент перезаказа этого продукта среди всех пользователей, кол-во заказов данного продукта всеми пользователями, процент заказа данного продукта конкретным пользователем]

In [5]:
# Per-product aggregates: `user_id` holds the number of purchase rows for the
# product, `reordered` holds the sum of the `reordered` flag.
products_embeddings = (
    transactions
    .groupby("product_id")
    .agg(user_id=("user_id", "count"), reordered=("reordered", "sum"))
)

In [6]:
# Turn the per-product reorder sum into a share of that product's purchase rows.
products_embeddings["reordered"] = products_embeddings["reordered"].div(
    products_embeddings["user_id"]
)

In [7]:
# Give the two aggregate columns descriptive names.
products_embeddings = products_embeddings.rename(
    columns={"user_id": "n_ordered", "reordered": "reorder_coeff"}
)

In [8]:
# Move `product_id` from the index back into a regular column.
products_embeddings = products_embeddings.reset_index()

процент заказа данного продукта конкретным пользователем

In [9]:
# Count non-null `reordered` entries for every (user, product) pair.
product_user_coeff = transactions.groupby(["user_id", "product_id"])[["reordered"]].count()

In [10]:
# Name the per-pair count and flatten the (user_id, product_id) index into columns.
product_user_coeff = (
    product_user_coeff
    .rename(columns={"reordered": "user_ordered"})
    .reset_index()
)

In [11]:
# Attach each product's total order count to its per-user counts.
product_totals = products_embeddings[["product_id", "n_ordered"]]
product_user_coeff = product_totals.merge(product_user_coeff, on="product_id")

In [12]:
# Share of a product's total orders that came from this particular user.
product_user_coeff["product_user_coeff"] = product_user_coeff["user_ordered"].div(
    product_user_coeff["n_ordered"]
)

In [13]:
# Keep only the pair keys and the derived coefficient.
pair_columns = ["product_id", "user_id", "product_user_coeff"]
product_user_coeff = product_user_coeff.loc[:, pair_columns]

In [14]:
# Prefix the user features so they stay distinguishable after merging.
users_embeddings = users_embeddings.rename(
    columns={"n_bought": "user_total_bought", "reordered": "user_reorder_coeff"}
)

In [15]:
# Prefix the product features so they stay distinguishable after merging.
products_embeddings = products_embeddings.rename(
    columns={"n_ordered": "product_total_ordered", "reorder_coeff": "product_reorder_coeff"}
)

In [16]:
# Add the user-level features to every (product, user) row.
product_user_coeff = product_user_coeff.merge(users_embeddings, on="user_id")

In [17]:
# Add the product-level features to every (product, user) row.
product_user_coeff = product_user_coeff.merge(products_embeddings, on="product_id")

В качестве целевой переменной (y) можно, например, взять 1, если продукт был перезаказан, и 0, если он не покупался или не перезаказывался.

In [18]:
# Target per (product, user) pair: the maximum of the `reordered` flag over all of
# the pair's rows.
y = (
    transactions
    .groupby(["product_id", "user_id"])["reordered"]
    .max()
    .reset_index()
)

In [19]:
# Rename the aggregated flag so it cannot be confused with the raw `reordered` column.
y = y.rename(columns={"reordered": "reordered_target"})

In [20]:
# Final modelling table: features joined with the target on the pair keys.
df = product_user_coeff.merge(y, on=["product_id", "user_id"])

In [21]:
# Peek at the first rows of the assembled feature table.
df.head(n=5)

Unnamed: 0,product_id,user_id,product_user_coeff,user_total_bought,user_reorder_coeff,product_total_ordered,product_reorder_coeff,reordered_target
0,1,138,0.001291,148,0.628378,1549,0.678502,1.0
1,1,777,0.000646,113,0.734513,1549,0.678502,0.0
2,1,1052,0.001291,48,0.5,1549,0.678502,1.0
3,1,1480,0.001937,192,0.734375,1549,0.678502,1.0
4,1,1494,0.001937,158,0.550633,1549,0.678502,1.0


Давайте обсудим нейросеть:

Достаточно использовать только dense слои

Модель должна принимать на вход эмбеддинги пользователя и эмбеддинги какого-то продукта; на выходе получаем вероятность покупки этого продукта.

После этого для каждого пользователя сортируем все вероятности по убыванию и берём первые 10 продуктов.

Активация на выходе — сигмоида.

In [22]:
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including deprecation notices from the libraries used below — consider narrowing
# the filter to the specific warnings that are known to be noise.
warnings.filterwarnings("ignore")
import torch
import numpy as np
from sklearn.model_selection import train_test_split

In [23]:
# Feature columns fed to the network, in input order.
X_cols = [
    'product_user_coeff',
    'user_total_bought',
    'user_reorder_coeff',
    'product_total_ordered',
    'product_reorder_coeff',
]

In [24]:
# Target column(s); kept as a list so that `df[y_cols]` stays a 2-D DataFrame.
y_cols = ["reordered_target"]

In [25]:
# Split the modelling table into the feature frame and the target frame.
X, y = df[X_cols], df[y_cols]

In [26]:
# X_train, X_test, y_train, y_test = train_test_split(X, y,
#                                                     random_state=42)

In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [28]:
class Net(nn.Module):
    """Fully connected binary classifier that outputs a probability per input row.

    The network is a stack of seven linear layers (``fc1`` … ``fc7``) with ReLU
    activations between them and a sigmoid on the single output unit.

    Args:
        in_features: Number of input features per row (default 5).
        hidden_features: Width of the hidden layers ``fc1`` … ``fc5`` (default 120).
        bottleneck_features: Width of the last hidden layer ``fc6`` (default 84).

    The defaults reproduce the original fixed architecture, so ``Net()`` builds the
    same layers, in the same order, under the same attribute names.
    """

    def __init__(self, in_features=5, hidden_features=120, bottleneck_features=84):
        super().__init__()
        # Layers are created in a fixed order so seeded initialisation is unchanged.
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.fc2 = nn.Linear(hidden_features, hidden_features)
        self.fc3 = nn.Linear(hidden_features, hidden_features)
        self.fc4 = nn.Linear(hidden_features, hidden_features)
        self.fc5 = nn.Linear(hidden_features, hidden_features)
        self.fc6 = nn.Linear(hidden_features, bottleneck_features)
        self.fc7 = nn.Linear(bottleneck_features, 1)

    def forward(self, x):
        """Map a ``(..., in_features)`` tensor to ``(..., 1)`` values in [0, 1]."""
        hidden_layers = (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5, self.fc6)
        for layer in hidden_layers:
            x = F.relu(layer(x))
        # torch.sigmoid computes the same function as F.sigmoid; the functional
        # alias has emitted a deprecation warning in some PyTorch releases.
        return torch.sigmoid(self.fc7(x))

In [29]:
from torch.utils.data import Dataset, DataLoader

# Convert data to torch tensors
class Data(Dataset):
    """In-memory dataset pairing a float32 feature tensor with a float32 target tensor.

    Args:
        X: NumPy array of features; cast to float32 and wrapped as ``self.X``.
        y: NumPy array of targets; cast to float32 and wrapped as ``self.y``.

    ``self.len`` stores the size of the first dimension of ``self.X``.
    """

    def __init__(self, X, y):
        features = X.astype(np.float32)
        targets = y.astype(np.float32)
        self.X = torch.from_numpy(features)
        self.y = torch.from_numpy(targets)
        self.len = self.X.size(0)

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        sample = (self.X[index], self.y[index])
        return sample

    def __len__(self):
        """Return the number of rows recorded at construction time."""
        return self.len

In [30]:
# Seed PyTorch's global RNG before creating the model so that weight initialisation
# (and later draws from the global RNG) are repeatable on "Restart & Run All".
# 42 matches the random_state used in the commented-out train/test split.
torch.manual_seed(42)

model = Net()
# BCELoss consumes probabilities, which Net already produces via its sigmoid output.
loss_fn = nn.BCELoss()

In [31]:
# Prefer the GPU, but fall back to the CPU instead of raising on machines without CUDA.
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

In [32]:
# Number of rows per mini-batch.
batch_size = 2048

# Wrap the full feature/target arrays; shuffle=True reorders the rows every epoch.
train_data = Data(X=X.values, y=y.values)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# test_data = Data(X_test.values, y_test.values)
# test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=True)

In [33]:
from tqdm import tqdm
import torch.optim as optim

In [34]:
# Plain SGD with momentum over all trainable parameters of the model.
optimizer = optim.SGD(
    params=model.parameters(),
    lr=1e-3,
    momentum=0.9,
)


In [35]:
num_epochs = 300
loss_values = []  # one entry per optimisation step (mini-batch), across all epochs

# Train on whatever device the model currently lives on rather than assuming CUDA.
device = next(model.parameters()).device
model.train()

for epoch in tqdm(range(num_epochs)):
    # Batch-specific names: reusing `X` / `y` here would overwrite the feature and
    # target DataFrames defined earlier in the notebook.
    for batch_X, batch_y in train_dataloader:
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        pred = model(batch_X)
        # BCELoss needs prediction and target to have the same shape.
        loss = loss_fn(pred, batch_y.reshape(pred.shape))
        loss_values.append(loss.item())
        loss.backward()
        optimizer.step()

print("Training Complete")

100%|██████████| 300/300 [6:59:10<00:00, 83.83s/it]   

Training Complete





In [36]:
# Bring the trained weights back to host memory for inference.
model = model.cpu()

In [None]:
# Score the rows in their stored order. The training loader was built with
# shuffle=True, so iterating it would return predictions in a random order that no
# longer lines up with the rows of `df`.
inference_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=False)
inference_device = next(model.parameters()).device

predictions = []

model.eval()
# Gradients are not needed for scoring; disabling them avoids building the graph.
with torch.no_grad():
    for batch_X, _ in tqdm(inference_dataloader):
        pred = model(batch_X.to(inference_device))
        predictions.append(pred.cpu().numpy())

 31%|███       | 1442/4619 [00:31<01:02, 50.60it/s]

In [None]:
# Stack the per-batch (batch, 1) arrays row-wise and store them as a new column.
df["predictions"] = np.concatenate(predictions, axis=0)

In [39]:
# Persist the scored table (the DataFrame index is written as the first column).
df.to_csv(path_or_buf="predictions.csv")

In [45]:
# For every user, list product ids ordered by descending predicted probability.
ranked_lists = (
    df
    .sort_values(["user_id", "predictions"], ascending=[False, False])
    .groupby("user_id")["product_id"]
    .agg(list)
)

Top 20 best-selling products (used to pad recommendation lists shorter than 10)

In [60]:
# Ids of the 20 products with the most purchase rows, most popular first.
top_20 = list(
    transactions
    .groupby("product_id")["user_id"]
    .count()
    .sort_values()
    .iloc[-20:]
    .sort_values(ascending=False)
    .index
)

In [73]:
def fill_10(row, fallback=None):
    """Return exactly the first 10 recommendations for one user, padding if needed.

    Args:
        row: Object exposing ``product_id`` — the user's ranked list of product ids
            (a row of ``ranked_lists`` when used with ``DataFrame.apply(axis=1)``).
        fallback: Ordered product ids used for padding. Defaults to the notebook-level
            ``top_20`` list when omitted.

    Returns:
        A list of at most 10 product ids: the user's own ranking first, followed by
        fallback products that the user's list does not already contain.
    """
    if fallback is None:
        fallback = top_20
    pids = list(row.product_id)
    # The length of the recommendation list is what matters here; `len(row)` would
    # measure the number of fields in the row, not the number of products.
    if len(pids) >= 10:
        return pids[:10]
    already_listed = set(pids)
    padding = [pid for pid in fallback if pid not in already_listed]
    return (pids + padding)[:10]

In [68]:
# Turn the user_id index into a column so each row carries both fields.
ranked_lists = ranked_lists.reset_index(drop=False)

In [77]:
# Build the final 10-item list for every user, one row at a time.
predictions_list = ranked_lists.apply(fill_10, axis="columns")

In [82]:
# Replace the full ranking with the padded / truncated 10-item list.
ranked_lists["product_id"] = predictions_list

In [84]:
# Serialise each list as a single space-separated string of product ids.
ranked_lists["product_id"] = ranked_lists["product_id"].map(
    lambda pids: " ".join(str(pid) for pid in pids)
)

In [87]:
# Make sure the output header is exactly the two expected column names.
ranked_lists = ranked_lists.set_axis(["user_id", "product_id"], axis=1)

In [89]:
# Write the submission file without the DataFrame index.
ranked_lists.to_csv(path_or_buf="sample_submission_2.csv", index=False)