In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics import pairwise, pairwise_distances
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import json
import networkx as nx
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from collections import defaultdict
##

import os.path as osp

import argparse
import torch
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, GAE, VGAE
from torch_geometric.utils import train_test_split_edges
from torch_geometric.utils import from_networkx
from itertools import product

In [2]:
def nan_to_mode(df):
    """
    Replace NaN with mode in dataframe
    """
    df_mode=df.mode()
    for x in df.columns.values:
        df.loc[:, x]=df.loc[:, x].fillna(value=df_mode[x].iloc[0])

    return df


In [3]:
def is_user(node_id, user2node):
    if node_id in user2node.values():
        return True
    else:
        return False

In [4]:
dataset = pd.read_csv("data/dataset_split/train.csv", index_col=0)

In [5]:
dataset.head()

Unnamed: 0,fit,user_id,bust size,item_id,weight,rating,rented for,review_text,body type,review_summary,category,height,size,age,review_date,class_rating
46628,fit,345809,36b,326784,150lbs,5.0,wedding,I wore this dress for my bridal shower this pa...,athletic,Great Bridal Shower Dress,dress,"5' 6""",16,30.0,"May 3, 2016",1
18399,fit,45235,,2766308,128lbs,5.0,everyday,I really liked this sweater. I wore it on a fl...,athletic,"Great sweater, perfect for travel!",cardigan,"5' 6""",8,34.0,"March 7, 2017",1
12853,fit,508677,36b,254960,145lbs,3.0,wedding,I ordered this dress and one other for a weddi...,athletic,Seeing as I didn't wear this dress....not this...,sheath,"5' 8""",16,39.0,"January 18, 2017",0
17290,fit,117290,34a,1687082,110lbs,5.0,wedding,This dress is so fun! Was a little tight aroun...,petite,The COLOR!!!!,gown,"5' 4""",4,33.0,"March 28, 2016",1
9628,fit,144767,34b,135459,,4.0,party,"I get nervous about gold, especially gold lace...",pear,Opulent but Chic and Sexy,dress,"5' 5""",16,33.0,"January 11, 2016",1


In [6]:
user_ids = set(dataset['user_id'].values)
user2node = dict(zip(user_ids, np.arange(len(user_ids))))

item_ids = set(dataset['item_id'].values)
item2node = dict(zip(item_ids, np.arange(len(item_ids)) + len(user_ids)))

In [7]:
item_modes = dataset.groupby('item_id').apply(lambda x: x.mode().iloc[0])
item_means = dataset.groupby('item_id').mean()

In [8]:
user_modes = dataset.groupby('user_id').apply(lambda x: x.mode().iloc[0])
user_means = dataset.groupby('user_id').mean()

In [9]:
assert np.all(user_modes.index == user_means.index)
assert np.all(item_modes.index == item_means.index)

In [10]:
category_encoder = OneHotEncoder().fit(nan_to_mode(dataset[['rented for', 'bust size', 'body type', 'category']].copy()).values)
numerical_scaler = StandardScaler().fit(nan_to_mode(dataset[['rating', 'size', 'age']].copy()).values)

In [11]:
user_numerical = nan_to_mode(user_means[['rating', 'size', 'age']].copy()).values
user_numerical = numerical_scaler.transform(user_numerical)

user_categorical = nan_to_mode(user_modes[['rented for', 'bust size', 'body type', 'category']].copy()).values
user_categorical = category_encoder.transform(user_categorical).todense().A

user_features = np.concatenate([user_numerical, user_categorical], axis=1)
user_features.shape

(5631, 144)

In [12]:
item_numerical = nan_to_mode(item_means[['rating', 'size', 'age']].copy()).values
item_numerical = numerical_scaler.transform(item_numerical)

item_categorical = nan_to_mode(item_modes[['rented for', 'bust size', 'body type', 'category']].copy()).values
item_categorical = category_encoder.transform(item_categorical).todense().A

item_features = np.concatenate([item_numerical, item_categorical], axis=1)
item_features.shape

(4752, 144)

In [13]:
graph = nx.Graph()

for i, id in tqdm(enumerate(user_means.index)):
    graph.add_node(user2node[id], features=user_features[i], id=user2node[id])

for i, id in tqdm(enumerate(item_means.index)):
    graph.add_node(item2node[id], features=item_features[i], id=item2node[id])

for _, row in dataset.iterrows():
    user_node = user2node[row['user_id']]
    item_node = item2node[row['item_id']]

    graph.add_edge(user_node, item_node, edge_weight=row['rating'] / 5)

5631it [00:00, 512946.87it/s]
4752it [00:00, 599961.85it/s]


In [14]:
data = from_networkx(graph)
data

Data(edge_index=[2, 84650], edge_weight=[84650], features=[10383, 144], id=[10383])

In [15]:
data_split = train_test_split_edges(data)

In [16]:
data_split

Data(edge_weight=[84650], features=[10383, 144], id=[10383], test_neg_edge_index=[2, 4232], test_pos_edge_index=[2, 4232], train_neg_adj_mask=[10383, 10383], train_pos_edge_index=[2, 71954], val_neg_edge_index=[2, 2116], val_pos_edge_index=[2, 2116])

In [17]:
num_features = data.features.shape[-1]
hidden_size = 768

In [18]:
class GCNEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(GCNEncoder, self).__init__()
        self.conv1 = GCNConv(in_channels, out_channels, cached=True)
        self.conv2 = GCNConv(out_channels, out_channels, cached=True)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        return self.conv2(x, edge_index)

model = GAE(GCNEncoder(num_features, hidden_size))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

def train():
    model.train()
    optimizer.zero_grad()
    z = model.encode(data.features.float(), data.train_pos_edge_index)
    loss = model.recon_loss(z, data.train_pos_edge_index)
    loss.backward()
    optimizer.step()
    return float(loss)

def test(pos_edge_index, neg_edge_index):
    model.eval()
    with torch.no_grad():
        z = model.encode(data.features.float(), data.train_pos_edge_index)
    return model.test(z, pos_edge_index, neg_edge_index)


for epoch in range(100):
    loss = train()
    auc, ap = test(data.test_pos_edge_index, data.test_neg_edge_index)
    print('Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, auc, ap))


Epoch: 000, AUC: 0.7511, AP: 0.7519
Epoch: 001, AUC: 0.7607, AP: 0.7560
Epoch: 002, AUC: 0.7523, AP: 0.7476
Epoch: 003, AUC: 0.7478, AP: 0.7444
Epoch: 004, AUC: 0.7500, AP: 0.7460
Epoch: 005, AUC: 0.7536, AP: 0.7478
Epoch: 006, AUC: 0.7552, AP: 0.7478
Epoch: 007, AUC: 0.7547, AP: 0.7467
Epoch: 008, AUC: 0.7533, AP: 0.7451
Epoch: 009, AUC: 0.7513, AP: 0.7434
Epoch: 010, AUC: 0.7486, AP: 0.7412
Epoch: 011, AUC: 0.7447, AP: 0.7377
Epoch: 012, AUC: 0.7396, AP: 0.7332
Epoch: 013, AUC: 0.7334, AP: 0.7276
Epoch: 014, AUC: 0.7262, AP: 0.7207
Epoch: 015, AUC: 0.7191, AP: 0.7133
Epoch: 016, AUC: 0.7142, AP: 0.7083
Epoch: 017, AUC: 0.7121, AP: 0.7066
Epoch: 018, AUC: 0.7127, AP: 0.7078
Epoch: 019, AUC: 0.7145, AP: 0.7101
Epoch: 020, AUC: 0.7172, AP: 0.7129
Epoch: 021, AUC: 0.7206, AP: 0.7161
Epoch: 022, AUC: 0.7249, AP: 0.7208
Epoch: 023, AUC: 0.7298, AP: 0.7263
Epoch: 024, AUC: 0.7341, AP: 0.7309
Epoch: 025, AUC: 0.7376, AP: 0.7343
Epoch: 026, AUC: 0.7402, AP: 0.7368
Epoch: 027, AUC: 0.7426, AP:

In [19]:
most_popular_items = dataset.groupby('item_id').count().sort_values('fit', ascending=False)[:1000].index.values

item_node_ids = list(map(lambda x: item2node[x], most_popular_items))
user_node_ids = list(map(lambda x: user2node[x], user_ids))

potential_edges = np.array(list(product(item_node_ids, user_node_ids))).transpose(1, 0)
potential_edges = torch.tensor(potential_edges.astype(np.long)).to('cuda')

In [20]:
potential_edges.shape

torch.Size([2, 5631000])

In [21]:
model.eval()
z = model.encode(data.features.float(), data.train_pos_edge_index)
#edge_probs = model.decode(z, potential_edges).detach().cpu().numpy()
edge_probs = torch.sum(z.cpu()[potential_edges.cpu()[0]] * z.cpu()[potential_edges.cpu()[1]], dim=-1)
edge_probs = torch.sigmoid(edge_probs).detach().cpu().numpy()

In [23]:
data.to('cpu')
most_probable_edges = np.argsort(edge_probs)[::-1]

In [24]:
edge_probs.shape

(5631000,)

In [25]:
node2user = dict(map(lambda x: (x[1], x[0]), user2node.items()))
node2item = dict(map(lambda x: (x[1], x[0]), item2node.items()))

In [26]:
transactions = dataset[['user_id', 'item_id']].values.tolist()
transactions = map(tuple, transactions)
transactions = set(transactions)

In [27]:
recomendations = defaultdict(lambda: {})

potential_edges = potential_edges.cpu()

for edge in tqdm(most_probable_edges):
    prob = edge_probs[edge]
    node_from_id, node_to_id = potential_edges[:, edge].numpy()

    node_from = data.id[node_from_id].item()
    node_to = data.id[node_to_id].item()

    if is_user(node_from, user2node):
        user_id = node2user[node_from]
        item_id = node2item[node_to]
    else:
        user_id = node2user[node_to]
        item_id = node2item[node_from]

    index = (user_id, item_id)
    # Don't recomend already bought items
    if index not in transactions:
        if len(recomendations[str(user_id)]) < 100:
            recomendations[str(user_id)][str(item_id)] = float(prob)

100%|██████████| 5631000/5631000 [1:16:34<00:00, 1225.63it/s]


In [28]:
with open("deep_gnn.json", 'w') as f:
    json.dump(recomendations, f, indent=2)

In [29]:
potential_edges.transpose(1, 0).shape

torch.Size([5631000, 2])