# Todo
- [x] load clip
- [x] load images
- [x] batch run inference on images
- [x] figure out how to load images from zip
- [x] save embeddings
- [x] benchmark, gpu: 2h
- [x] upload to gcp bucket
- [x] embed text as well
- [ ] predict a baseline score for kaggle, on image+description cos similarity only

In [3]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Wed Apr 20 08:20:00 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from google.colab import auth
auth.authenticate_user()

Don't forget to upload your **kaggle.json** for authentication

# Setup

1. get data

In [4]:
!KAGGLE_CONFIG_DIR=/content kaggle competitions download -c h-and-m-personalized-fashion-recommendations

Downloading h-and-m-personalized-fashion-recommendations.zip to /content
100% 28.7G/28.7G [02:35<00:00, 199MB/s]
100% 28.7G/28.7G [02:35<00:00, 198MB/s]


2. mount zip

In [5]:
!apt-get install -y fuse-zip

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libzip4
The following NEW packages will be installed:
  fuse-zip libzip4
0 upgraded, 2 newly installed, 0 to remove and 39 not upgraded.
Need to get 65.6 kB of archives.
After this operation, 178 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libzip4 amd64 1.1.2-1.1 [37.8 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 fuse-zip amd64 0.4.4-1 [27.9 kB]
Fetched 65.6 kB in 1s (92.1 kB/s)
Selecting previously unselected package libzip4:amd64.
(Reading database ... 155455 files and directories currently installed.)
Preparing to unpack .../libzip4_1.1.2-1.1_amd64.deb ...
Unpacking libzip4:amd64 (1.1.2-1.1) ...
Selecting previously unselected package fuse-zip.
Preparing to unpack .../fuse-zip_0.4.4-1_amd64.deb ...
Unpacking fuse-zip (0.4.4-1) ...
Setting up libzip4:amd64 (

In [6]:
!mkdir /content/archive
!fuse-zip /content/h-and-m-personalized-fashion-recommendations.zip /content/archive

In [None]:
# to unmount
# !fusermount -u /content/archive

3. get clip model

In [2]:
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git
# !pip install torch-scatter -f https://data.pyg.org/whl/torch-1.11.0+${CUDA}.html
# !pip install torch-sparse -f https://data.pyg.org/whl/torch-1.11.0+${CUDA}.html
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cu111.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+cu111.html
# !pip install torch-scatter torch-sparse
!pip install https://github.com/pyg-team/pytorch_geometric/archive/master.zip 

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-ur52s49x
  Running command git clone -q https://github.com/openai/CLIP.git /tmp/pip-req-build-ur52s49x
[K     |████████████████████████████████| 3.5 MB 8.8 MB/s 
[?25hCollecting https://github.com/pyg-team/pytorch_geometric/archive/master.zip
  Downloading https://github.com/pyg-team/pytorch_geometric/archive/master.zip
[K     / 2.4 MB 3.8 MB/s
Building wheels for collected packages: torch-geometric
  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone
  Created wheel for torch-geometric: filename=torch_geometric-2.0.5-py3-none-any.whl size=628231 sha256=e80dc7c1879159734c96d6a6dce3ad8e1b8a2693f16dd8fee9cac36fed08320f
  Stored in directory: /tmp/pip-ephem-wheel-cache-4huw9gnf/wheels/70/53/71/38e50390ffab43b7bf5e55f1cdec398bbb09b9b3d2facb4478
Successfully built torch-geometric
Installing collected packages: torch-geometric
Successfully installed torch-ge

# Get embeddings

In [None]:
import torch as t
import clip

device = "cuda" if t.cuda.is_available() else "cpu"

In [None]:
# TODO to use tpu
# import torch_xla
# import torch_xla.core.xla_model as xm
# device = xm.xla_device()

In [None]:
# example useage of clip ala. https://github.com/openai/CLIP
# model, preprocess = clip.load("ViT-B/32", device=device)
# image = preprocess(Image.open("/content/images/010/0108775015.jpg")).unsqueeze(0).to(device)
# text = clip.tokenize(["a dress", "a dog", "a cat"]).to(device)

# with t.no_grad():
#     image_features = model.encode_image(image)
#     text_features = model.encode_text(text)
    
#     logits_per_image, logits_per_text = model(image, text)
#     probs = logits_per_image.softmax(dim=-1).cpu().numpy()

# print("Label probs:", probs)

In [None]:
import pandas as pd
import zipfile
from io import BytesIO
from PIL import Image
from t.utils.data import Dataset, DataLoader
from os.path import exists


class FashionImagesDataset(Dataset):
    def __init__(self, transform=lambda id: id):
        self.articles = pd.read_csv('/content/archive/articles.csv')
        self.articles['img_path'] = self.articles['article_id'].map(lambda id: "/content/archive/images/0" + str(id)[0:2] + "/0" + str(id) + ".jpg")
        self.valid_idx = self.articles[self.articles.apply(lambda article: exists(article['img_path']), axis=1)]
        print('valid and has image:', len(self.valid_idx), 'from:', len(self.articles))
        self.transform = transform

    def __len__(self):
        return len(self.valid_idx)

    def __getitem__(self, idx):
        img_path = self.valid_idx.iloc[idx]['img_path']
        image = Image.open(img_path)
        label = self.valid_idx.iloc[idx]['article_id']
        image = self.transform(image)
        return image, label

In [None]:
model_name = 'ViT-B/32'
# also ViT-L/14, etc.
clip.available_models()

['RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14']

In [None]:
model, preprocess = clip.load(model_name, device=device)

100%|████████████████████████████████████████| 338M/338M [00:01<00:00, 195MiB/s]


In [None]:
batch_size = 64

In [None]:
dataset = FashionImagesDataset(transform=preprocess)

valid and has image: 105100 from: 105542


In [None]:
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

In [None]:
images, labels = next(iter(data_loader))

In [None]:
images.size(), images.chunk(batch_size)[1].squeeze().size()

(t.Size([64, 3, 224, 224]), t.Size([3, 224, 224]))

# Get image embeddings

In [None]:
from tqdm import tqdm

image_features = {}
with t.no_grad():
    for images, labels in tqdm(data_loader):
      features = model.encode_image(images.to(device))
      for label, feature in zip(labels, features):
        image_features[label.item()] = feature.to('cpu')

100%|██████████| 1643/1643 [1:20:46<00:00,  2.95s/it]


In [None]:
image_features[111565003].size()

t.Size([512])

# Save image embeddings

In [None]:
file_name = '/content/fashion-recommendation-image-embeddings-clip-' + model_name.replace('/', '-') + '.pt'

In [None]:
t.save(image_features, file_name)

In [None]:
len(image_features.keys())

105100

In [None]:
!ls -lah $file_name

-rw-r--r-- 1 root root 130M Apr  5 14:57 /content/fashion-recommendation-image-embeddings-clip-ViT-B-32.pt


In [None]:
from google.colab import auth
auth.authenticate_user()

In [None]:
!gsutil cp $file_name gs://heii-public/

Copying file:///content/fashion-recommendation-image-embeddings-clip-ViT-B-32.pt [Content-Type=application/octet-stream]...
|
Operation completed over 1 objects/130.0 MiB.                                    


# Reload image embeddings (e.g.: if you restart colab, etc...)



In [None]:
file_name = '/content/fashion-recommendation-image-embeddings-clip-' + model_name.replace('/', '-') + '.pt'

In [None]:
remote_file = "https://storage.googleapis.com/heii-public/" + file_name.replace('/content/', '')
remote_file

In [None]:
!wget $remote_file

In [None]:
image_features = t.load(file_name)

# Get text embeddings

In [None]:
articles = pd.read_csv('/content/archive/articles.csv')
articles.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [None]:
keys = ['derived_name', 'derived_look', 'derived_category', 'prod_name', 'product_type_name', 'product_group_name', 'graphical_appearance_name', 'colour_group_name', 'perceived_colour_value_name', 'department_name', 'index_name', 'index_group_name', 'section_name', 'garment_group_name', 'detail_desc']

In [None]:
articles = pd.read_csv('/content/archive/articles.csv')
articles['derived_name'] = articles.apply(lambda row: ' '.join(['This is a', row['prod_name'], row['product_type_name']]), axis=1)
articles['derived_look'] = articles.apply(lambda row: ' '.join(['It has a', row['graphical_appearance_name'], row['perceived_colour_value_name'], row['colour_group_name'], 'color']), axis=1)
articles['derived_category'] = articles.apply(lambda row: ' / '.join([row['index_group_name'], row['index_name'], row['section_name'], row['department_name'], row['garment_group_name'], row['product_group_name'], row['product_type_name'], row['graphical_appearance_name'], row['colour_group_name']]), axis=1)
print('Example derived values:')
for i in range(3):
    print(articles.iloc[i]['derived_name'])
    print(articles.iloc[i]['derived_look'])
    print(articles.iloc[i]['derived_category'])

Example derived values:
This is a Strap top Vest top
It has a Solid Dark Black color
Ladieswear / Ladieswear / Womens Everyday Basics / Jersey Basic / Jersey Basic / Garment Upper body / Vest top / Solid / Black
This is a Strap top Vest top
It has a Solid Light White color
Ladieswear / Ladieswear / Womens Everyday Basics / Jersey Basic / Jersey Basic / Garment Upper body / Vest top / Solid / White
This is a Strap top (1) Vest top
It has a Stripe Dusty Light Off White color
Ladieswear / Ladieswear / Womens Everyday Basics / Jersey Basic / Jersey Basic / Garment Upper body / Vest top / Stripe / Off White


In [None]:
class FashionTextDataset(Dataset):
    def __init__(self, key, articles):
        self.key = key
        self.articles = articles
        
    def __len__(self):
        return len(self.articles)

    def __getitem__(self, idx):
        label = self.articles.iloc[idx]['article_id']
        # tokenize already pads text
        tokens = clip.tokenize(str.strip(str(self.articles.iloc[idx][self.key])), 77, True).squeeze().to(device)
        return tokens, label

In [None]:
text_dataset = FashionTextDataset(key=keys[0], articles=articles)

In [None]:
text_data_loader = DataLoader(text_dataset, batch_size=batch_size, shuffle=False)

In [None]:
tokens, labels = next(iter(text_data_loader))

In [None]:
batch_size, len(keys), tokens.size()

(64, 15, t.Size([64, 77]))

In [None]:
tokenizer = clip.simple_tokenizer.SimpleTokenizer()
tokenizer.decode(tokens[1].tolist())

'<|startoftext|>this is a strap top vest top <|endoftext|>!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'

In [None]:
from tqdm import tqdm

text_features = {}
with t.no_grad():
  for i, key in enumerate(keys):
      print('getting embeddings for ', key, 'text fields left:', len(keys)-i)
      text_dataset_ = FashionTextDataset(key=key, articles=articles)
      text_data_loader_ = DataLoader(text_dataset_, batch_size=64, shuffle=False)
      for tokens, labels in tqdm(text_data_loader_):
          features = model.encode_text(tokens)
          for label, feature in zip(labels, features):
              text_features.setdefault(label.item(), {})[key] = feature.to('cpu')

getting embeddings for  derived_name text fields left: 15


100%|██████████| 1650/1650 [03:22<00:00,  8.16it/s]


getting embeddings for  derived_look text fields left: 14


100%|██████████| 1650/1650 [03:22<00:00,  8.15it/s]


getting embeddings for  derived_category text fields left: 13


100%|██████████| 1650/1650 [03:30<00:00,  7.85it/s]


getting embeddings for  prod_name text fields left: 12


100%|██████████| 1650/1650 [03:19<00:00,  8.29it/s]


getting embeddings for  product_type_name text fields left: 11


100%|██████████| 1650/1650 [03:17<00:00,  8.37it/s]


getting embeddings for  product_group_name text fields left: 10


100%|██████████| 1650/1650 [03:18<00:00,  8.33it/s]


getting embeddings for  graphical_appearance_name text fields left: 9


100%|██████████| 1650/1650 [03:17<00:00,  8.37it/s]


getting embeddings for  colour_group_name text fields left: 8


100%|██████████| 1650/1650 [03:17<00:00,  8.34it/s]


getting embeddings for  perceived_colour_value_name text fields left: 7


100%|██████████| 1650/1650 [03:16<00:00,  8.39it/s]


getting embeddings for  department_name text fields left: 6


100%|██████████| 1650/1650 [03:17<00:00,  8.35it/s]


getting embeddings for  index_name text fields left: 5


100%|██████████| 1650/1650 [03:17<00:00,  8.35it/s]


getting embeddings for  index_group_name text fields left: 4


100%|██████████| 1650/1650 [03:16<00:00,  8.39it/s]


getting embeddings for  section_name text fields left: 3


100%|██████████| 1650/1650 [03:17<00:00,  8.33it/s]


getting embeddings for  garment_group_name text fields left: 2


100%|██████████| 1650/1650 [03:17<00:00,  8.37it/s]


getting embeddings for  detail_desc text fields left: 1


100%|██████████| 1650/1650 [03:31<00:00,  7.82it/s]


In [None]:
text_features[116379047]['derived_name'].size()

t.Size([512])

# Save text embeddings

should match len(articles):

In [None]:
len(articles), len(text_features.keys()), len(articles) == len(text_features.keys()), all([512 == len(text_features[108775015].get(key, {})) for key in keys])

(105542, 105542, True, True)

In [None]:
text_file_name = '/content/fashion-recommendation-text-embeddings-clip-' + model_name.replace('/', '-') + '.pt'

In [None]:
t.save(text_features, text_file_name)

In [None]:
!gsutil cp $text_file_name gs://heii-public/

Copying file:///content/fashion-recommendation-text-embeddings-clip-ViT-B-32.pt [Content-Type=application/octet-stream]...
/ [0 files][    0.0 B/  1.9 GiB]                                                ==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

-
Operation completed over 1 objects/1.9 GiB.                                      


In [None]:
remote_text_file = "https://storage.googleapis.com/heii-public/" + text_file_name.replace('/content/', '')
remote_text_file

'https://storage.googleapis.com/heii-public/fashion-recommendation-text-embeddings-clip-ViT-B-32.pt'

# Reload text embeddings

In [None]:
text_file_name = '/content/fashion-recommendation-text-embeddings-clip-' + model_name.replace('/', '-') + '.pt'

In [None]:
remote_text_file = "https://storage.googleapis.com/heii-public/" + text_file_name.replace('/content/', '')

In [None]:
!wget $remote_text_file

--2022-04-05 11:33:10--  https://storage.googleapis.com/heii-public/fashion-recommendation-text-embeddings-clip-ViT-B-32.pt
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.5.128, 74.125.133.128, 108.177.15.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.5.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 480994078 (459M) [application/octet-stream]
Saving to: ‘fashion-recommendation-text-embeddings-clip-ViT-B-32.pt’


2022-04-05 11:33:16 (88.5 MB/s) - ‘fashion-recommendation-text-embeddings-clip-ViT-B-32.pt’ saved [480994078/480994078]



In [None]:
text_features = t.load(text_file_name)

# Predict customer - product interaction:
prerequisites: run setup

In [1]:
import torch as t
import pandas as pd
from sklearn import preprocessing
import torch_geometric.transforms as T
from torch_geometric.data import HeteroData, InMemoryDataset, download_url
from torch_geometric.utils import negative_sampling


class HMDataset(InMemoryDataset):
    image_embeddings_url = "https://storage.googleapis.com/heii-public/fashion-recommendation-image-embeddings-clip-ViT-B-32.pt"
    text_embeddings_url = "https://storage.googleapis.com/heii-public/fashion-recommendation-text-embeddings-clip-ViT-B-32.pt"
    raw_dir = "/content"
    processed_dir = "/content"

    def __init__(self, root, transform=None, pre_transform=None):
        super().__init__(root, transform, pre_transform)
        self.data, self.slices = t.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return [
            "fashion-recommendation-image-embeddings-clip-ViT-B-32.pt",
            "fashion-recommendation-text-embeddings-clip-ViT-B-32.pt",
            "archive/articles.csv",
            "archive/customers.csv",
            "archive/transactions_train.csv",
        ]

    @property
    def processed_file_names(self):
        return f"hm_graph.pt"

    def download(self):
        download_url(self.image_embeddings_url, self.raw_dir)
        download_url(self.text_embeddings_url, self.raw_dir)

    def process(self):
        self.articles = pd.read_csv(self.raw_paths[2], index_col="article_id")
        self.customers = pd.read_csv(self.raw_paths[3], index_col="customer_id").fillna(
            0.0
        )
        self.transactions = pd.read_csv(self.raw_paths[4])

        data = HeteroData()
        
        # create node edges
        t = self.transactions.to_dict()
        customers_id_ix = {v: k for k, v in enumerate(self.customers.index.unique())}
        # customers_ix_id = {k: v for k, v in enumerate(self.customers.index.unique())}
        articles_id_ix = {v: k for k, v in enumerate(self.articles.index.unique())}
        # articles_ix_id = {k: v for k, v in enumerate(self.articles.index.unique())}
        src = [customers_id_ix[t["customer_id"][i]] for i in t["customer_id"]]
        dst = [articles_id_ix[t["article_id"][i]] for i in t["article_id"]]
        data["customer", "buys", "article"].edge_index = t.tensor([src, dst]).long()
        
        # avoid out of memory on colab
        del t
        del customers_id_ix
        del articles_id_ix

        # encode customers
        le = preprocessing.LabelEncoder()
        self.customers["postal_code"] = le.fit_transform(self.customers["postal_code"])
        self.customers.loc[
            self.customers["fashion_news_frequency"] == "None", "fashion_news_frequency"
        ] = 0.0
        self.customers.loc[
            self.customers["fashion_news_frequency"] == "NONE", "fashion_news_frequency"
        ] = 0.0
        customer_features = self.customers[
            [
                "postal_code",
                "age",
                "fashion_news_frequency",
                "FN",
                "Active",
                "club_member_status",
            ]
        ]
        customer_features = pd.get_dummies(
            customer_features,
            columns=["age", "fashion_news_frequency", "club_member_status"],
        )
        customer_features = t.from_numpy(customer_features.to_numpy())

        # encode articles
        self.article_image_embeddings = t.load(self.raw_paths[0])
        self.article_text_embeddings = t.load(self.raw_paths[1])
        self.articles = self.articles.merge(
            self.transactions.groupby("article_id")["price"].mean(),
            on="article_id",
            how="outer",
        ).fillna(0.0)
        # self.articles["price_bin"] = pd.qcut(self.articles["price"], 100, labels=False)
        self.articles["product_type_no"] = self.articles["product_type_no"].astype(str)
        product_type_no_le = preprocessing.LabelEncoder()
        self.articles["product_type_no"] = product_type_no_le.fit_transform(
            self.articles["product_type_no"]
        )
        self.articles["graphical_appearance_no"] = self.articles[
            "graphical_appearance_no"
        ].astype(str)
        graphical_appearance_no_le = preprocessing.LabelEncoder()
        self.articles[
            "graphical_appearance_no"
        ] = graphical_appearance_no_le.fit_transform(
            self.articles["graphical_appearance_no"]
        )
        article_features = self.articles[
            ["product_type_no", "graphical_appearance_no", "price"]
        ]
        # article_features = pd.get_dummies(
        #     article_features,
        #     columns=["price_bin"],
        # )
        article_features = t.from_numpy(article_features.to_numpy())
        article_features = t.cat(
            (
                article_features,
                t.stack(
                    self.articles.apply(
                        lambda article: self.article_image_embeddings.get(
                            int(article.name), t.zeros(512)
                        ),
                        axis=1,
                    ).tolist()
                ),
            ),
            1,
        )
        for key in ["derived_name", "derived_look", "derived_category"]:
            article_features = t.cat(
                (
                    article_features,
                    t.stack(
                        self.articles.apply(
                            lambda article: self.article_text_embeddings[
                                int(article.name)
                            ].get(key, t.zeros(512)),
                            axis=1,
                        ).tolist()
                    ),
                ),
                1,
            )

        # create nodes
        data["article"].x = article_features.float()
        data["customer"].x = customer_features.float()

        # transform?
        if self.pre_transform is not None:
            data = self.pre_transform(data)

        # PyTorch tensor functionality:
        # data = data.pin_memory()
        # data = data.to('cuda:0', non_blocking=True)
        t.save(self.collate([data]), self.processed_paths[0])


if __name__ == "__main__":
    dataset = HMDataset("/content")

In [6]:
import math
from tqdm import tqdm
import torch as t
import torch.nn.functional as F
from torch.nn import Linear
from torch_geometric.loader import LinkNeighborLoader
import torch_geometric.transforms as T
# from hm_dataset import HMDataset
from torch_geometric.nn import SAGEConv, to_hetero


device = t.device("cuda" if t.cuda.is_available() else "cpu")
t.cuda.empty_cache()

dataset = HMDataset("/content")
data = dataset[0] # .to(device)
data["article"].x = data["article"].x.float()
data["customer"].x = data["customer"].x.float()
data[("customer", "buys", "article")].edge_index = data[
    ("customer", "buys", "article")
].edge_index.long()

# Add a reverse ('article', 'rev_buys', 'customer') relation for message passing:
data = T.ToUndirected()(data).to('cpu')

# Perform a link-level split into training, validation, and test edges:
train_data, val_data, test_data = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.5,
    add_negative_train_samples=True,
    edge_types=[("customer", "buys", "article")],
    rev_edge_types=[("article", "rev_buys", "customer")],
    is_undirected=True,
)(data)
# when neg_sampling_ratio > 0 and add_negative_train_samples=True only then you will have negative edges

def create_loader(d):
    return LinkNeighborLoader(
      d,
      num_neighbors=[64] * 2,
      batch_size=12,
      edge_label_index=(("customer", "buys", "article"), d[("customer", "buys", "article")].edge_label_index),
      edge_label=d[("customer", "buys", "article")].edge_label,
      directed=False,
      replace=False,
      shuffle=True,
      pin_memory=True,
      num_workers=1
    )
train_loader = create_loader(train_data)
val_loader = create_loader(val_data)
test_loader = create_loader(test_data)


class GNNEncoder(t.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x


class EdgeDecoder(t.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        z = t.cat([z_dict["customer"][row], z_dict["article"][col]], dim=-1)

        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1)


class Model(t.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr="sum")
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)


model = Model(hidden_channels=32).to(device)


In [7]:
# model.load_state_dict(t.load(f"/content/link_pred_0.pt"))

In [8]:
# !export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:1024

In [None]:

# Due to lazy initialization, we need to run one model step so the number
# of parameters can be inferred:
with t.no_grad():
    batch_ = next(iter(train_loader)).to(device, non_blocking=True)
    model.encoder(batch_.x_dict, batch_.edge_index_dict)
    del batch_

optimizer = t.optim.Adam(model.parameters(), lr=0.01)


def train(train_data):
    model.train()
    optimizer.zero_grad()
    pred = model(
        train_data.x_dict,
        train_data.edge_index_dict,
        train_data["customer", "article"].edge_label_index,
    )
    target = train_data["customer", "article"].edge_label
    loss = F.mse_loss(pred, target)
    # loss = F.binary_cross_entropy_with_logits(pred, target)
    loss.backward()
    optimizer.step()
    return float(loss)


@t.no_grad()
def test(data):
    model.eval()
    pred = model(
        data.x_dict,
        data.edge_index_dict,
        data["customer", "article"].edge_label_index,
    )
    pred = pred.clamp(min=0, max=1)
    target = data["customer", "article"].edge_label
    loss = F.mse_loss(pred, target).sqrt()
    # loss = t.nn.BCEWithLogitsLoss()(pred, target)
    return float(loss)


t.cuda.reset_peak_memory_stats()
num_epochs = 301
it = 0
for epoch in range(1, num_epochs):
    acc_loss = 0
    train_steps, val_steps, test_steps = 200, 200, 200
    prog = tqdm(zip(range(train_steps), train_loader), total=train_steps)
    for i, batch in prog:
        batch = batch.to(device, non_blocking=True)
        loss = train(batch)
        acc_loss += loss
        it += 1
        train_rmse = test(batch)
        # cuda_mem = t.cuda.memory_stats()
        # cuda_reserved = t.cuda.max_memory_reserved()
        prog.set_description(f"loss: {loss:.4f}")
    train_loss = acc_loss / it
    for i, batch in tqdm(zip(range(train_steps), val_loader), total=val_steps):
        val_rmse = test(batch.to(device, non_blocking=True))
    for i, batch in tqdm(zip(range(train_steps), test_loader), total=test_steps):
        test_rmse = test(batch.to(device, non_blocking=True))
    if epoch % 30 == 0:
        t.save(model.state_dict(), f"/content/link_pred_{epoch:03d}.pt")
    print(
        f"Epoch: {epoch:03d}, Loss: {loss:.4f}, AccLoss: {acc_loss:.4f}, Train: {train_loss:.4f}, TrainRMSE: {train_rmse:.4f}"
        f"Val: {val_rmse:.4f}, Test: {test_rmse:.4f}"
    )


loss: 1112.6575: 100%|██████████| 200/200 [02:52<00:00,  1.16it/s]
100%|██████████| 200/200 [02:48<00:00,  1.18it/s]
100%|██████████| 200/200 [03:14<00:00,  1.03it/s]


Epoch: 001, Loss: 1112.6575, AccLoss: 23607856.5081, Train: 118039.2825, TrainRMSE: 0.5000Val: 0.2887, Test: 0.5000


loss: 0.3976:  65%|██████▌   | 130/200 [01:52<00:57,  1.23it/s]