In [None]:
# Install the two third-party packages imported below (`datasets`, `implicit`).
!pip install datasets
!pip install implicit

# Mount Google Drive and make the shared project folder the working directory.
# This cell only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/Shareddrives/ISR Project/


Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.22.2-py3-none-an

In [None]:
import datasets
datasets.logging.set_verbosity_error()
from datasets import load_dataset
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, BertModel, AdamW
from transformers import CLIPProcessor, CLIPModel
import random
from implicit.als import AlternatingLeastSquares
from implicit.cpu.bpr import BayesianPersonalizedRanking
from scipy.sparse import csr_matrix
import torch.nn as nn
import torch
import torch.nn.functional as F
from tqdm.auto import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
import requests
from multiprocessing import Pool
from PIL import Image
from io import BytesIO

# Width of the hidden layers used by the tower heads defined later on.
LATENT_DIM = 200

# Seed every RNG the notebook relies on. Seeding only Python's `random` leaves
# pandas `.sample` / scikit-learn splits (NumPy global RNG) and torch weight
# initialisation non-reproducible.
SEED = 69
np.random.seed(SEED)
torch.manual_seed(SEED)
random.seed(SEED)

In [None]:
# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:

# Run configuration: the Drive folder everything is written to and the two
# Amazon review categories used as source and target domain.
config = {
    'save_path': '/content/drive/Shareddrives/ISR Project',
    'source_domain': 'All_Beauty',
    'target_domain': 'Amazon_Fashion',
}

# Each domain pair gets its own "<source>_<target>" sub-folder.
config['save_path'] = f"{config['save_path']}/{config['source_domain']}_{config['target_domain']}"

In [None]:

config['save_path']

'/content/drive/Shareddrives/ISR Project/All_Beauty_Amazon_Fashion'

In [None]:
import os

# Make sure the output folder for this domain pair exists and report which
# of the two cases applied.
if os.path.exists(config['save_path']):
    print(f"Directory already exists: {config['save_path']}")
else:
    os.makedirs(config['save_path'])
    print(f"Directory created: {config['save_path']}")

Directory created: /content/drive/Shareddrives/ISR Project/All_Beauty_Amazon_Fashion


In [None]:
from transformers import (
    Text2TextGenerationPipeline,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)

class KeyphraseGenerationPipeline(Text2TextGenerationPipeline):
    """Text2Text pipeline whose output is split into a list of keyphrases.

    The seq2seq model and its tokenizer are both loaded from the ``model``
    identifier; the generated text is split on ``keyphrase_sep_token``.
    """

    def __init__(self, model, keyphrase_sep_token=";", *args, **kwargs):
        """Load model and tokenizer from ``model`` and remember the separator.

        Args:
            model: Identifier or path passed to ``from_pretrained`` for both
                the model and the tokenizer.
            keyphrase_sep_token: String on which the generated text is split.
            *args, **kwargs: Forwarded to ``Text2TextGenerationPipeline``.
        """
        super().__init__(
            model=AutoModelForSeq2SeqLM.from_pretrained(model),
            tokenizer=AutoTokenizer.from_pretrained(model),
            *args,
            **kwargs
        )
        self.keyphrase_sep_token = keyphrase_sep_token

    def postprocess(self, model_outputs):
        """Split each generated text into stripped, non-empty keyphrases.

        Returns:
            One list of keyphrase strings per result produced by the parent
            ``postprocess``.
        """
        results = super().postprocess(
            model_outputs=model_outputs
        )
        keyphrase_lists = []
        for result in results:
            segments = result.get("generated_text").split(self.keyphrase_sep_token)
            stripped_segments = (segment.strip() for segment in segments)
            # Filter AFTER stripping: a segment such as " " (e.g. produced by a
            # trailing separator) would otherwise survive as an empty keyphrase.
            keyphrase_lists.append([segment for segment in stripped_segments if segment])
        return keyphrase_lists


class DataGeneration():
  def __init__(self, source_domain_name, target_domain_name, save_path, train_distribution_coeff = 4, test_distribution_coeff = 4, model_name = "ml6team/keyphrase-generation-t5-small-inspec"):
    """Store the run settings and build the keyphrase generator.

    Args:
      source_domain_name: Category suffix used for the source-domain datasets.
      target_domain_name: Category suffix used for the target-domain datasets.
      save_path: Directory every generated CSV is written to.
      train_distribution_coeff: Coefficient used for the source domain when
        computing how many negatives to sample.
      test_distribution_coeff: Same coefficient for the target domain.
      model_name: Model identifier handed to ``KeyphraseGenerationPipeline``.
    """
    # Domain names and output location.
    self.source_domain_name, self.target_domain_name = source_domain_name, target_domain_name
    self.save_path = save_path

    # Negative-sampling coefficients (source / target).
    self.train_distribution_coeff, self.test_distribution_coeff = train_distribution_coeff, test_distribution_coeff

    # Keyphrase model: loaded eagerly, so constructing this object is expensive.
    self.model_name = model_name
    self.generator = KeyphraseGenerationPipeline(model=model_name)


  def load_and_filter_data(self):
    """Download reviews/metadata for both domains and keep the overlapping users.

    Sets on the instance:
      source_meta_df / target_meta_df: full metadata frames of each domain.
      source_filtered_data: source reviews with non-empty text written by users
        who appear in both domains and have at least 3 source reviews.
      target_filtered_data: every target review written by the users that
        remain in ``source_filtered_data``.

    Both filtered frames are independent copies, so later in-place edits do
    not operate on slices of the full review frames.
    """
    repo = "McAuley-Lab/Amazon-Reviews-2023"
    source_reviews = load_dataset(repo, f"raw_review_{self.source_domain_name}", trust_remote_code=True)
    source_meta = load_dataset(repo, f"raw_meta_{self.source_domain_name}", trust_remote_code=True)
    target_reviews = load_dataset(repo, f"raw_review_{self.target_domain_name}", trust_remote_code=True)
    target_meta = load_dataset(repo, f"raw_meta_{self.target_domain_name}", trust_remote_code=True)

    # Create dataframes
    source_reviews_df = source_reviews["full"].to_pandas()
    self.source_meta_df = source_meta["full"].to_pandas()
    target_reviews_df = target_reviews["full"].to_pandas()
    self.target_meta_df = target_meta["full"].to_pandas()

    # Users present in both domains, read from the frames that are built
    # anyway instead of materialising every column of both datasets.
    common_users = set(source_reviews_df['user_id']) & set(target_reviews_df['user_id'])

    # Keep common users with at least 3 source reviews (counted before the
    # empty-text filter), then drop reviews without text.
    review_counts = source_reviews_df.loc[source_reviews_df['user_id'].isin(common_users), 'user_id'].value_counts()
    active_users = review_counts[review_counts >= 3].index
    source_filtered = source_reviews_df[source_reviews_df['user_id'].isin(active_users)]
    self.source_filtered_data = source_filtered[source_filtered['text'] != ''].copy()
    print(f'Unique common users: {len(self.source_filtered_data["user_id"].unique())}')

    # Target reviews are restricted to the surviving users only; no minimum
    # review count or empty-text filter is applied on the target side.
    kept_users = self.source_filtered_data['user_id'].unique()
    self.target_filtered_data = target_reviews_df[target_reviews_df['user_id'].isin(kept_users)].copy()

  def generate_user_item_ratings_csv(self, target = False):
    # user-item ratings
    if target:
      filtered_data = self.target_filtered_data
      name = self.target_domain_name
      datasetmeta_df = self.target_meta_df
      coeff = self.test_distribution_coeff
    else:
      filtered_data = self.source_filtered_data
      name = self.source_domain_name
      datasetmeta_df = self.source_meta_df
      coeff = self.train_distribution_coeff

    filtered_data['true_rating'] = filtered_data['rating']
    filtered_data.loc[filtered_data['rating'] <= 3, 'rating'] = 0

    filtered_data.loc[filtered_data['rating'] != 0, 'rating'] = 1

    positive_samples = filtered_data[filtered_data['rating'] == 1][['user_id', 'parent_asin', 'rating', 'true_rating']]
    negative_samples = filtered_data[filtered_data['rating'] == 0][['user_id', 'parent_asin', 'rating', 'true_rating']]

    positive_samples_ind = list(zip(positive_samples['user_id'], positive_samples['parent_asin'], positive_samples['rating'], positive_samples['true_rating']))
    negative_samples_ind = list(zip(negative_samples['user_id'], negative_samples['parent_asin'], negative_samples['rating'], negative_samples['true_rating']))
    total_samples = np.array(positive_samples_ind + negative_samples_ind)

    num_positive_samples = len(positive_samples)
    num_negative_samples = len(negative_samples)

    print(f'Positive samples: {num_positive_samples}, Negative samples: {num_negative_samples}, Total samples: {len(total_samples)}')

    tobe_sampled = coeff*num_positive_samples - num_negative_samples
    unrated_items = list(set(datasetmeta_df['parent_asin']) - set(filtered_data['parent_asin']))
    all_users = filtered_data['user_id'].tolist()
    all_items = datasetmeta_df['parent_asin'].tolist()

    while tobe_sampled > 0:
        sampled_neagtive_item = random.choices(all_items, k = tobe_sampled)
        sampled_neagtive_user = random.choices(all_users, k = tobe_sampled)
        sampled_items = set(list(zip(sampled_neagtive_item, sampled_neagtive_user)))
        total_itemset = set(total_samples[:, :2].flatten())
        if sampled_items & total_itemset:
            tobe_added = sampled_items - total_itemset
            tobe_sampled = len(sampled_items & total_itemset)
            total_samples = np.vstack((total_samples, np.array([(x, y, 0, -1) for x, y in tobe_added])))
        else:
            total_samples = np.vstack((total_samples, np.array(list(zip(sampled_neagtive_user, sampled_neagtive_item, [0]*len(sampled_neagtive_user), [-1]*len(sampled_neagtive_user))))))
            tobe_sampled = 0
    print(total_samples.shape)
    total_samples = list(total_samples)
    # If miraculously the negative samples are more,
    # else: bombastic side eye

    self.useritem_df = pd.DataFrame(total_samples, columns=['user_id', 'item_id', 'rating', 'true_rating'])
    self.useritem_df.to_csv(f'{self.save_path}/{name}_user_item_ratings.csv', index=False)


  def generate_item_profile(self, target = False):
    if target:
      filtered_data = self.target_filtered_data
      name = self.target_domain_name
      datasetmeta_df = self.target_meta_df
      coeff = self.test_distribution_coeff
      self.useritem_df = pd.read_csv(f'{self.save_path}/{self.target_domain_name}_user_item_ratings.csv')
    else:
      filtered_data = self.source_filtered_data
      name = self.source_domain_name
      datasetmeta_df = self.source_meta_df
      coeff = self.train_distribution_coeff
      self.useritem_df = pd.read_csv(f'{self.save_path}/{name}_user_item_ratings.csv')
    image_links = []
    unique_items = self.useritem_df['item_id'].unique() #unique is not really necessary, but just to be sure
    all_items_keyphrases = []
    for item in unique_items:
      titles = datasetmeta_df[datasetmeta_df['parent_asin'] == item]['title']
      features = datasetmeta_df[datasetmeta_df['parent_asin'] == item]['features']
      descriptions = datasetmeta_df[datasetmeta_df['parent_asin'] == item]['description']
      categories = datasetmeta_df[datasetmeta_df['parent_asin'] == item]['categories']
      details = datasetmeta_df[datasetmeta_df['parent_asin'] == item]['details']
      # print(datasetmeta_df[datasetmeta_df['parent_asin'] == item]['images'])
      if not len(datasetmeta_df[datasetmeta_df['parent_asin'] == item]['images'].tolist()):
        print('oh no', item, (item in datasetmeta_df['parent_asin']))
        print(datasetmeta_df[datasetmeta_df['parent_asin'] == item]['images'].tolist())
        image_links.append('')
      elif 'large' in datasetmeta_df[datasetmeta_df['parent_asin'] == item]['images'].tolist()[0]:
        if len(datasetmeta_df[datasetmeta_df['parent_asin'] == item]['images'].tolist()[0]['large']):
          image_links.append(datasetmeta_df[datasetmeta_df['parent_asin'] == item]['images'].tolist()[0]['large'][0])
        else:
          image_links.append('')
      else:
        image_links.append('')
      item_keyphrases = set()
      for title, feature, description, category, detail in zip(titles,  features, descriptions, categories, details):
        keyphrases = []
        keyphrases.append(title)
        if feature.size:
          keyphrases.append(feature[0])
        if description.size:
          keyphrases.append(description[0])
        if category.size:
          keyphrases.append(category[0])
        if detail:
          keyphrases.append(','.join(detail[1:-1].split(', ')))
        item_keyphrases.update([','.join(keyphrases)])

      all_items_keyphrases.append(list(item_keyphrases))

    item_profile_df = pd.DataFrame({'item_id': unique_items, 'keyphrases': all_items_keyphrases, 'image_links': image_links})
    item_profile_df.to_csv(f'{self.save_path}/{name}_item_profiles.csv', index=False)

  # def generate_item_profile(self, target = False):
  #   if target:
  #     filtered_data = self.target_filtered_data
  #     name = self.target_domain_name
  #     datasetmeta_df = self.target_meta_df
  #     coeff = self.test_distribution_coeff
  #     self.useritem_df = pd.read_csv(f'{self.save_path}/{self.target_domain_name}_user_item_ratings.csv')
  #   else:
  #     filtered_data = self.source_filtered_data
  #     name = self.source_domain_name
  #     datasetmeta_df = self.source_meta_df
  #     coeff = self.train_distribution_coeff
  #     self.useritem_df = pd.read_csv(f'{self.save_path}/{name}_user_item_ratings.csv')

  #   unique_items = self.useritem_df['item_id'].unique() #unique is not really necessary, but just to be sure
  #   all_items_keyphrases = []
  #   for item in unique_items:
  #     titles = datasetmeta_df[datasetmeta_df['parent_asin'] == item]['title']
  #     features = datasetmeta_df[datasetmeta_df['parent_asin'] == item]['features']
  #     descriptions = datasetmeta_df[datasetmeta_df['parent_asin'] == item]['description']
  #     categories = datasetmeta_df[datasetmeta_df['parent_asin'] == item]['categories']
  #     details = datasetmeta_df[datasetmeta_df['parent_asin'] == item]['details']
  #     item_keyphrases = set()
  #     for title, feature, description, category, detail in zip(titles,  features, descriptions, categories, details):
  #       keyphrases = []
  #       keyphrases.append(title)
  #       if feature.size:
  #         keyphrases.append(feature[0])
  #       if description.size:
  #         keyphrases.append(description[0])
  #       if category.size:
  #         keyphrases.append(category[0])
  #       if detail:
  #         keyphrases.append(','.join(detail[1:-1].split(', ')))
  #       item_keyphrases.update([','.join(keyphrases)])

  #     all_items_keyphrases.append(list(item_keyphrases))

  #   item_profile_df = pd.DataFrame({'item_id': unique_items, 'keyphrases': all_items_keyphrases})
  #   item_profile_df.to_csv(f'{self.save_path}/{name}_item_profiles.csv', index=False)

  def generate_user_profile(self, target = False):
    if target:
      filtered_data = self.target_filtered_data
      name = self.target_domain_name
      datasetmeta_df = self.target_meta_df
      coeff = self.test_distribution_coeff
    else:
      filtered_data = self.source_filtered_data
      name = self.source_domain_name
      datasetmeta_df = self.source_meta_df
      coeff = self.train_distribution_coeff
    unique_users = filtered_data['user_id'].unique()
    # all_users_keyphrases = []

    # num_processes = 4
    # with Pool(processes=num_processes) as pool:

    #   chunk_size = len(unique_users) // num_processes
    #   user_chunks = [unique_users[i:i + chunk_size] for i in range(0, len(unique_users), chunk_size)]
    #   results = pool.map(self.generate_user_profile_parallel, user_chunks)

    # all_users_keyphrases = []
    # for result in results:
    #   all_users_keyphrases.extend(result)
    all_users_keyphrases = []
    for user in unique_users:
      titles = filtered_data[filtered_data['user_id'] == user]['title']
      reviews = filtered_data[filtered_data['user_id'] == user]['text']
      user_keyphrases = set()
      for title, review in zip(titles, reviews):

        keyphrases = self.generator(title + '. ' + review)
        user_keyphrases.update(keyphrases[0])
      all_users_keyphrases.append(list(user_keyphrases))
    user_profile_df = pd.DataFrame({'user_id': unique_users, 'keyphrases': all_users_keyphrases})
    user_profile_df.to_csv(f'{self.save_path}/{name}_user_profiles.csv', index=False)

  def generate_user_profile_parallel(self, user_chunk):
    print('New worker')
    user_keyphrases = []
    for user in user_chunk:
      print(user)
      titles = self.source_filtered_data[self.source_filtered_data['user_id'] == user]['title']
      reviews = self.source_filtered_data[self.source_filtered_data['user_id'] == user]['text']
      user_keyphrase_set = set()
      for title, review in zip(titles, reviews):
        keyphrases = self.generator(title + '. ' + review, max_new_tokens = 50)
        user_keyphrase_set.update(keyphrases[0])
      user_keyphrases.append(list(user_keyphrase_set))
    return user_keyphrases

  def train_val_test_split(self, stratified = True, split = 0.8, random_state = None):
    """Split the source-domain ratings CSV into train and validation CSVs.

    Args:
      stratified: Keep the 0/1 ``rating`` proportions equal in both parts.
      split: Fraction of rows that goes into the training part.
      random_state: Optional seed for the shuffle and the split; ``None``
        keeps the previous unseeded behaviour.

    Side effects:
      Writes ``{source}_train_user_item_ratings.csv`` and
      ``{source}_val_user_item_ratings.csv`` under ``save_path``, each with the
      columns ``user_id, item_id, true_rating, rating``.
    """
    useritem_df = pd.read_csv(f'{self.save_path}/{self.source_domain_name}_user_item_ratings.csv')
    useritem_df = useritem_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    X = useritem_df[['user_id', 'item_id', 'true_rating']]
    y = useritem_df['rating']

    # Honour the caller's arguments; the split used to be fixed at 80/20 and
    # always stratified regardless of what was passed in.
    X_train, X_val, y_train, y_val = train_test_split(
      X, y,
      train_size=split,
      stratify=y if stratified else None,
      random_state=random_state,
    )

    train_useritem_df = X_train.copy()
    train_useritem_df['rating'] = y_train

    val_useritem_df = X_val.copy()
    val_useritem_df['rating'] = y_val

    train_useritem_df.to_csv(f'{self.save_path}/{self.source_domain_name}_train_user_item_ratings.csv', index=False)
    val_useritem_df.to_csv(f'{self.save_path}/{self.source_domain_name}_val_user_item_ratings.csv', index=False)

  def generation_pipline(self):
    print('Loading and Filtering Data...')
    self.load_and_filter_data()
    print('Generating source user-item csv...')
    self.generate_user_item_ratings_csv()
    print('Generating target user-item csv...')
    self.generate_user_item_ratings_csv(target = True)
    print('Generating source user profile csv...')
    self.generate_user_profile()
    # print('Generating source user profile csv...')
    # self.generate_user_profile(target = True)
    print('Generating source item profile csv...')
    self.generate_item_profile()
    print('Generating target item profile csv...')
    self.generate_item_profile(target = True)
    print('Splitting ratings into train-val...')
    self.train_val_test_split(split = 0.8)
    print('Done!')


In [None]:
"""
class ReviewsDataset(Dataset):

  @staticmethod
  def load_user_profiles(user_profile_filepath):
    user_profiles = pd.read_csv(user_profile_filepath)
    return dict(zip(user_profiles['user_id'], user_profiles['keyphrases']))


  @staticmethod
  def load_item_profiles(item_profile_filepath):
    item_profiles = pd.read_csv(item_profile_filepath)
    return dict(zip(item_profiles['item_id'], item_profiles['keyphrases']))

  def __init__(self, user_profile_filepath, item_profile_filepath, ratings_filepath, tokenizer):
    self.user_profiles = ReviewsDataset.load_user_profiles(user_profile_filepath)
    self.item_profiles = ReviewsDataset.load_item_profiles(item_profile_filepath)
    self.ratings = pd.read_csv(ratings_filepath)
    self.tokenizer = tokenizer

  def __len__(self):
    return len(self.ratings)

  def __getitem__(self, idx):
    rating = self.ratings.iloc[idx]['rating']
    true_rating = self.ratings.iloc[idx]['true_rating']
    user_profile = self.user_profiles[self.ratings.iloc[idx]['user_id']]
    item_profile = self.item_profiles[self.ratings.iloc[idx]['item_id']]

    user_profile_tokens = self.tokenizer(text=user_profile,
                                        padding="max_length",
                                        max_length=128,
                                        add_special_tokens=True,
                                        return_attention_mask=True,
                                        truncation=True,
                                        return_tensors="pt")

    item_profile_tokens = self.tokenizer(text=item_profile,
                                          padding="max_length",
                                          max_length=128,
                                          add_special_tokens=True,
                                          return_attention_mask=True,
                                          truncation=True,
                                          return_tensors="pt")

    return self.ratings.iloc[idx]['user_id'], self.ratings.iloc[idx]['item_id'], user_profile_tokens, item_profile_tokens, rating, true_rating
"""

'\nclass ReviewsDataset(Dataset):\n\n  @staticmethod\n  def load_user_profiles(user_profile_filepath):\n    user_profiles = pd.read_csv(user_profile_filepath)\n    return dict(zip(user_profiles[\'user_id\'], user_profiles[\'keyphrases\']))\n\n\n  @staticmethod\n  def load_item_profiles(item_profile_filepath):\n    item_profiles = pd.read_csv(item_profile_filepath)\n    return dict(zip(item_profiles[\'item_id\'], item_profiles[\'keyphrases\']))\n\n  def __init__(self, user_profile_filepath, item_profile_filepath, ratings_filepath, tokenizer):\n    self.user_profiles = ReviewsDataset.load_user_profiles(user_profile_filepath)\n    self.item_profiles = ReviewsDataset.load_item_profiles(item_profile_filepath)\n    self.ratings = pd.read_csv(ratings_filepath)\n    self.tokenizer = tokenizer\n\n  def __len__(self):\n    return len(self.ratings)\n\n  def __getitem__(self, idx):\n    rating = self.ratings.iloc[idx][\'rating\']\n    true_rating = self.ratings.iloc[idx][\'true_rating\']\n    user

In [None]:
class ReviewsDataset(Dataset):
  """Rating rows joined with tokenised profiles and a CLIP image embedding.

  Each sample is the tuple
  ``(user_id, item_id, user_profile_tokens, item_profile_tokens, rating,
  true_rating, image_features)``.
  """

  # Seconds to wait for an image download before using a blank image instead.
  IMAGE_REQUEST_TIMEOUT = 10

  @staticmethod
  def load_user_profiles(user_profile_filepath):
    """Read the user-profile CSV into ``{user_id: keyphrases}``."""
    user_profiles = pd.read_csv(user_profile_filepath)
    return dict(zip(user_profiles['user_id'], user_profiles['keyphrases']))


  @staticmethod
  def load_item_profiles(item_profile_filepath):
    """Read the item-profile CSV into ``{item_id: (keyphrases, image_link)}``."""
    item_profiles = pd.read_csv(item_profile_filepath)
    return dict(zip(item_profiles['item_id'], zip(item_profiles['keyphrases'], item_profiles['image_links'])))

  def __init__(self, user_profile_filepath, item_profile_filepath, ratings_filepath, tokenizer):
    """Load the three CSVs and the CLIP model used to embed item images.

    Args:
      user_profile_filepath: CSV with ``user_id`` and ``keyphrases`` columns.
      item_profile_filepath: CSV with ``item_id``, ``keyphrases`` and
        ``image_links`` columns.
      ratings_filepath: CSV with ``user_id``, ``item_id``, ``rating`` and
        ``true_rating`` columns.
      tokenizer: Callable applied to both profile texts.
    """
    self.user_profiles = ReviewsDataset.load_user_profiles(user_profile_filepath)
    self.item_profiles = ReviewsDataset.load_item_profiles(item_profile_filepath)
    self.ratings = pd.read_csv(ratings_filepath)
    self.tokenizer = tokenizer
    self.CLIPmodel = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
    self.CLIPprocessor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')
    # get_image_features returns projected embeddings, whose width is the
    # model's projection_dim.
    self.CLIPhidden_size = self.CLIPmodel.config.projection_dim

  def __len__(self):
    """Number of rating rows."""
    return len(self.ratings)

  def _load_image(self, image_link):
    """Return an RGB uint8 array for ``image_link``.

    A blank 224x224 image is returned when the link is missing or when the
    download / decoding fails (the error is printed, not raised).
    """
    blank = np.zeros((224, 224, 3), dtype=np.uint8)
    if pd.isna(image_link):
      return blank
    try:
      response = requests.get(image_link, timeout=self.IMAGE_REQUEST_TIMEOUT)
      response.raise_for_status()
      with Image.open(BytesIO(response.content)) as downloaded:
        # convert('RGB') covers grayscale as well as RGBA / palette / CMYK
        # images, so the processor always receives three channels.
        return np.array(downloaded.convert('RGB'))
    except Exception as e:
      print(f'Failed to load image because of error : {e}')
      return blank

  def _tokenize(self, text):
    """Tokenise one profile text, padded / truncated to 128 tokens."""
    return self.tokenizer(text=text,
                          padding="max_length",
                          max_length=128,
                          add_special_tokens=True,
                          return_attention_mask=True,
                          truncation=True,
                          return_tensors="pt")

  def __getitem__(self, idx):
    """Build the sample for rating row ``idx`` (may download the item image)."""
    row = self.ratings.iloc[idx]
    user_id = row['user_id']
    item_id = row['item_id']

    user_profile = self.user_profiles[user_id]
    item_profile, image_link = self.item_profiles[item_id]

    image = self._load_image(image_link)
    CLIPinput = self.CLIPprocessor(images=image, return_tensors="pt")
    # CLIP only acts as a fixed feature extractor here; without no_grad every
    # sample would carry an autograd graph into the batch.
    with torch.no_grad():
      CLIPoutput = self.CLIPmodel.get_image_features(**CLIPinput)

    user_profile_tokens = self._tokenize(user_profile)
    item_profile_tokens = self._tokenize(item_profile)

    return user_id, item_id, user_profile_tokens, item_profile_tokens, row['rating'], row['true_rating'], CLIPoutput


In [None]:
class BertWithCustomHead(nn.Module):
    """``bert-base-uncased`` followed by a three-layer ReLU projection head.

    Encoder layers below the last ``fine_tune_last_n_layers`` are frozen;
    parameters outside ``encoder.layer`` are left trainable.
    """

    def __init__(self, output_dim=200, fine_tune_last_n_layers=1):
        """Load BERT, freeze the lower encoder layers and build the head.

        Args:
            output_dim: Width of the representation returned by ``forward``.
            fine_tune_last_n_layers: Number of top encoder layers left
                trainable.
        """
        super(BertWithCustomHead, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Encoder layers with an index below this stay frozen.
        first_trainable_layer = self.bert.config.num_hidden_layers - fine_tune_last_n_layers
        for param_name, weight in self.bert.named_parameters():
            if not param_name.startswith('encoder.layer'):
                continue
            # Names look like "encoder.layer.<index>....".
            if int(param_name.split('.')[2]) < first_trainable_layer:
                weight.requires_grad = False

        head_layers = [
            nn.Linear(self.bert.config.hidden_size, LATENT_DIM),
            nn.ReLU(),
            nn.Linear(LATENT_DIM, LATENT_DIM),
            nn.ReLU(),
            nn.Linear(LATENT_DIM, output_dim),
            nn.ReLU(),
        ]
        self.feed_forward = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        """Return the head applied to BERT's pooled output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.feed_forward(encoded.pooler_output)

In [None]:
class BertWithCustomHeadCLIP(nn.Module):
    """BERT tower whose head also consumes a CLIP image embedding.

    BERT's pooled output is concatenated with the image embedding and passed
    through a three-layer ReLU head. Encoder layers below the last
    ``fine_tune_last_n_layers`` are frozen.
    """

    def __init__(self, output_dim=200, fine_tune_last_n_layers=1):
        """Load BERT and CLIP, freeze the lower encoder layers, build the head.

        Args:
            output_dim: Width of the representation returned by ``forward``.
            fine_tune_last_n_layers: Number of top encoder layers left
                trainable.
        """
        super(BertWithCustomHeadCLIP, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        for name, param in self.bert.named_parameters():
            if name.startswith('encoder.layer') and int(name.split('.')[2]) < (self.bert.config.num_hidden_layers - fine_tune_last_n_layers):
                param.requires_grad = False

        # The CLIP model is not used in forward(); it is kept as an attribute
        # so existing state_dicts and attribute access keep working.
        self.CLIPmodel = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
        # Image embeddings produced by CLIPModel.get_image_features have the
        # projection width, not the text encoder's hidden size (the two only
        # happen to be equal for this checkpoint).
        self.CLIPhidden_size = self.CLIPmodel.config.projection_dim

        self.feed_forward = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size + self.CLIPhidden_size, LATENT_DIM),
            nn.ReLU(),
            nn.Linear(LATENT_DIM, LATENT_DIM),
            nn.ReLU(),
            nn.Linear(LATENT_DIM, output_dim),
            nn.ReLU()
        )

    def forward(self, input_ids, attention_mask, image_embedding):
        """Return the head applied to ``[pooled BERT output, image embedding]``.

        ``image_embedding`` has its dimension 1 squeezed before concatenation.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output

        concatenated_output = torch.cat((pooled_output, image_embedding.squeeze(dim=1)), dim=1)
        custom_output = self.feed_forward(concatenated_output)
        return custom_output

In [None]:
class TwoTowerModel(nn.Module):
    """Two text towers scored by a dot product.

    With ``train=True`` each tower output is concatenated with a
    pre-computed embedding looked up by raw user / item id before scoring.
    """

    def __init__(self, user_embeddings_dict=None, item_embeddings_dict=None, output_dim=200):
        """Build both towers and store the optional id -> embedding tables.

        Args:
            user_embeddings_dict: Mapping user id -> vector, or ``None``.
            item_embeddings_dict: Mapping item id -> vector, or ``None``.
            output_dim: Output width of each tower.
        """
        super(TwoTowerModel, self).__init__()
        self.user_tower = BertWithCustomHead(output_dim=output_dim)
        self.item_tower = BertWithCustomHead(output_dim=output_dim)

        # Both attributes always exist (possibly as None) so forward() can
        # report a missing table clearly.
        self.user_embeddings_dict = self._as_tensor_table(user_embeddings_dict)
        self.item_embeddings_dict = self._as_tensor_table(item_embeddings_dict)

    @staticmethod
    def _as_tensor_table(embeddings_dict):
        """Convert every vector of the mapping to a float32 tensor."""
        if embeddings_dict is None:
            return None
        return {key: torch.tensor(vector, dtype=torch.float32) for key, vector in embeddings_dict.items()}

    def forward(self, user_raw_ids, item_raw_ids, user_input_ids, user_attention_mask, item_input_ids, item_attention_mask, train=True):
        """Return one score per (user, item) pair in the batch.

        Raises:
            ValueError: ``train`` is true but the model was built without
                user or item embedding tables.
        """
        if train == True:
            user_table = getattr(self, 'user_embeddings_dict', None)
            item_table = getattr(self, 'item_embeddings_dict', None)
            if user_table is None or item_table is None:
                raise ValueError(
                    "train=True requires user_embeddings_dict and item_embeddings_dict; "
                    "pass them to the constructor or call the model with train=False"
                )

        user_repr = self.user_tower(user_input_ids, user_attention_mask)
        item_repr = self.item_tower(item_input_ids, item_attention_mask)

        if train == True:
            user_additional_embeddings = torch.stack([user_table[raw_id].to(user_input_ids.device) for raw_id in user_raw_ids])
            item_additional_embeddings = torch.stack([item_table[raw_id].to(item_input_ids.device) for raw_id in item_raw_ids])

            user_repr = torch.cat((user_repr, user_additional_embeddings), dim=1)
            item_repr = torch.cat((item_repr, item_additional_embeddings), dim=1)

        score = torch.sum(user_repr * item_repr, dim=1)
        return score

In [None]:
class TwoTowerModelCLIP(nn.Module):
    """Two text+image towers scored by a dot product.

    Both towers receive the same ``image_embedding``. With ``train=True`` each
    tower output is concatenated with a pre-computed embedding looked up by
    raw user / item id before scoring.
    """

    def __init__(self, user_embeddings_dict=None, item_embeddings_dict=None, output_dim=200):
        """Build both towers and store the optional id -> embedding tables.

        Args:
            user_embeddings_dict: Mapping user id -> vector, or ``None``.
            item_embeddings_dict: Mapping item id -> vector, or ``None``.
            output_dim: Output width of each tower.
        """
        super(TwoTowerModelCLIP, self).__init__()
        self.user_tower = BertWithCustomHeadCLIP(output_dim=output_dim)
        self.item_tower = BertWithCustomHeadCLIP(output_dim=output_dim)

        # Both attributes always exist (possibly as None) so forward() can
        # report a missing table clearly.
        self.user_embeddings_dict = self._as_tensor_table(user_embeddings_dict)
        self.item_embeddings_dict = self._as_tensor_table(item_embeddings_dict)

    @staticmethod
    def _as_tensor_table(embeddings_dict):
        """Convert every vector of the mapping to a float32 tensor."""
        if embeddings_dict is None:
            return None
        return {key: torch.tensor(vector, dtype=torch.float32) for key, vector in embeddings_dict.items()}

    def forward(self, user_raw_ids, item_raw_ids, user_input_ids, user_attention_mask, item_input_ids, item_attention_mask, image_embedding, train=True):
        """Return one score per (user, item) pair in the batch.

        Raises:
            ValueError: ``train`` is true but the model was built without
                user or item embedding tables.
        """
        if train == True:
            user_table = getattr(self, 'user_embeddings_dict', None)
            item_table = getattr(self, 'item_embeddings_dict', None)
            if user_table is None or item_table is None:
                raise ValueError(
                    "train=True requires user_embeddings_dict and item_embeddings_dict; "
                    "pass them to the constructor or call the model with train=False"
                )

        # NOTE(review): the user tower is fed the item's image embedding as
        # well - confirm this is the intended architecture.
        user_repr = self.user_tower(user_input_ids, user_attention_mask, image_embedding)
        item_repr = self.item_tower(item_input_ids, item_attention_mask, image_embedding)

        if train == True:
            user_additional_embeddings = torch.stack([user_table[raw_id].to(user_input_ids.device) for raw_id in user_raw_ids])
            item_additional_embeddings = torch.stack([item_table[raw_id].to(item_input_ids.device) for raw_id in item_raw_ids])

            user_repr = torch.cat((user_repr, user_additional_embeddings), dim=1)
            item_repr = torch.cat((item_repr, item_additional_embeddings), dim=1)

        score = torch.sum(user_repr * item_repr, dim=1)
        return score

In [None]:
# Build the generator for the configured domain pair. Constructing it also
# loads the keyphrase model (see DataGeneration.__init__).
dg = DataGeneration(
    source_domain_name = config['source_domain'],
    target_domain_name = config['target_domain'],
    save_path = config['save_path']
)

In [None]:
# Run every stage (download + filter, rating CSVs, user/item profiles,
# train/val split); all outputs are written under config['save_path'].
dg.generation_pipline()

Loading and Filtering Data...


Downloading data:   0%|          | 0.00/327M [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

Downloading data:   0%|          | 0.00/213M [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

Downloading data:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

Downloading data:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

Unique common users: 4772
Generating source user-item csv...
Positive samples: 18565, Negative samples: 5432, Total samples: 23997
(92825, 4)
Generating target user-item csv...
Positive samples: 20679, Negative samples: 6090, Total samples: 26769
(103395, 4)
Generating source user profile csv...


Token indices sequence length is longer than the specified maximum sequence length for this model (727 > 512). Running this sequence through the model will result in indexing errors


Generating source item profile csv...
Generating target item profile csv...


In [None]:
# Tokenizer shared by all three datasets.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# Train / validation: source-domain users, items and rating splits.
train_ds = ReviewsDataset(f"{config['save_path']}/{config['source_domain']}_user_profiles.csv", f"{config['save_path']}/{config['source_domain']}_item_profiles.csv", f"{config['save_path']}/{config['source_domain']}_train_user_item_ratings.csv", tokenizer)
val_ds = ReviewsDataset(f"{config['save_path']}/{config['source_domain']}_user_profiles.csv", f"{config['save_path']}/{config['source_domain']}_item_profiles.csv", f"{config['save_path']}/{config['source_domain']}_val_user_item_ratings.csv", tokenizer)

# Test: source-domain user profiles paired with target-domain items and ratings.
test_ds = ReviewsDataset(f"{config['save_path']}/{config['source_domain']}_user_profiles.csv", f"{config['save_path']}/{config['target_domain']}_item_profiles.csv", f"{config['save_path']}/{config['target_domain']}_user_item_ratings.csv", tokenizer)

In [None]:
# Batches of 16; only the training loader shuffles. drop_last=True discards the
# final incomplete batch in all three loaders, so up to 15 rows per split are
# never seen.
train_dl = DataLoader(train_ds, batch_size=16, shuffle=True, drop_last=True)
val_dl = DataLoader(val_ds, batch_size=16, shuffle=False, drop_last=True)
test_dl = DataLoader(test_ds, batch_size=16, shuffle=False, drop_last=True)

In [None]:
# MF: ALS matrix-factorisation embeddings for users and items
def get_latent_factors(path, factors=100, regularization=0.01, random_state=None):
  """Fit an implicit ALS model on a ratings CSV and return per-id factor vectors.

  Args:
    path: CSV file with `user_id`, `item_id` and `rating` columns.
    factors: Number of latent factors per user/item (default 100, the value
      that used to be hard-coded).
    regularization: ALS regularization strength (default 0.01, as before).
    random_state: Optional seed forwarded to AlternatingLeastSquares so runs
      can be reproduced; None keeps the library's default behaviour.

  Returns:
    A `(user_embeddings_dict, item_embeddings_dict)` tuple. Each dict maps a
    raw id from the CSV to that id's row of the fitted factor matrix.
  """
  data = pd.read_csv(path)

  # Contiguous matrix indices, assigned in order of first appearance.
  user_id_map = {user_id: index for index, user_id in enumerate(data['user_id'].unique())}
  item_id_map = {item_id: index for index, item_id in enumerate(data['item_id'].unique())}

  # Users are rows and items are columns. Repeated (user, item) rows in the
  # CSV are summed by this constructor form.
  sparse_matrix = csr_matrix(
      (data['rating'].to_numpy(),
       (data['user_id'].map(user_id_map).to_numpy(),
        data['item_id'].map(item_id_map).to_numpy())),
      shape=(len(user_id_map), len(item_id_map)))

  model = AlternatingLeastSquares(factors=factors,
                                  regularization=regularization,
                                  random_state=random_state)
  model.fit(sparse_matrix)

  user_factors = model.user_factors
  item_factors = model.item_factors
  user_embeddings_dict = {raw_id: user_factors[index] for raw_id, index in user_id_map.items()}
  item_embeddings_dict = {raw_id: item_factors[index] for raw_id, index in item_id_map.items()}

  return user_embeddings_dict, item_embeddings_dict

# Fit the latent factors on the source-domain training ratings.
source_train_ratings_csv = f"{config['save_path']}/{config['source_domain']}_train_user_item_ratings.csv"
user_embeddings_dict, item_embeddings_dict = get_latent_factors(source_train_ratings_csv)

  check_blas_config()


  0%|          | 0/15 [00:00<?, ?it/s]

In [None]:
import pickle

def save_embeddings(embeddings_dict, file_name):
    """Pickle `embeddings_dict` to `file_name`, replacing any existing file."""
    with open(file_name, mode='wb') as sink:
        pickle.dump(embeddings_dict, sink)

def load_embeddings(file_name):
    """Return the object unpickled from `file_name`.

    Unpickling can execute arbitrary code, so only point this at files the
    notebook wrote itself.
    """
    with open(file_name, mode='rb') as source:
        restored = pickle.load(source)
    return restored

save_path = config['save_path']

# Write the user and item embedding dicts next to the other generated artefacts.
save_embeddings(user_embeddings_dict, f"{save_path}/{config['source_domain']}_als_user_embeddings.pkl")
save_embeddings(item_embeddings_dict, f"{save_path}/{config['source_domain']}_als_item_embeddings.pkl")


In [None]:
# Reload the pickled embedding dicts (relies on `save_path` set in the previous cell).
user_embeddings_dict = load_embeddings(f"{save_path}/{config['source_domain']}_als_user_embeddings.pkl")
item_embeddings_dict = load_embeddings(f"{save_path}/{config['source_domain']}_als_item_embeddings.pkl")

In [None]:
# Alternative model variant, kept commented out for switching experiments:
# model = TwoTowerModel(user_embeddings_dict=user_embeddings_dict, item_embeddings_dict=item_embeddings_dict, output_dim=200).to(device)
model = TwoTowerModelCLIP(user_embeddings_dict=user_embeddings_dict, item_embeddings_dict=item_embeddings_dict, output_dim=200).to(device)

# Only parameters that still require gradients are optimised.
trainable_parameters = [param for param in model.parameters() if param.requires_grad]

# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are pinned to the transformers defaults (1e-6 and 0.0)
# because torch.optim.AdamW defaults to 1e-8 and 0.01.
optimizer = torch.optim.AdamW(trainable_parameters, lr=1e-5, eps=1e-6, weight_decay=0.0)

# The model output is treated as raw logits; the sigmoid is applied inside the loss.
loss_function = nn.BCEWithLogitsLoss()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



In [None]:
# Ratings tables for each split, plus the per-row user ids from each table.
source_ratings_prefix = f"{config['save_path']}/{config['source_domain']}"
target_ratings_prefix = f"{config['save_path']}/{config['target_domain']}"

train_user_item_ratingsdf = pd.read_csv(f"{source_ratings_prefix}_train_user_item_ratings.csv")
val_user_item_ratingsdf = pd.read_csv(f"{source_ratings_prefix}_val_user_item_ratings.csv")
infer_user_item_ratingsdf = pd.read_csv(f"{target_ratings_prefix}_user_item_ratings.csv")

train_users = train_user_item_ratingsdf['user_id'].to_list()
val_users = val_user_item_ratingsdf['user_id'].to_list()
infer_users = infer_user_item_ratingsdf['user_id'].to_list()

In [None]:
def dcg_at_k(r, k, method=1):
    """Discounted cumulative gain of the first `k` relevance scores in `r`.

    Args:
        r: Relevance scores in ranked order (first element is the top result).
        k: Number of leading results to consider.
        method: 0 weights the results 1, 1, 1/log2(3), 1/log2(4), ...;
            1 weights them 1, 1/log2(3), 1/log2(4), ...

    Returns:
        The DCG value, or 0. when `r[:k]` is empty or `method` is neither
        0 nor 1.
    """
    # np.asfarray was removed in NumPy 2.0; asarray with a float dtype
    # performs the same conversion.
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
    return 0.

def ndcg_at_k(r, k, method=1):
    """Normalised DCG: `dcg_at_k(r)` divided by the DCG of `r` sorted descending.

    Returns 0. when the ideal DCG is zero.
    """
    ideal_dcg = dcg_at_k(sorted(r, reverse=True), k, method)
    return dcg_at_k(r, k, method) / ideal_dcg if ideal_dcg else 0.


import os  # needed for the checkpoint directory handling below


def _forward_batch(model, batch, device, is_training):
    """Move one batch to `device`, run `model` on it and return (user_ids, scores, ratings).

    Reads batch[0] and batch[1] (raw user/item ids), batch[2] and batch[3]
    (dicts with `input_ids` and `attention_mask`), batch[4] (ratings) and
    batch[6] (image input). `is_training` is forwarded unchanged as the
    model's last positional argument.
    """
    user_ids_raw, item_ids_raw = batch[0], batch[1]
    user_input_ids = batch[2]['input_ids'].squeeze(1).to(device)
    user_attention_mask = batch[2]['attention_mask'].to(device)
    item_input_ids = batch[3]['input_ids'].squeeze(1).to(device)
    item_attention_mask = batch[3]['attention_mask'].to(device)
    ratings = batch[4].to(device)
    image = batch[6].to(device)
    scores = model(user_ids_raw, item_ids_raw, user_input_ids, user_attention_mask,
                   item_input_ids, item_attention_mask, image, is_training)
    return user_ids_raw, scores, ratings


def _record_batch(users_dict, user_ids, scores, ratings):
    """Append each example's logit, thresholded prediction and label to its user's lists."""
    # One device-to-host transfer per tensor instead of one per example.
    logits = scores.detach().reshape(-1).cpu()
    hard_predictions = ((torch.sigmoid(logits) >= 0.5) * 1).tolist()
    labels = ratings.detach().reshape(-1).cpu().tolist()
    for user, logit, prediction, label in zip(user_ids, logits.tolist(), hard_predictions, labels):
        user_lists = users_dict[user]
        user_lists[0].append(logit)
        user_lists[1].append(prediction)
        user_lists[2].append(label)


def _ranking_metrics(users_dict, k=5):
    """Compute per-user precision@k, recall@k and NDCG@k from recorded outputs.

    Users with no recorded examples or no positive label are skipped.
    Returns three parallel lists: (precisions, recalls, ndcgs).
    """
    precisions, recalls, ndcgs = [], [], []
    for user in tqdm(users_dict):
        user_scores, user_predictions, user_labels = (
            np.asarray(values, dtype=float) for values in users_dict[user]
        )
        total_positives = user_labels.sum()
        if not len(user_scores) or not total_positives:
            continue
        cutoff = min(k, len(user_scores))

        # Rank by descending score. np.argpartition only selects the top-k and
        # leaves them unordered, which made the NDCG below arbitrary.
        top_k_indices = np.argsort(-user_scores, kind='stable')[:cutoff]
        top_k_preds = user_predictions[top_k_indices]
        top_k_true = user_labels[top_k_indices]

        # A hit is a top-k item that is both predicted positive and labelled positive.
        true_positives = np.sum(top_k_preds * top_k_true)
        precisions.append(true_positives / cutoff)
        recalls.append(true_positives / total_positives)
        ndcgs.append(ndcg_at_k(top_k_true, cutoff))
    return precisions, recalls, ndcgs


num_epochs = 3


for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    epoch_loss = 0
    train_users_dict = {user: [[], [], []] for user in train_users}
    val_users_dict = {user: [[], [], []] for user in val_users}
    for batch in tqdm(train_dl):
        user_ids_raw, scores, ratings = _forward_batch(model, batch, device, True)
        loss = loss_function(scores, ratings)

        # Clear gradients before backward so an interrupted earlier run of
        # this cell cannot leak stale gradients into the first step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        _record_batch(train_users_dict, user_ids_raw, scores, ratings)

    epoch_precision, epoch_recall, epoch_ndcg = _ranking_metrics(train_users_dict)
    average_epoch_precision = np.mean(epoch_precision)
    average_epoch_recall = np.mean(epoch_recall)
    average_epoch_ndcg = np.mean(epoch_ndcg)

    # ---- validation pass ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_dl):
            user_ids_raw, scores, ratings = _forward_batch(model, batch, device, False)
            val_loss += loss_function(scores, ratings).item()
            _record_batch(val_users_dict, user_ids_raw, scores, ratings)

    val_precision, val_recall, val_ndcg = _ranking_metrics(val_users_dict)
    average_val_precision = np.mean(val_precision)
    average_val_recall = np.mean(val_recall)
    average_val_ndcg = np.mean(val_ndcg)

    # ---- checkpoint ----
    print(f'Saving model for epoch {epoch}')
    model_path = config['save_path'] + '/models/'+ f'Epoch{epoch}_model_state_dict.pth'
    # exist_ok avoids the check-then-create race and the per-epoch status print.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    torch.save(model.state_dict(), model_path)

    print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {epoch_loss/len(train_dl)}, Validation Loss: {val_loss/len(val_dl)}, Training Precision: {average_epoch_precision}, Training Recall: {average_epoch_recall}, Training NDCG: {average_epoch_ndcg}, Val Precision: {average_val_precision}, Val Recall: {average_val_recall}, Val NDCG: {average_val_ndcg}')

  0%|          | 0/428 [00:00<?, ?it/s]

KeyboardInterrupt: 

Inference

In [None]:
# Evaluate the trained model on the target-domain interactions.
model.eval()
infer_users_dict = {user: [[], [], []] for user in infer_users}
infer_loss = 0
infer_precision = []
infer_recall = []
with torch.no_grad():
    for batch in tqdm(test_dl):
        user_ids_raw = batch[0]
        item_ids_raw = batch[1]
        user_input_ids = batch[2]['input_ids'].squeeze(1).to(device)
        user_attention_mask = batch[2]['attention_mask'].to(device)
        item_input_ids = batch[3]['input_ids'].squeeze(1).to(device)
        item_attention_mask = batch[3]['attention_mask'].to(device)
        ratings = batch[4].to(device)
        image = batch[6].to(device)

        scores = model(user_ids_raw, item_ids_raw, user_input_ids, user_attention_mask,
                       item_input_ids, item_attention_mask, image, False)

        # Accumulate into infer_loss. This used to add to val_loss, which
        # corrupted the validation total and failed if validation never ran.
        infer_loss += loss_function(scores, ratings).item()

        # One device-to-host transfer per tensor instead of one per example.
        batch_logits = scores.reshape(-1).cpu()
        batch_predictions = ((torch.sigmoid(batch_logits) >= 0.5) * 1).tolist()
        batch_labels = ratings.reshape(-1).cpu().tolist()
        for user, logit, prediction, label in zip(user_ids_raw, batch_logits.tolist(),
                                                  batch_predictions, batch_labels):
            infer_users_dict[user][0].append(logit)
            infer_users_dict[user][1].append(prediction)
            infer_users_dict[user][2].append(label)

for user in tqdm(infer_users_dict):
    user_scores, user_predictions, user_labels = (
        np.asarray(values, dtype=float) for values in infer_users_dict[user]
    )
    total_positives = user_labels.sum()
    # Skip users with no recorded examples or no positive label.
    if not len(user_scores) or not total_positives:
        continue
    k = min(5, len(user_scores))

    # Order within the top-k does not affect precision or recall.
    top_k_indices = np.argpartition(user_scores, -k)[-k:]
    top_k_preds = user_predictions[top_k_indices]
    top_k_true = user_labels[top_k_indices]

    # A hit is a top-k item that is both predicted positive and labelled positive.
    true_positives = np.sum(top_k_preds * top_k_true)

    infer_precision.append(true_positives / k)
    infer_recall.append(true_positives / total_positives)

average_infer_precision = np.mean(infer_precision)
average_infer_recall = np.mean(infer_recall)
# These are inference metrics; the message used to label them "Training".
print(f'Inference Loss: {infer_loss / max(len(test_dl), 1)}, Inference Precision: {average_infer_precision}, Inference Recall: {average_infer_recall}')

  0%|          | 0/211 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
import os  # needed to create the checkpoint directory

model_path = config['save_path'] + '/models/'+ 'model_state_dict.pth'

# Create the parent directory, not model_path itself: model_path is the file
# torch.save writes. Without this the save fails when no epoch checkpoint has
# created the folder yet.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

torch.save(model.state_dict(), model_path)