In [1]:
import numpy as np
import pandas as pd
import os
import sklearn as sk
import torch
from torch_sparse import SparseTensor
from scipy import sparse

from GNNfuncs import *

In [2]:
cwd = '..'
artists = pd.read_csv(os.path.join(cwd,'data','artists.dat'), delimiter='\t')
tags = pd.read_csv(os.path.join(cwd,'data','tags.dat'), delimiter='\t',encoding='ISO-8859-1')
user_artists = pd.read_csv(os.path.join(cwd,'data','user_artists.dat'), delimiter='\t')
user_friends = pd.read_csv(os.path.join(cwd,'data','user_friends.dat'), delimiter='\t')
user_taggedartists_timestamps = pd.read_csv(os.path.join(cwd,'data','user_taggedartists-timestamps.dat'), delimiter='\t')
user_taggedartists = pd.read_csv(os.path.join(cwd,'data','user_taggedartists.dat'), delimiter='\t')

In [3]:
users = user_artists['userID'].unique()
singleartistusers = [user for user in users if len(get_artists(user,user_artists)) == 1]
singleartistusersdf = user_artists[user_artists['userID'].isin(singleartistusers)]
user_artists_temp = user_artists[~user_artists['userID'].isin(singleartistusers)]

In [4]:
from sklearn.model_selection import train_test_split

user_artists_train, user_artists_test = train_test_split(user_artists_temp, test_size = 0.2, stratify = user_artists_temp['userID'], random_state = 47)

user_artists_train = pd.concat([user_artists_train,singleartistusersdf])

In [5]:
user_taggedartists_test = user_taggedartists.merge(user_artists_test[['userID','artistID']], on = ['userID','artistID'], how = 'inner')
user_taggedartists_train = user_taggedartists.merge(user_artists_test[['userID','artistID']], on = ['userID','artistID'], how = 'left', indicator = True)
user_taggedartists_train = user_taggedartists_train[user_taggedartists_train['_merge'] == 'left_only'].drop(columns = ['_merge'])

In [20]:
#dfs = [user_artists_train,user_artists_test,user_taggedartists_train,user_taggedartists_test]
dfs = [user_artists_train,user_artists_test,user_friends]

filepath = os.path.join(cwd,'SheridanH','LightGCN','data','lastfm2')
if not os.path.exists(filepath):
    os.makedirs(filepath)

for df in dfs:
    df.to_csv(os.path.join(filepath,get_df_name(df, globals()) + '.txt'),sep='\t',header=False,index=False)

## Adjacency Matrix

### Interaction Matrix

In [11]:
user_vertices = user_artists['userID'].unique()
artist_vertices = artists['id'].unique()
user_artist_edges = user_artists_train[['userID','artistID']]
user_artist_edges = user_artist_edges.sort_values(by = ['userID','artistID'])
user_artist_edges

Unnamed: 0,userID,artistID
0,2,51
1,2,52
2,2,53
3,2,54
6,2,57
...,...,...
92828,2100,18725
92830,2100,18727
92831,2100,18728
92832,2100,18729


In [12]:
src = list(user_artist_edges['userID'])
dst = list(user_artist_edges['artistID'])
edges = [src,dst]

num_users = int(max(user_vertices)) + 1
num_artists = int(max(artist_vertices)) + 1

interaction_matrix = np.zeros([num_users,num_artists])

for src,dst in zip(edges[0],edges[1]):
    interaction_matrix[src][dst] = 1

## LightGCN

In [13]:
import subprocess

repo_url = "https://github.com/gusye1234/LightGCN-PyTorch"
clone_directory = "../SheridanH/LightGCN"

subprocess.run(["git", "clone", repo_url, clone_directory])

CompletedProcess(args=['git', 'clone', 'https://github.com/gusye1234/LightGCN-PyTorch', '../SheridanH/LightGCN'], returncode=128)

In [35]:
print(user_artists_train.sort_values(by=['userID','artistID']).iloc[:10])
print(user_artists_test.sort_values(by=['userID','artistID']).iloc[:10])

    userID  artistID  weight
0        2        51   13883
1        2        52   11690
2        2        53   11351
3        2        54   10300
6        2        57    5955
7        2        58    4616
9        2        60    4147
10       2        61    3923
12       2        63    3735
13       2        64    3644
    userID  artistID  weight
4        2        55    8983
5        2        56    6152
8        2        59    4337
11       2        62    3782
27       2        78    2119
34       2        85    1638
41       2        92    1411
46       2        97    1337
47       2        98    1332
48       2        99    1330


Defining the dataset

already has lastfm dataset defined but it is slightly different to ours, so we will add ours to dataloader.py

In [21]:
print(max(users))
artistlist = list(artists['id'].unique())
print(max(artistlist))

2100
18745


In [22]:
import os
from os.path import join
import sys
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from scipy.sparse import csr_matrix
import scipy.sparse as sp
import world
from time import time

code_to_write = '''
class LastFM2(BasicDataset):
    """
    Dataset type for pytorch
    LastFM dataset 2
    """
    def __init__(self, path="../data/lastfm2"):
        # train or test
        cprint("loading [last fm]")
        self.mode_dict = {'train':0, "test":1}
        self.mode    = self.mode_dict['train']
        # self.n_users = 2100
        # self.m_items = 18745
        trainData = pd.read_table(join(path, 'user_artists_train.txt'), header=None)
        print(trainData.head())
        testData  = pd.read_table(join(path, 'user_artists_test.txt'), header=None)
        print(testData.head())
        trustNet  = pd.read_table(join(path, 'user_friends.txt'), header=None).to_numpy()
        print(trustNet[:5])
        trustNet -= 1
        trainData-= 1
        testData -= 1
        self.trustNet  = trustNet
        self.trainData = trainData
        self.testData  = testData
        self.trainUser = np.array(trainData[:][0])
        self.trainUniqueUsers = np.unique(self.trainUser)
        self.trainItem = np.array(trainData[:][1])
        # self.trainDataSize = len(self.trainUser)
        self.testUser  = np.array(testData[:][0])
        self.testUniqueUsers = np.unique(self.testUser)
        self.testItem  = np.array(testData[:][1])
        self.Graph = None
        print(f"LastFm2 Sparsity : {(len(self.trainUser) + len(self.testUser))/self.n_users/self.m_items}")
        
        # (users,users)
        self.socialNet    = csr_matrix((np.ones(len(trustNet)), (trustNet[:,0], trustNet[:,1]) ), shape=(self.n_users,self.n_users))
        # (users,items), bipartite graph
        self.UserItemNet  = csr_matrix((np.ones(len(self.trainUser)), (self.trainUser, self.trainItem) ), shape=(self.n_users,self.m_items)) 
        
        # pre-calculate
        self._allPos = self.getUserPosItems(list(range(self.n_users)))
        self.allNeg = []
        allItems    = set(range(self.m_items))
        for i in range(self.n_users):
            pos = set(self._allPos[i])
            neg = allItems - pos
            self.allNeg.append(np.array(list(neg)))
        self.__testDict = self.__build_test()

    @property
    def n_users(self):
        return 2100
    
    @property
    def m_items(self):
        return 18745
    
    @property
    def trainDataSize(self):
        return len(self.trainUser)
    
    @property
    def testDict(self):
        return self.__testDict

    @property
    def allPos(self):
        return self._allPos

    def getSparseGraph(self):
        if self.Graph is None:
            user_dim = torch.LongTensor(self.trainUser)
            item_dim = torch.LongTensor(self.trainItem)
            
            first_sub = torch.stack([user_dim, item_dim + self.n_users])
            second_sub = torch.stack([item_dim+self.n_users, user_dim])
            index = torch.cat([first_sub, second_sub], dim=1)
            data = torch.ones(index.size(-1)).int()
            self.Graph = torch.sparse.IntTensor(index, data, torch.Size([self.n_users+self.m_items, self.n_users+self.m_items]))
            dense = self.Graph.to_dense()
            D = torch.sum(dense, dim=1).float()
            D[D==0.] = 1.
            D_sqrt = torch.sqrt(D).unsqueeze(dim=0)
            dense = dense/D_sqrt
            dense = dense/D_sqrt.t()
            index = dense.nonzero()
            data  = dense[dense >= 1e-9]
            assert len(index) == len(data)
            self.Graph = torch.sparse.FloatTensor(index.t(), data, torch.Size([self.n_users+self.m_items, self.n_users+self.m_items]))
            self.Graph = self.Graph.coalesce().to(world.device)
        return self.Graph

    def __build_test(self):
        """
        return:
            dict: {user: [items]}
        """
        test_data = {}
        for i, item in enumerate(self.testItem):
            user = self.testUser[i]
            if test_data.get(user):
                test_data[user].append(item)
            else:
                test_data[user] = [item]
        return test_data
    
    def getUserItemFeedback(self, users, items):
        """
        users:
            shape [-1]
        items:
            shape [-1]
        return:
            feedback [-1]
        """
        # print(self.UserItemNet[users, items])
        return np.array(self.UserItemNet[users, items]).astype('uint8').reshape((-1, ))
    
    def getUserPosItems(self, users):
        posItems = []
        for user in users:
            posItems.append(self.UserItemNet[user].nonzero()[1])
        return posItems
    
    def getUserNegItems(self, users):
        negItems = []
        for user in users:
            negItems.append(self.allNeg[user])
        return negItems
            
    
    
    def __getitem__(self, index):
        user = self.trainUniqueUsers[index]
        # return user_id and the positive items of the user
        return user
    
    def switch2test(self):
        """
        change dataset mode to offer test data to dataloader
        """
        self.mode = self.mode_dict['test']
    
    def __len__(self):
        return len(self.trainUniqueUsers)
'''

In [None]:
with open("../SheridanH/LightGCN/code/dataloader.py", "a") as file:
    file.write(code_to_write)

In [None]:
!cd ../SheridanH/LightGCN/code && python main.py --decay=1e-4 --lr=0.001 --layer=3 --seed=2020 --dataset="lastfm2" --topks="[20]" --recdim=64

^C
