In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch_geometric.transforms as T
from torch_geometric.data import HeteroData
from torch_geometric.data import download_url, extract_zip
from torch_geometric.nn import SAGEConv, to_hetero

url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
extract_zip(download_url(url, '.'), '.')


movie_path = './ml-latest-small/movies.csv'
rating_path = './ml-latest-small/ratings.csv'
movies = pd.read_csv(movie_path)
movies
# display(movies)
# display(ratings)

  from .autonotebook import tqdm as notebook_tqdm
Downloading https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Extracting ./ml-latest-small.zip


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [3]:

class datapipeline:
    def __init__(self):
        movie_path = './ml-latest-small/movies.csv'
        rating_path = './ml-latest-small/ratings.csv'
        movies = pd.read_csv(movie_path)
        ratings = pd.read_csv(rating_path)

        genres_expanded = movies['genres'].str.get_dummies('|')
        movies_expanded = pd.concat([movies, genres_expanded], axis=1)

        movie_map = movies_expanded.movieId.drop_duplicates().reset_index(drop=True).to_dict()
        movie_map = {v: k for k, v in movie_map.items()}

        user_map  = ratings.userId.drop_duplicates().reset_index(drop=True).to_dict()
        user_map = {v: k for k, v in user_map.items()}

        ratings.userId = ratings.userId.map(user_map)
        ratings.movieId = ratings.movieId.map(movie_map)
        movies.movieId = movies.movieId.map(movie_map)

        edges = ratings[['userId', 'movieId']].to_numpy()
        edges = np.transpose(edges)

        y = ratings['rating'].to_numpy()

        movie_x = movies_expanded.drop(columns=['movieId', 'title', 'genres']).to_numpy()

        def create_heterograph(self):
            #Create heterograph
            data = HeteroData()

            data['user'].x = torch.eye(len(user_map))
            data['movie'].x = torch.from_numpy(x).to(torch.float)

            edge_index = torch.from_numpy(edges)
            edge_labels = torch.from_numpy(y).to(torch.float)

            data[('user' , 'rates' , 'movie')].edge_index = edge_index
            data[('user' , 'rates' , 'movie')].edge_label = edge_labels

            data = T.ToUndirected()(data)
""""""
            del data['movie', 'rev_rates', 'user'].edge_label

            assert edge_index.size(1) == len(edge_labels)


        def run(self):
            create_heterograph()


HeteroData(
  user={
    num_nodes=610,
    x=[610, 610],
  },
  movie={ x=[9742, 20] },
  (user, rates, movie)={
    edge_index=[2, 100836],
    edge_label=[100836],
  },
  (movie, rev_rates, user)={ edge_index=[2, 100836] }
)


In [193]:



class GraphSAGEEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        # print (x)
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = torch.nn.Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = torch.nn.Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        global zz_dict
        zz_dict = z_dict
        global z
        row, col = edge_label_index
        z = torch.cat([z_dict['user'][row], z_dict['movie'][col]], dim=-1)
        z = self.lin1(z).relu()
        z = self.lin2(z)

        return z.view(-1)

class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.encoder = GraphSAGEEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Model(hidden_channels=32).to(device)
print (model)

Model(
  (encoder): GraphModule(
    (conv1): ModuleDict(
      (user__rates__movie): SAGEConv((-1, -1), 32, aggr=mean)
      (movie__rev_rates__user): SAGEConv((-1, -1), 32, aggr=mean)
    )
    (conv2): ModuleDict(
      (user__rates__movie): SAGEConv((-1, -1), 32, aggr=mean)
      (movie__rev_rates__user): SAGEConv((-1, -1), 32, aggr=mean)
    )
  )
  (decoder): EdgeDecoder(
    (lin1): Linear(in_features=64, out_features=32, bias=True)
    (lin2): Linear(in_features=32, out_features=1, bias=True)
  )
)


In [150]:
train_data, val_data, test_data = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[('user', 'rates', 'movie')],
    rev_edge_types=[('movie', 'rev_rates', 'user')],
)(data)



tensor([[  95,  605,  589,  ...,  238,  152,  609],
        [2218,  948, 5314,  ...,  337, 8467, 9274]])

In [195]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

def train():
    model.train()
    optimizer.zero_grad()
    pred = model(train_data.x_dict, train_data.edge_index_dict,
                 train_data['user', 'movie'].edge_label_index)
    target = train_data['user', 'movie'].edge_label
    loss = F.mse_loss(pred, target)
    loss.backward()
    optimizer.step()
    return float(loss)

@torch.no_grad()
def test(data):
    data = data.to(device)
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict,
                 data['user', 'movie'].edge_label_index)
    pred = pred.clamp(min=0, max=5)
    target = data['user', 'movie'].edge_label.float()
    rmse = F.mse_loss(pred, target).sqrt()
    return float(rmse)


for epoch in range(1, 3):
        train_data = train_data.to(device)
        loss = train()
        train_rmse = test(train_data)
        val_rmse = test(val_data)
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_rmse:.4f}, '
            f'Val: {val_rmse:.4f}')

Epoch: 001, Loss: 1.1979, Train: 1.7262, Val: 1.7328
Epoch: 002, Loss: 2.9798, Train: 1.2308, Val: 1.2371


In [207]:
zz_dict['user'].size()
zz_dict['movie'].size()

torch.Size([9742, 32])

In [237]:
print (train_data.x_dict['user'].size())
print (train_data.x_dict['movie'].size())

train_data.x_dict


torch.Size([610, 610])
torch.Size([9742, 20])


{'user': tensor([[1., 0., 0.,  ..., 0., 0., 0.],
         [0., 1., 0.,  ..., 0., 0., 0.],
         [0., 0., 1.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 1., 0., 0.],
         [0., 0., 0.,  ..., 0., 1., 0.],
         [0., 0., 0.,  ..., 0., 0., 1.]]),
 'movie': tensor([[0., 0., 1.,  ..., 0., 0., 0.],
         [0., 0., 1.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 1., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]])}

In [241]:
print( train_data.edge_index_dict[('user', 'rates', 'movie')].size())
print( train_data.edge_index_dict[('movie', 'rev_rates', 'user')].size())

train_data.edge_index_dict

torch.Size([2, 80670])
torch.Size([2, 80670])


{('user',
  'rates',
  'movie'): tensor([[  95,  605,  589,  ...,  238,  152,  609],
         [2218,  948, 5314,  ...,  337, 8467, 9274]]),
 ('movie',
  'rev_rates',
  'user'): tensor([[2218,  948, 5314,  ...,  337, 8467, 9274],
         [  95,  605,  589,  ...,  238,  152,  609]])}

In [243]:
print(train_data['user', 'movie'].edge_label_index.size())
train_data['user', 'movie'].edge_label_index

torch.Size([2, 80670])


tensor([[  95,  605,  589,  ...,  238,  152,  609],
        [2218,  948, 5314,  ...,  337, 8467, 9274]])