In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import GroupShuffleSplit # used for splitting data 
from torch_geometric.data import Data


In [2]:
# Directory containing the processed "nowplaying" parquet files.
# Set the NOWPLAYING_DATA_DIR environment variable to run the notebook on
# another machine; the original local path is kept as the fallback so the
# default behaviour is unchanged.
data_dir = os.path.normpath(
    os.environ.get(
        'NOWPLAYING_DATA_DIR',
        r'D:\Projects\cs224-multimodal-recommender-system\processed_data\nowplaying',
    )
)

# Session-level train / test tables (later cells read their `user`,
# `past_interactions`, `positive_song_id` and `candidates` columns).
train = pd.read_parquet(
    os.path.join(data_dir, 'session_candidates_train.parquet')
)

test = pd.read_parquet(
    os.path.join(data_dir, 'session_candidates_test.parquet')
)

# Track-level table; later cells read its `Artist`, `Title` and
# `lyrics_embedding` columns.
features = pd.read_parquet(
    os.path.join(data_dir, 'Lyrics_HSP-L_Nowplay_Data.parquet')
)

# (Artist, Title) -> track_id lookup, indexed by the pair so it can be joined
# on those two keys.
track_map = pd.read_parquet(
    os.path.join(data_dir, 'track_id_map.parquet')
).set_index(['Artist', 'Title'])


In [22]:
def _show_head(title, frame):
    """Print ``title`` above a 79-dash rule, then render the first rows of ``frame``."""
    print(title)
    print('-' * 79)
    display(frame.head())


# Keep the first row for every (Artist, Title) pair and drop the listed
# metadata columns.
unique_tracks = features.drop_duplicates(subset=['Artist', 'Title'], keep='first')
unique_tracks = unique_tracks.drop(
    columns=['user id', 'source of the tweet', 'track title', 'artist name']
)
print('Initial number of tracks:', unique_tracks.shape[0])

# Inner join: tracks without an entry in track_map are dropped here.
tracks_with_id = unique_tracks.merge(track_map, on=['Artist', 'Title'], how='inner')
# Expand every lyrics_embedding entry into one column per element, indexed by track_id.
track_features = tracks_with_id.set_index('track_id')['lyrics_embedding'].apply(pd.Series)

print('Number of tracks post-inner join:', track_features.shape[0])
_show_head('Track features', track_features)

# --- Train split ------------------------------------------------------------
train_user_listening_history = train.loc[:, ['user', 'past_interactions']]
_show_head('Train: User listening history', train_user_listening_history)

train_user_item_interactions = train.loc[:, ['user', 'positive_song_id']]
_show_head('Train: User-item interactions', train_user_item_interactions)

# One row per (user, candidate song): the candidates list is exploded.
train_candidate_lists = train.loc[:, ['user', 'candidates']]
train_user_item_all_interactions = (
    train_candidate_lists
    .explode('candidates')
    .rename(columns={'candidates': 'song_id'})
)
_show_head('Train: User-item negative + positive interactions', train_user_item_all_interactions)

# --- Test split -------------------------------------------------------------
test_user_listening_history = test.loc[:, ['user', 'past_interactions']]
_show_head('Test: User listening history', test_user_listening_history)

test_user_item_interactions = test.loc[:, ['user', 'positive_song_id']]
_show_head('Test: User-item interactions', test_user_item_interactions)

# One row per (user, candidate song): the candidates list is exploded.
test_candidate_lists = test.loc[:, ['user', 'candidates']]
test_user_item_all_interactions = (
    test_candidate_lists
    .explode('candidates')
    .rename(columns={'candidates': 'song_id'})
)
_show_head('Test: User-item negative + positive interactions', test_user_item_all_interactions)


Initial number of tracks: 2471
Number of tracks post-inner join: 2155
Track features
-------------------------------------------------------------------------------


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.016555,0.016258,0.032462,-0.065711,0.035892,-0.030516,0.04403,0.046941,0.031009,-0.051097,...,0.137975,-0.032969,-0.178082,0.032461,0.081872,0.018873,0.134802,-0.184255,-0.02097,-0.009385
2,-0.043866,0.079759,0.031227,-0.009023,0.198229,-0.117961,0.023687,0.126027,0.002055,-0.055694,...,0.082887,-0.094865,-0.103203,0.029579,0.168657,0.054824,0.156257,0.036643,-0.045506,-0.051694
3,0.040213,0.071726,-0.022398,-0.148623,0.250008,-0.033887,0.026806,0.242937,0.030847,0.016596,...,0.093019,-0.006108,-0.10845,0.160098,0.071841,0.118777,0.233991,0.042564,-0.028533,-0.093544
4,-0.009909,-0.046708,0.058077,0.009227,0.064153,-0.003633,0.027486,0.102278,0.117365,-0.024824,...,0.10089,-0.193991,-0.133274,0.1081,0.134432,0.114251,0.241544,-0.116317,-0.074772,-0.075406
5,0.003207,0.011662,0.044229,-0.107282,0.164013,-0.027828,0.0179,0.089839,0.104875,-0.010808,...,0.083276,-0.045836,-0.046923,0.081197,0.055439,0.086655,0.196141,-0.123199,-0.025011,0.013837


Train: User listening history
-------------------------------------------------------------------------------


Unnamed: 0,user,past_interactions
0,1001b22c8e4adeb77ef10481ad06ff9c35006cb3,"[96, 46, 46, 46, 46, 232, 325, 232, 125, 325]"
1,1001b22c8e4adeb77ef10481ad06ff9c35006cb3,"[46, 46, 46, 46, 232, 325, 232, 125, 325, 125]"
2,1001b22c8e4adeb77ef10481ad06ff9c35006cb3,"[46, 46, 46, 232, 325, 232, 125, 325, 125, 232]"
3,1001b22c8e4adeb77ef10481ad06ff9c35006cb3,"[46, 46, 232, 325, 232, 125, 325, 125, 232, 46]"
4,1001b22c8e4adeb77ef10481ad06ff9c35006cb3,"[46, 232, 325, 232, 125, 325, 125, 232, 46, 96]"


Train: User-item interactions
-------------------------------------------------------------------------------


Unnamed: 0,user,positive_song_id
0,1001b22c8e4adeb77ef10481ad06ff9c35006cb3,125
1,1001b22c8e4adeb77ef10481ad06ff9c35006cb3,232
2,1001b22c8e4adeb77ef10481ad06ff9c35006cb3,46
3,1001b22c8e4adeb77ef10481ad06ff9c35006cb3,96
4,1001b22c8e4adeb77ef10481ad06ff9c35006cb3,325


Train: User-item negative + positive interactions
-------------------------------------------------------------------------------


Unnamed: 0,user,song_id
0,1001b22c8e4adeb77ef10481ad06ff9c35006cb3,278
0,1001b22c8e4adeb77ef10481ad06ff9c35006cb3,1690
0,1001b22c8e4adeb77ef10481ad06ff9c35006cb3,366
0,1001b22c8e4adeb77ef10481ad06ff9c35006cb3,1984
0,1001b22c8e4adeb77ef10481ad06ff9c35006cb3,795


Test: User listening history
-------------------------------------------------------------------------------


Unnamed: 0,user,past_interactions
0,103aaf06662cc01fe8b1ddaa39e41f59a8332832,"[595, 84, 152, 614, 688, 48, 34, 191, 1362, 171]"
1,103aaf06662cc01fe8b1ddaa39e41f59a8332832,"[84, 152, 614, 688, 48, 34, 191, 1362, 171, 111]"
2,103aaf06662cc01fe8b1ddaa39e41f59a8332832,"[152, 614, 688, 48, 34, 191, 1362, 171, 111, 196]"
3,103aaf06662cc01fe8b1ddaa39e41f59a8332832,"[614, 688, 48, 34, 191, 1362, 171, 111, 196, 152]"
4,103aaf06662cc01fe8b1ddaa39e41f59a8332832,"[688, 48, 34, 191, 1362, 171, 111, 196, 152, 643]"


Test: User-item interactions
-------------------------------------------------------------------------------


Unnamed: 0,user,positive_song_id
0,103aaf06662cc01fe8b1ddaa39e41f59a8332832,111
1,103aaf06662cc01fe8b1ddaa39e41f59a8332832,196
2,103aaf06662cc01fe8b1ddaa39e41f59a8332832,152
3,103aaf06662cc01fe8b1ddaa39e41f59a8332832,643
4,103aaf06662cc01fe8b1ddaa39e41f59a8332832,144


Test: User-item negative + positive interactions
-------------------------------------------------------------------------------


Unnamed: 0,user,song_id
0,103aaf06662cc01fe8b1ddaa39e41f59a8332832,427
0,103aaf06662cc01fe8b1ddaa39e41f59a8332832,46
0,103aaf06662cc01fe8b1ddaa39e41f59a8332832,995
0,103aaf06662cc01fe8b1ddaa39e41f59a8332832,1350
0,103aaf06662cc01fe8b1ddaa39e41f59a8332832,1548


In [63]:
import torch

def prepare_graph(user_item_interactions, user_item_all_interactions , item_features):
    """Build a bipartite user-item graph as a ``Data`` object.

    Node layout: users occupy indices ``0 .. num_users - 1`` (numbered in
    first-seen order, so the numbering is reproducible across runs) and items
    occupy ``num_users .. num_users + num_items - 1`` in the row order of
    ``item_features``. Item rows of ``x`` hold the item's feature vector; user
    rows are left as zeros.

    Parameters
    ----------
    user_item_interactions : sequence of (user_id, item_id) pairs
        Positive interactions. Stored as directed user -> item pairs in
        ``data.pos_edge_index``; pairs whose item has no features are
        reported and skipped.
    user_item_all_interactions : sequence of (user_id, item_id) pairs
        Interactions that become graph edges (``data.edge_index``). Each pair
        is added in both directions; pairs whose item has no features are
        skipped and summarised in one printed line.
    item_features : pandas.DataFrame
        One row per item, indexed by item id; all columns must be numeric.

    Returns
    -------
    Data
        With attributes ``x`` (float, ``num_nodes x num_features``),
        ``edge_index`` and ``pos_edge_index`` (long, ``2 x E``), and the
        boolean node masks ``user_mask`` / ``item_mask``.

    Side effects: prints the number of users and items, and messages about
    skipped interactions.
    """
    # Assign user indices in first-seen order. Users that only occur in the
    # positive interactions are registered too, so the positive-edge loop
    # below can never hit an unknown user.
    user_id_map = {}
    for pairs in (user_item_all_interactions, user_item_interactions):
        for uid, _ in pairs:
            user_id_map.setdefault(uid, len(user_id_map))
    num_users = len(user_id_map)

    # Items are numbered after the users, following the feature table's row order.
    item_ids = item_features.index
    item_id_map = {iid: num_users + pos for pos, iid in enumerate(item_ids)}
    print('Number of users: ', num_users)
    num_items = len(item_ids)
    print('Number of items: ', num_items)

    # Node feature matrix: zeros for users, the feature rows for items
    # (copied in one block rather than row by row).
    num_features = item_features.shape[1]
    num_nodes = num_users + num_items
    x = torch.zeros(num_nodes, num_features)
    if num_items:
        x[num_users:] = torch.tensor(item_features.to_numpy(dtype='float32'))

    edges = []
    missing_items = set()
    for uid, iid in user_item_all_interactions:
        i_idx = item_id_map.get(iid)
        if i_idx is None:
            # Same policy as for positive edges: an item without features
            # has no node, so the interaction cannot become an edge.
            missing_items.add(iid)
            continue
        u_idx = user_id_map[uid]
        # add undirected edges
        # TODO: Undirected edges is a major caveat of this architecture
        #  However it is necessary for us to consider this style of architecture because
        #  otherwise the maximum depth of the GNN is 1-hop (user -> item x ) [item has no outgoing nodes]
        # Add edges in both directions
        edges.append((u_idx, i_idx))
        edges.append((i_idx, u_idx))
    if missing_items:
        print(f'{len(missing_items)} item id(s) in user_item_all_interactions not in item_id_map; edges skipped')
    # reshape(-1, 2) keeps the result 2 x E (long) even when there are no edges.
    edge_index = torch.tensor(edges, dtype=torch.long).reshape(-1, 2).t().contiguous()

    # Masks for users and items
    user_mask = torch.zeros(num_nodes, dtype=torch.bool)
    user_mask[:num_users] = True
    item_mask = torch.zeros(num_nodes, dtype=torch.bool)
    item_mask[num_users:] = True

    # Create data object
    data = Data(x=x, edge_index=edge_index)
    data.user_mask = user_mask
    data.item_mask = item_mask

    # positive edge index (directed user -> item)
    pos_edges = []
    for uid, iid in user_item_interactions:
        if iid in item_id_map:
            pos_edges.append((user_id_map[uid], item_id_map[iid]))
        else:
            print(f'{iid} not in item_id_map')
    data.pos_edge_index = torch.tensor(pos_edges, dtype=torch.long).reshape(-1, 2).t().contiguous()
    return data

# Training graph: every exploded candidate (negatives + the positive) becomes
# a graph edge, while only the positive song of each row goes into
# pos_edge_index.
train_data = prepare_graph(
    user_item_interactions = train_user_item_interactions.to_numpy(),
    user_item_all_interactions = train_user_item_all_interactions.to_numpy(),
    item_features = track_features
)

# Test graph: the two interaction frames must be passed in the same roles as
# for training (positives -> user_item_interactions, exploded candidates ->
# user_item_all_interactions). They were previously swapped, which put the
# candidates into pos_edge_index and built the graph edges from positives only.
test_data = prepare_graph(
    user_item_interactions = test_user_item_interactions.to_numpy(),
    user_item_all_interactions = test_user_item_all_interactions.to_numpy(),
    item_features = track_features
)


Number of users:  453
Number of items:  2155
Number of users:  114
Number of items:  2155


In [66]:
# torch.save(train_data,os.path.join(data_dir, 'train_graph.pt'))
# torch.save(test_data, os.path.join(data_dir, 'test_graph.pt'))

## Descriptive statistics

In [None]:
# Each song key is the Artist string followed directly by the Title string
# (no separator); the sets hold the distinct keys of each split.
train_songs = {song_key for song_key in train['Artist'] + train['Title']}
test_songs = {song_key for song_key in test['Artist'] + test['Title']}

# Keys present in both the train and the test split.
songs_in_both = train_songs & test_songs


In [None]:
import torch
import torch_geometric
