In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import GroupShuffleSplit # used for splitting data 
from torch_geometric.data import Data


In [2]:
# Directory holding the processed NowPlaying parquet files.
# The location can be overridden with the NOWPLAYING_DATA_DIR environment
# variable so the notebook also runs on machines other than the one it was
# written on; when the variable is unset, the original hard-coded path is used.
data_dir = os.path.normpath(
    os.environ.get(
        'NOWPLAYING_DATA_DIR',
        r'D:\Projects\cs224-multimodal-recommender-system\processed_data\nowplaying',
    )
)

# Training split, read from session_candidates_train.parquet.
train = pd.read_parquet(
    os.path.join(data_dir, 'session_candidates_train.parquet')
)

# Test split, read from session_candidates_test.parquet.
test = pd.read_parquet(
    os.path.join(data_dir, 'session_candidates_test.parquet')
)

# Per-track feature table (audio descriptors, lyrics and lyric embeddings,
# judging by the column names shown when the frame is previewed).
features = pd.read_parquet(
    os.path.join(data_dir, 'Lyrics_HSP-L_Nowplay_Data.parquet')
)


In [7]:
# Collapse the feature table to a single row per (Artist, Title) pair; when a
# pair occurs more than once the first occurrence is the one retained
# (drop_duplicates keeps the first row by default).
track_features = features.drop_duplicates(['Artist', 'Title'])

# Two narrow views of the training frame, both keyed by 'user':
#   * the user's past interactions
#   * the positive song id recorded on the same row
train_user_listening_history, train_user_item_interactions = (
    train.loc[:, ['user', value_column]]
    for value_column in ('past_interactions', 'positive_song_id')
)

In [8]:
# Preview the first rows of the de-duplicated track feature table.
track_features.head()

Unnamed: 0,user id,source of the tweet,timestamp,track title,artist name,musicbrainz identifier of the recording,highlevel.danceability.value,highlevel.genre_dortmund.value,highlevel.ismir04_rhythm.value,highlevel.mood_acoustic.value,...,lowlevel.melbands.min,lowlevel.melbands.var,lowlevel.mfcc.mean,rhythm.bpm,tonal.key_key,tonal.key_scale,Title,Artist,lyrics,lyrics_embedding
0,59ad8591eaec6e8ee11f23fc198c817940c015f5,SoundTracking,2014-10-24 22:02:29,Pepper,Butthole Surfers,576f9c49-a0a1-4a67-bd40-a150cab60fcc,danceable,electronic,ChaChaCha,not_acoustic,...,"[7.37167954507e-24, 1.03600976465e-23, 7.19945...","[3.43737010553e-05, 1.07946398202e-05, 3.28441...","[-675.406982422, 123.255058289, -6.49030923843...",161.605255,C,major,Pepper,Butthole Surfers,[Verse 1]\nMarky got with Sharon\nSharon got C...,"[-0.016555475, 0.016257573, 0.03246234, -0.065..."
1,cd513cbff5104a006cdbe12b7b8f10d81f526d2b,Polly,2014-04-28 23:12:46,Sour Times,Portishead,e796ca49-037f-499b-a5a4-f19940aa20df,danceable,alternative,VienneseWaltz,not_acoustic,...,"[1.21528061417e-23, 1.80488945423e-23, 8.07589...","[1.30592898131e-05, 1.01659006759e-05, 5.22284...","[-643.84161377, 117.44203949, -4.67587757111, ...",163.533264,C#,major,Sour Times,Portishead,[Verse 1]\nTo pretend no one can find\nThe fal...,"[-0.043865778, 0.07975947, 0.031227203, -0.009..."
2,38fea4967e7dab5a07626bc5d0cf1e934cf2c851,Securenet Systems Radio Playlist Update,2014-09-14 13:36:44,Again,Lenny Kravitz,942c1e62-a7d3-4340-833c-65430fc13c75,not_danceable,blues,VienneseWaltz,not_acoustic,...,"[3.298852617e-24, 5.92850900607e-24, 4.5505188...","[3.89024462493e-05, 2.10120324482e-05, 2.76269...","[-637.288818359, 116.868415833, 1.32667195797,...",156.19574,A,major,Again,Lenny Kravitz,[Verse 1]\nI've been searching for you\nI hear...,"[0.040212587, 0.07172622, -0.022398397, -0.148..."
3,c8a7e0758ce8f57fb16e7f40dfe2bd0db95bd98a,Marci,2014-12-07 20:00:28,Hey Joe,Jimi Hendrix,7bf8af79-f26c-4d8a-95a8-6e2911e03665,not_danceable,rock,VienneseWaltz,not_acoustic,...,"[1.51580080478e-23, 4.68376104498e-24, 7.81818...","[1.57910926646e-06, 3.12275483338e-06, 2.62098...","[-596.793945312, 113.418014526, -15.2721614838...",151.390167,D#,minor,Hey Joe,Jimi Hendrix,"[Instrumental Intro]\n\n[Verse 1]\nHey Joe, wh...","[-0.009908957, -0.046707917, 0.058077063, 0.00..."
4,5155f14ee1a107756039dfed62ca98e89807c462,Securenet Systems Radio Playlist Update,2014-04-28 23:13:31,Change,Blind Melon,eeb6fe28-b13b-4673-bf2b-c7b4a16e785a,not_danceable,blues,ChaChaCha,not_acoustic,...,"[9.38992100651e-24, 7.64752290438e-24, 9.25747...","[3.02472329849e-07, 6.8389513217e-06, 1.315784...","[-637.808532715, 120.438911438, -8.66980934143...",156.063263,D,major,Change,Blind Melon,"[Verse 1]\nI don't feel the sun's coming out, ...","[0.0032072496, 0.011662157, 0.044229183, -0.10..."


In [None]:
# Independent copy of the (user, positive_song_id) pairs so later edits do not
# touch the frame sliced from `train`.
user_item_interactions = train_user_item_interactions.copy()

# Keep only the feature rows for songs that actually occur as a positive item.
# The previous filter looked up the column '' (empty name), which does not
# exist and raises KeyError.
# NOTE(review): this assumes `positive_song_id` holds MusicBrainz recording ids
# (the previewed values look like them) -- confirm against the data pipeline.
item_features = track_features.loc[
    track_features['musicbrainz identifier of the recording'].isin(
        user_item_interactions['positive_song_id']
    )
]

In [9]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,user,past_interactions,candidates,positive_song_id
0,1001b22c8e4adeb77ef10481ad06ff9c35006cb3,"[718af634-a516-4d4f-92a6-1a7c791497cb, 54a3657...","[f54d72e0-e3d9-4634-8b84-0ebb323bcc54, ad5617e...",ad5617ec-c400-4dbb-8ee5-0fa992ffaf19
1,1001b22c8e4adeb77ef10481ad06ff9c35006cb3,"[54a36572-a2f7-4067-8fcf-302c2be84f51, 54a3657...","[5ac073ca-d58b-4afd-8faa-2b8c1313c1e1, b2b6c3a...",b2b6c3a5-999b-4d90-adb5-2218312a2b8c
2,1001b22c8e4adeb77ef10481ad06ff9c35006cb3,"[54a36572-a2f7-4067-8fcf-302c2be84f51, 54a3657...","[9c0a023a-d4c3-459a-92fa-afea7f7fee26, 39b50a7...",54a36572-a2f7-4067-8fcf-302c2be84f51
3,1001b22c8e4adeb77ef10481ad06ff9c35006cb3,"[54a36572-a2f7-4067-8fcf-302c2be84f51, 54a3657...","[5dd65d1d-6f0f-41ba-977e-b1e49e764e34, c34658c...",718af634-a516-4d4f-92a6-1a7c791497cb
4,1001b22c8e4adeb77ef10481ad06ff9c35006cb3,"[54a36572-a2f7-4067-8fcf-302c2be84f51, b2b6c3a...","[6c78f7be-38ec-4ed5-8d19-6af2bf318fa6, 6f97d58...",744fa32a-211e-4627-96eb-1597337c446c


In [6]:
# Preview the first (user, positive_song_id) rows taken from the training frame.
train_user_item_interactions.head()

Unnamed: 0,user,positive_song_id
0,1001b22c8e4adeb77ef10481ad06ff9c35006cb3,ad5617ec-c400-4dbb-8ee5-0fa992ffaf19
1,1001b22c8e4adeb77ef10481ad06ff9c35006cb3,b2b6c3a5-999b-4d90-adb5-2218312a2b8c
2,1001b22c8e4adeb77ef10481ad06ff9c35006cb3,54a36572-a2f7-4067-8fcf-302c2be84f51
3,1001b22c8e4adeb77ef10481ad06ff9c35006cb3,718af634-a516-4d4f-92a6-1a7c791497cb
4,1001b22c8e4adeb77ef10481ad06ff9c35006cb3,744fa32a-211e-4627-96eb-1597337c446c


In [7]:
# (user, positive_song_id) pairs for each split. Despite the `edge_index`
# names, these are still pandas DataFrames holding the raw identifiers, not
# index tensors.
train_edge_index, test_edge_index = (
    split.loc[:, ['user', 'positive_song_id']] for split in (train, test)
)

## Descriptive statistics

In [15]:
# Distinct songs that appear as the positive item in each split.
# The split frames expose `positive_song_id` but no 'Artist' / 'Title'
# columns, so the previous `train['Artist'] + train['Title']` lookup raised
# KeyError; joining the two strings without a separator was also ambiguous.
# NOTE(review): only positive items are compared here -- extend to
# `candidates` / `past_interactions` if the overlap should cover those too.
train_songs = set(train['positive_song_id'])
test_songs = set(test['positive_song_id'])

# Songs that occur as a positive item in both the training and the test split.
songs_in_both = train_songs & test_songs


In [18]:
import torch
import torch_geometric


0