In [1]:
import os
import sys

import findspark
import pandas as pd

# findspark.init() adds the Spark installation to sys.path, so it has to run
# before the first pyspark import for it to have any effect on that import.
findspark.init()

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from graphframes import GraphFrame

In [48]:
# All raw tables live side by side in ../data as tab-separated .dat files.
DATA_DIR = os.path.join('..', 'data')


def _read_dat(filename, **read_csv_kwargs):
    """Read one tab-separated file from ``DATA_DIR`` into a pandas DataFrame.

    Parameters
    ----------
    filename : str
        Name of the file inside ``DATA_DIR``.
    **read_csv_kwargs
        Extra keyword arguments forwarded unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    return pd.read_csv(os.path.join(DATA_DIR, filename), delimiter='\t', **read_csv_kwargs)


artists = _read_dat('artists.dat')
# tags.dat is the only file that is read with an explicit encoding.
tags = _read_dat('tags.dat', encoding='ISO-8859-1')
user_artists = _read_dat('user_artists.dat')
user_friends = _read_dat('user_friends.dat')
user_taggedartists_timestamps = _read_dat('user_taggedartists-timestamps.dat')
user_taggedartists = _read_dat('user_taggedartists.dat')

In [49]:
def _with_prefix(ids, prefix):
    """Return ``ids`` as strings that start with ``prefix`` exactly once.

    Values are converted with ``astype(str)``. A value that already starts
    with ``prefix`` is left untouched, so applying this function repeatedly
    gives the same result as applying it once.

    Parameters
    ----------
    ids : pandas.Series
        Column of identifiers.
    prefix : str
        Entity-type marker to put in front of every identifier.

    Returns
    -------
    pandas.Series
        String identifiers carrying ``prefix``.
    """
    as_text = ids.astype(str)
    # Series.where keeps values where the mask is True and substitutes the
    # prefixed form everywhere else.
    return as_text.where(as_text.str.startswith(prefix), prefix + as_text)


# Prefix every ID with its entity type so users, artists and tags can share a
# single `id` namespace. The columns are overwritten in place, so this cell
# must be safe to run more than once: a plain `'user' + ...` would produce
# 'useruser2' on a second run.
user_artists['userID'] = _with_prefix(user_artists['userID'], 'user')
user_artists['artistID'] = _with_prefix(user_artists['artistID'], 'artist')
user_friends['userID'] = _with_prefix(user_friends['userID'], 'user')
user_friends['friendID'] = _with_prefix(user_friends['friendID'], 'user')
user_taggedartists['artistID'] = _with_prefix(user_taggedartists['artistID'], 'artist')
user_taggedartists['tagID'] = _with_prefix(user_taggedartists['tagID'], 'tag')
user_taggedartists['userID'] = _with_prefix(user_taggedartists['userID'], 'user')
artists['id'] = _with_prefix(artists['id'], 'artist')
tags['tagID'] = _with_prefix(tags['tagID'], 'tag')

In [3]:
# Point both the Spark driver and the workers at the interpreter that is
# running this notebook. A hard-coded absolute path only works on one machine;
# sys.executable works anywhere and keeps both sides on the same Python.
python_path = sys.executable
os.environ['PYSPARK_PYTHON'] = python_path
os.environ['PYSPARK_DRIVER_PYTHON'] = python_path

# getOrCreate() returns the already-running session if there is one, so
# re-running this cell does not start a second session.
spark = (
    SparkSession.builder
    .appName("GNN")
    .getOrCreate()
)

In [78]:
# ---- Vertices ---------------------------------------------------------------
# One `id` row per distinct value. User vertices are taken from user_artists,
# artist vertices from artists and tag vertices from tags.
users_vertices = pd.DataFrame({'id': user_artists['userID'].unique()})
artists_vertices = pd.DataFrame({'id': artists['id'].unique()})
tags_vertices = pd.DataFrame({'id': tags['tagID'].unique()})

vertices = pd.concat([users_vertices, artists_vertices, tags_vertices])

# ---- Edges ------------------------------------------------------------------
# Each source table is renamed to src/dst and labelled with a relation `type`.

# user -> artist; the `weight` column is discarded.
user_artist_edges = (
    user_artists
    .drop(columns=['weight'])
    .rename(columns={'userID': 'src', 'artistID': 'dst'})
    .assign(type='listens')
)

# user -> tag, taken from the tagging table without the artist and date columns.
user_tag_edges = (
    user_taggedartists
    .rename(columns={'userID': 'src', 'tagID': 'dst'})
    .drop(columns=['day', 'month', 'year', 'artistID'])
    .assign(type='tag_used')
)

# artist -> tag, taken from the same tagging table without the user and date columns.
artist_tag_edges = (
    user_taggedartists
    .rename(columns={'artistID': 'src', 'tagID': 'dst'})
    .drop(columns=['day', 'month', 'year', 'userID'])
    .assign(type='tagged_as')
)

# user -> user friendship links.
user_user_edges = (
    user_friends
    .rename(columns={'userID': 'src', 'friendID': 'dst'})
    .assign(type='friend')
)

edges = pd.concat([user_artist_edges, user_tag_edges, user_user_edges, artist_tag_edges])

In [83]:
# Report how many rows the vertex and edge tables contain.
vertex_count = vertices.shape[0]
print(f'There are {vertex_count} vertices.')
edge_count = edges.shape[0]
print(f'There are {edge_count} edges.')

There are 31470 vertices.
There are 491226 edges.


In [73]:
# Convert the pandas tables to Spark DataFrames. `vertices` and `edges` are
# rebound to the Spark versions, so on a second run of this cell they are no
# longer pandas objects; the isinstance guards skip the conversion in that
# case instead of handing a Spark DataFrame back to createDataFrame.
if isinstance(vertices, pd.DataFrame):
    vertices = spark.createDataFrame(vertices)
if isinstance(edges, pd.DataFrame):
    edges = spark.createDataFrame(edges)

# Create the graph
graph = GraphFrame(vertices, edges)



In [87]:
# Preview the first 10 rows of the graph's edge table, then of its vertex table.
for preview_frame in (graph.edges, graph.vertices):
    preview_frame.show(10)

+-----+--------+-------+
|  src|     dst|   type|
+-----+--------+-------+
|user2|artist51|listens|
|user2|artist52|listens|
|user2|artist53|listens|
|user2|artist54|listens|
|user2|artist55|listens|
|user2|artist56|listens|
|user2|artist57|listens|
|user2|artist58|listens|
|user2|artist59|listens|
|user2|artist60|listens|
+-----+--------+-------+
only showing top 10 rows

+------+
|    id|
+------+
| user2|
| user3|
| user4|
| user5|
| user6|
| user7|
| user8|
| user9|
|user10|
|user11|
+------+
only showing top 10 rows

