In [None]:
%%pyspark

from pyspark.sql.types import * 
from graphframes import *

blob_account_name = "cjoakimstorage"
blob_container_name = "synapse"
from pyspark.sql import SparkSession

sc = SparkSession.builder.getOrCreate()
token_library = sc._jvm.com.microsoft.azure.synapse.tokenlibrary.TokenLibrary
blob_sas_token = token_library.getConnectionString("cjoakimstorageAzureBlobStorage")

vertices_csv_blob = 'wasbs://synapse@cjoakimstorage.blob.core.windows.net/graphframes/imdb_vertices.csv'
edges_csv_blob    = 'wasbs://synapse@cjoakimstorage.blob.core.windows.net/graphframes/imdb_edges.csv'

spark.conf.set(
    'fs.azure.sas.%s.%s.blob.core.windows.net' % (blob_container_name, blob_account_name),
    blob_sas_token)

v_fields = [
    StructField("id", StringType(), True),
    StructField("label", StringType(), True),
    StructField("name", StringType(), True),
    StructField("attributes", StringType(), True)
]

e_fields = [
    StructField("src", StringType(), True),
    StructField("dst", StringType(), True),
    StructField("relationship", StringType(), True),
    StructField("attributes", StringType(), True)
]

df_v = spark.read.load(
    vertices_csv_blob, 
    format='csv', 
    header=True, 
    delimiter='|',
    schema=StructType(v_fields))

df_e = spark.read.load(
    edges_csv_blob,
    format='csv',
    header=True,
    delimiter='|',
    schema=StructType(e_fields))

print('dv_v')
print(str(type(df_v)))  # <class 'pyspark.sql.dataframe.DataFrame'>
df_v.printSchema()
display(df_v.limit(10))


print('dv_e')
print(str(type(df_e)))
df_e.printSchema()
display(df_e.limit(10))


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

import pyspark.sql.functions as F 

def write_df_to_csv_blob(df, out_csv):
    """Write a DataFrame as CSV (with header) to Azure Blob Storage.

    The output goes beneath the 'graphframes' folder of the 'synapse'
    container in the 'cjoakimstorage' account, replacing any existing
    output of the same name (mode='overwrite').

    Args:
        df: the Spark DataFrame to write.
        out_csv: output name relative to the 'graphframes' folder. A leading
            '/' is accepted and ignored.
    """
    # See https://github.com/Azure-Samples/Synapse/tree/main/Notebooks/PySpark

    # Azure storage account info
    blob_account_name   = 'cjoakimstorage'
    blob_container_name = 'synapse'
    blob_relative_path  = 'graphframes'
    linked_service_name = 'cjoakimstorageAzureBlobStorage'

    # The value returned for the linked service is used as the SAS token.
    # Do not print it: notebook output is saved with the notebook.
    blob_sas_token = mssparkutils.credentials.getConnectionStringOrCreds(linked_service_name)

    # Allow Spark to access from Blob remotely
    wasbs_path = 'wasbs://%s@%s.blob.core.windows.net/%s' % (
        blob_container_name, blob_account_name, blob_relative_path)

    spark.conf.set('fs.azure.sas.%s.%s.blob.core.windows.net' % (
        blob_container_name, blob_account_name), blob_sas_token)

    # Join folder and output name with exactly one '/'. Plain concatenation
    # turned out_csv='degrees' into '.../graphframesdegrees', i.e. a sibling
    # of the folder rather than an entry inside it.
    csv_path = '{}/{}'.format(wasbs_path, out_csv.lstrip('/'))

    print('wasbs_path: ' + wasbs_path)
    print('csv_path:   ' + csv_path)

    # Write to blob storage, coalescing to one partition first so the output
    # is not split across several part files.
    df.coalesce(1).write.csv(csv_path, mode='overwrite', header='true')
    print('written')

def write_df_to_json_blob(df, out):
    """Write a DataFrame as JSON to Azure Blob Storage.

    The output goes beneath the 'graphframes' folder of the 'synapse'
    container in the 'cjoakimstorage' account, replacing any existing
    output of the same name (mode='overwrite').

    Args:
        df: the Spark DataFrame to write.
        out: output name relative to the 'graphframes' folder. A leading
            '/' is accepted and ignored.
    """
    # See https://github.com/Azure-Samples/Synapse/tree/main/Notebooks/PySpark

    # Azure storage account info
    blob_account_name   = 'cjoakimstorage'
    blob_container_name = 'synapse'
    blob_relative_path  = 'graphframes'
    linked_service_name = 'cjoakimstorageAzureBlobStorage'

    # The value returned for the linked service is used as the SAS token.
    # Do not print it: notebook output is saved with the notebook.
    blob_sas_token = mssparkutils.credentials.getConnectionStringOrCreds(linked_service_name)

    # Allow Spark to access from Blob remotely
    wasbs_path = 'wasbs://%s@%s.blob.core.windows.net/%s' % (
        blob_container_name, blob_account_name, blob_relative_path)

    spark.conf.set('fs.azure.sas.%s.%s.blob.core.windows.net' % (
        blob_container_name, blob_account_name), blob_sas_token)

    # Join folder and output name with exactly one '/'. Plain concatenation
    # turned out='paths' into '.../graphframespaths', i.e. a sibling of the
    # folder rather than an entry inside it.
    out_path = '{}/{}'.format(wasbs_path, out.lstrip('/'))

    print('wasbs_path: ' + wasbs_path)
    print('out_path:   ' + out_path)

    # Write to blob storage, coalescing to one partition first so the output
    # is not split across several part files.
    df.coalesce(1).write.json(out_path, mode='overwrite')
    print('written')


In [None]:
%%pyspark

# https://docs.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-development-using-notebooks

df_v.createOrReplaceTempView('dfv')
print('df_v cached')

df_e.createOrReplaceTempView('dfe')
print('df_e cached')


In [None]:
%%sql
SELECT *
FROM dfv
LIMIT 3


In [None]:
%%sql
SELECT *
FROM dfe
LIMIT 3

In [None]:
%%spark

// Pull the temp views 'dfv' and 'dfe' (registered by the PySpark cell above)
// into Scala DataFrames, then print the schema of each in turn.
val df_v = spark.sql("select * from dfv")
val df_e = spark.sql("select * from dfe")

Seq(df_v, df_e).foreach(_.printSchema())


In [None]:
%%pyspark 

# Create the GraphFrame, g, from the Vertices DataFrame and Edges DataFrame

from graphframes import *

g = GraphFrame(df_v, df_e)
print('done')


In [None]:
# inDegrees: render g.inDegrees with the notebook's display() helper.
# GraphFrames documents inDegrees as a DataFrame holding, for each vertex id,
# the number of edges that point at it.

display(g.inDegrees)

In [None]:
# Breadth-first search for paths from Kevin Bacon (nm0000102)
# to Charlotte Rampling (nm0001648), then print the result.

paths = g.bfs(
    fromExpr="id = 'nm0000102'",
    toExpr="id = 'nm0001648'",
)
paths.show()


In [None]:
# Build one row per vertex id with its total, inbound and outbound degree,
# sorted by inbound degree (highest first), then save it as CSV.

total_degree = g.degrees
in_degree = g.inDegrees
out_degree = g.outDegrees

# Left joins keep every vertex that appears in total_degree; a vertex missing
# from the in- or out-degree frame gets nulls, which fillna(0) turns into 0.
result = total_degree.join(in_degree, on="id", how="left")
result = result.join(out_degree, on="id", how="left")
result = result.fillna(0).sort("inDegree", ascending=False)

print(type(result))  # <class 'pyspark.sql.dataframe.DataFrame'>
result.show()

write_df_to_csv_blob(result, 'degrees')


In [None]:
# Breadth-first search for paths from Kevin Bacon (nm0000102)
# to Charlotte Rampling (nm0001648); print the result and save it to blob storage.

result = g.bfs(
    fromExpr="id = 'nm0000102'",
    toExpr="id = 'nm0001648'",
)
result.show()

print(type(result))  # <class 'pyspark.sql.dataframe.DataFrame'>

# Saved as JSON rather than CSV. Error recorded when writing this result as CSV:
# CSV data source does not support struct<id:string,label:string,name:string,attributes:string> data type.
write_df_to_json_blob(result, 'paths_bacon_rampling')


In [None]:
# Breadth First Search (bfs) for paths from Lori Singer (nm0001742)
# to Charlotte Rampling (nm0001648), then print the result.

paths = g.bfs(
    fromExpr="id = 'nm0001742'",
    toExpr="id = 'nm0001648'",
)
paths.show()

In [None]:
# Compute shortest-path distances to two landmark vertices:
# Kevin Bacon (nm0000102) and Charlotte Rampling (nm0001648).
# Per the GraphFrames documentation, shortestPaths returns the vertices with
# an added "distances" map column (landmark id -> number of hops); it does
# not return the paths themselves.

paths = g.shortestPaths(landmarks=['nm0000102', 'nm0001648'])
paths.show()

# Saved as JSON rather than CSV: the CSV writer rejects the map column
# ("CSV data source does not support map<string,int> data type"), so the
# previous write_df_to_csv_blob call failed. This mirrors how the BFS result
# is saved in the cell above.
write_df_to_json_blob(paths, 'shortest_paths')
