In [42]:
%%pyspark

from pyspark.sql.types import * 
from graphframes import *

blob_account_name = "cjoakimstorage"
blob_container_name = "synapse"
from pyspark.sql import SparkSession

sc = SparkSession.builder.getOrCreate()
token_library = sc._jvm.com.microsoft.azure.synapse.tokenlibrary.TokenLibrary
blob_sas_token = token_library.getConnectionString("cjoakimstorageAzureBlobStorage")

vertices_csv_blob = 'wasbs://synapse@cjoakimstorage.blob.core.windows.net/graphframes/imdb_vertices.csv'
edges_csv_blob    = 'wasbs://synapse@cjoakimstorage.blob.core.windows.net/graphframes/imdb_edges.csv'

spark.conf.set(
    'fs.azure.sas.%s.%s.blob.core.windows.net' % (blob_container_name, blob_account_name),
    blob_sas_token)

v_fields = [
    StructField("id", StringType(), True),
    StructField("label", StringType(), True),
    StructField("name", StringType(), True),
    StructField("attributes", StringType(), True)
]

e_fields = [
    StructField("src", StringType(), True),
    StructField("dst", StringType(), True),
    StructField("relationship", StringType(), True),
    StructField("attributes", StringType(), True)
]

df_v = spark.read.load(
    vertices_csv_blob, 
    format='csv', 
    header=True, 
    delimiter='|',
    schema=StructType(v_fields))

df_e = spark.read.load(
    edges_csv_blob,
    format='csv',
    header=True,
    delimiter='|',
    schema=StructType(e_fields))

print('dv_v')
print(str(type(df_v)))  # <class 'pyspark.sql.dataframe.DataFrame'>
df_v.printSchema()
display(df_v.limit(10))


print('dv_e')
print(str(type(df_e)))
df_e.printSchema()
display(df_e.limit(10))


StatementMeta(poolspark3s, 31, 1, Finished, Available)

dv_v
<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- id: string (nullable = true)
 |-- label: string (nullable = true)
 |-- name: string (nullable = true)
 |-- attributes: string (nullable = true)


dv_e
<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- src: string (nullable = true)
 |-- dst: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- attributes: string (nullable = true)



SynapseWidget(Synapse.DataFrame, b7277cea-0be8-4fc1-855d-eaf3960d6c0a)

SynapseWidget(Synapse.DataFrame, 44fc454f-2b0f-4e61-adb8-cfd34d6f556a)

In [43]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

import pyspark.sql.functions as F 

def write_df_to_csv_blob(df, out_csv):
    # See https://github.com/Azure-Samples/Synapse/tree/main/Notebooks/PySpark

    # Azure storage account info
    blob_account_name   = 'cjoakimstorage'
    blob_container_name = 'synapse'
    blob_relative_path  = 'graphframes'
    linked_service_name = 'cjoakimstorageAzureBlobStorage'

    blob_sas_token = mssparkutils.credentials.getConnectionStringOrCreds(linked_service_name)
    #print('blob_sas_token: {}'.format(blob_sas_token))

    # Allow Spark to access from Blob remotely
    wasbs_path = 'wasbs://%s@%s.blob.core.windows.net/%s' % (
        blob_container_name, blob_account_name, blob_relative_path)

    spark.conf.set('fs.azure.sas.%s.%s.blob.core.windows.net' % (
        blob_container_name, blob_account_name), blob_sas_token)

    csv_path = '{}{}'.format(wasbs_path, out_csv)

    print('wasbs_path: ' + wasbs_path)
    print('csv_path:   ' + csv_path)

    # Write to blob storage, coalesce it into one CSV file
    df.coalesce(1).write.csv(csv_path, mode='overwrite', header='true')
    print('written')

def write_df_to_json_blob(df, out):
    # See https://github.com/Azure-Samples/Synapse/tree/main/Notebooks/PySpark

    # Azure storage account info
    blob_account_name   = 'cjoakimstorage'
    blob_container_name = 'synapse'
    blob_relative_path  = 'graphframes'
    linked_service_name = 'cjoakimstorageAzureBlobStorage'

    blob_sas_token = mssparkutils.credentials.getConnectionStringOrCreds(linked_service_name)
    #print('blob_sas_token: {}'.format(blob_sas_token))

    # Allow Spark to access from Blob remotely
    wasbs_path = 'wasbs://%s@%s.blob.core.windows.net/%s' % (
        blob_container_name, blob_account_name, blob_relative_path)

    spark.conf.set('fs.azure.sas.%s.%s.blob.core.windows.net' % (
        blob_container_name, blob_account_name), blob_sas_token)

    out_path = '{}{}'.format(wasbs_path, out)

    print('wasbs_path: ' + wasbs_path)
    print('out_path:   ' + out_path)

    # Write to blob storage, coalesce it into one file
    df.coalesce(1).write.json(out_path, mode='overwrite')
    print('written')


StatementMeta(poolspark3s, 31, 2, Finished, Available)

In [44]:
%%pyspark

# https://docs.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-development-using-notebooks

df_v.createOrReplaceTempView('dfv')
print('df_v cached')

df_e.createOrReplaceTempView('dfe')
print('df_e cached')


StatementMeta(poolspark3s, 31, 3, Finished, Available)

df_v cached
df_e cached

In [45]:
%%sql
select * from dfv limit 3


StatementMeta(poolspark3s, 31, 4, Finished, Available)

<Spark SQL result set with 3 rows and 4 fields>

In [46]:
%%sql
select * from dfe limit 3

StatementMeta(poolspark3s, 31, 5, Finished, Available)

<Spark SQL result set with 3 rows and 4 fields>

In [47]:
%%spark

val df_v = spark.sql("select * from dfv")
df_v.printSchema()


val df_e = spark.sql("select * from dfe")
df_e.printSchema()


StatementMeta(poolspark3s, 31, 7, Finished, Available)

df_v: org.apache.spark.sql.DataFrame = [id: string, label: string ... 2 more fields]
root
 |-- id: string (nullable = true)
 |-- label: string (nullable = true)
 |-- name: string (nullable = true)
 |-- attributes: string (nullable = true)

df_e: org.apache.spark.sql.DataFrame = [src: string, dst: string ... 2 more fields]
root
 |-- src: string (nullable = true)
 |-- dst: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- attributes: string (nullable = true)



In [48]:
%%pyspark 

# Create the GraphFrame, g, from the Vertices DataFrame and Edges DataFrame

from graphframes import *

g = GraphFrame(df_v, df_e)
print('done')


StatementMeta(poolspark3s, 31, 8, Finished, Available)

done

In [49]:
# inDegrees

display(g.inDegrees)

StatementMeta(poolspark3s, 31, 9, Finished, Available)

SynapseWidget(Synapse.DataFrame, cea4853c-0f88-4438-a651-7f634d751798)

In [50]:
# Search the paths from Kevin Bacon (nm0000102) to Charlotte Rampling (nm0001648).

paths = g.bfs("id = 'nm0000102'", "id = 'nm0001648'")
paths.show()


StatementMeta(poolspark3s, 31, 10, Finished, Available)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                from|                  e0|                  v1|                  e1|                  v2|                  e2|                  v3|                  e3|                  to|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|{nm0000102, Perso...|{nm0000102, tt127...|{tt1270798, Movie...|{tt1270798, nm222...|{nm2225369, Perso...|{nm2225369, tt287...|{tt2873282, Movie...|{tt2873282, nm000...|{nm0001648, Perso...|
|{nm0000102, Perso...|{nm0000102, tt009...|{tt0097125, Movie...|{tt0097125, nm094...|{nm0949744, Perso...|{nm0949744, tt038...|{tt0381690, Movie...|{tt0381690, nm000...|{nm0001648, Perso...|
|{nm0000102, Perso...|{nm0000102, tt011...|{t

In [51]:
# 

total_degree = g.degrees
in_degree = g.inDegrees
out_degree = g.outDegrees

result = (total_degree.join(in_degree, "id", how="left")
    .join(out_degree, "id", how="left")
    .fillna(0)
    .sort("inDegree", ascending=False))

print(str(type(result)))  # <class 'pyspark.sql.dataframe.DataFrame'>
result.show()

write_df_to_csv_blob(result, 'degrees')


StatementMeta(poolspark3s, 31, 11, Finished, Available)

<class 'pyspark.sql.dataframe.DataFrame'>
+---------+------+--------+---------+
|       id|degree|inDegree|outDegree|
+---------+------+--------+---------+
|nm0103977|   586|     293|      293|
|nm0482320|   474|     237|      237|
|nm0007123|   454|     227|      227|
|nm0149822|   408|     204|      204|
|nm0695177|   348|     174|      174|
|nm0621937|   286|     143|      143|
|nm0451600|   276|     138|      138|
|nm0019382|   260|     130|      130|
|nm0007106|   258|     129|      129|
|nm0154146|   252|     126|      126|
|nm0419688|   252|     126|      126|
|nm0474774|   242|     121|      121|
|nm0000821|   230|     115|      115|
|nm0154164|   230|     115|      115|
|nm0006763|   224|     112|      112|
|nm0158112|   218|     109|      109|
|nm0893449|   218|     109|      109|
|nm0004569|   216|     108|      108|
|nm0595934|   214|     107|      107|
|nm0415549|   210|     105|      105|
+---------+------+--------+---------+
only showing top 20 rows

wasbs_path: wasbs://

In [52]:
# Search the paths from Kevin Bacon (nm0000102) to Charlotte Rampling (nm0001648).

result = g.bfs("id = 'nm0000102'", "id = 'nm0001648'")
result.show()

print(str(type(result)))  # <class 'pyspark.sql.dataframe.DataFrame'>

write_df_to_json_blob(result, 'paths_bacon_rampling')


StatementMeta(poolspark3s, 31, 12, Finished, Available)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                from|                  e0|                  v1|                  e1|                  v2|                  e2|                  v3|                  e3|                  to|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|{nm0000102, Perso...|{nm0000102, tt127...|{tt1270798, Movie...|{tt1270798, nm222...|{nm2225369, Perso...|{nm2225369, tt287...|{tt2873282, Movie...|{tt2873282, nm000...|{nm0001648, Perso...|
|{nm0000102, Perso...|{nm0000102, tt009...|{tt0097125, Movie...|{tt0097125, nm094...|{nm0949744, Perso...|{nm0949744, tt038...|{tt0381690, Movie...|{tt0381690, nm000...|{nm0001648, Perso...|
|{nm0000102, Perso...|{nm0000102, tt011...|{t

In [53]:
# Search the Breadth First Search (bfs) paths from Lori Singer (nm0001742) to Charlotte Rampling (nm0001648).

paths = g.bfs("id = 'nm0001742'", "id = 'nm0001648'")
paths.show()

StatementMeta(poolspark3s, 31, 13, Finished, Available)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                from|                  e0|                  v1|                  e1|                  v2|                  e2|                  v3|                  e3|                  to|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|{nm0001742, Perso...|{nm0001742, tt010...|{tt0109765, Movie...|{tt0109765, nm000...|{nm0000620, Perso...|{nm0000620, tt009...|{tt0092563, Movie...|{tt0092563, nm000...|{nm0001648, Perso...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+

In [54]:
# Search the paths from Kevin Bacon (nm0001742) to Charlotte Rampling (nm0001648).

paths = g.shortestPaths(['nm0000102','nm0001648'])
paths.show()

write_df_to_csv_blob(paths, 'shortest_paths.csv')

#  CSV data source does not support map<string,int> data type


StatementMeta(poolspark3s, 31, 14, Finished, Available)

AnalysisException: CSV data source does not support map<string,int> data type.