### Library Imports

In [15]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from datetime import datetime

Create a `SparkSession`. No need to create `SparkContext` as you automatically get it as part of the `SparkSession`.

In [3]:
spark = SparkSession.builder \
    .master("local") \
    .appName("Exploring Joins") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

sc = spark.sparkContext

In [58]:
df = spark.createDataFrame(
    [
        (1, 'facebook.com'),
        (1, 'facebook.com'),
        (2, 'snapchat.com'),
        (2, None),
        (None, 'twitter.com'),
    ], ['shop_id', 'shop_domain']
)

df.toPandas()

Unnamed: 0,shop_id,shop_domain
0,1.0,facebook.com
1,1.0,facebook.com
2,2.0,snapchat.com
3,2.0,
4,,twitter.com


In [101]:
join_keys = ['shop_id', 'shop_domain']

df = df.withColumn('natural_key', F.struct(*(F.col(col) for col in join_keys)))
df.toPandas()

Unnamed: 0,shop_id,shop_domain,join_key,natural_key
0,1.0,facebook.com,"(1, facebook.com)","(1, facebook.com)"
1,1.0,facebook.com,"(1, facebook.com)","(1, facebook.com)"
2,2.0,snapchat.com,"(2, snapchat.com)","(2, snapchat.com)"
3,2.0,,"(2, None)","(2, None)"
4,,twitter.com,"(None, twitter.com)","(None, twitter.com)"


In [94]:
freq_df = df.freqItems(
    ['join_key'], 
    support=0.6
).withColumn('temp', F.explode('join_key_freqItems'))
freq_df.toPandas()

Unnamed: 0,join_key_freqItems,temp
0,"[(None, twitter.com)]","(None, twitter.com)"
