In [3]:
from lib.session import get_spark_session

# Application name handed to the project's session helper.
app_name = "challenge 0 - "
spark = get_spark_session(app_name)

# Leave the session as the cell's last expression so the notebook displays it.
spark

In [6]:
# Add an auto-increment "index" column to a DataFrame
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, monotonically_increasing_id

rows = [("Alice", 1), ("Bob", 2), ("Charlie", 3)]
df = spark.createDataFrame(rows, ["Name", "Value"])

# The window has no partitionBy, so it spans the whole DataFrame; rows are
# ordered by Spark's generated increasing ids and then numbered over that order.
wind = Window.orderBy(monotonically_increasing_id())
indexed = df.withColumn("index", row_number().over(wind))

indexed.show()

+-------+-----+-----+
|   Name|Value|index|
+-------+-----+-----+
|  Alice|    1|    1|
|    Bob|    2|    2|
|Charlie|    3|    3|
+-------+-----+-----+



In [8]:
# Build a DataFrame from two parallel lists
list1 = ["a", "b", "c", "d"]
list2 = [1, 2, 3, 4]

# Pair the lists element-wise, distribute the pairs as an RDD,
# then convert the RDD to a DataFrame with explicit column names.
pairs = list(zip(list1, list2))
rdd_from_list = spark.sparkContext.parallelize(pairs)
df_from_list = rdd_from_list.toDF(["col1", "col2"])

df_from_list.show()

+----+----+
|col1|col2|
+----+----+
|   a|   1|
|   b|   2|
|   c|   3|
|   d|   4|
+----+----+



In [18]:
# Get the elements of list A that do not exist in list B
list_A = [1, 2, 3, 4, 5]
list_B = [4, 5, 6, 7, 8]

ls_a = spark.sparkContext.parallelize(list_A)
ls_b = spark.sparkContext.parallelize(list_B)

# RDD operations used below:
#   subtract: elements that exist in A but not in B
#   union:    all elements of both RDDs
#   collect:  convert the RDD to a list (or a DataFrame to a list of Rows)
#             and return it to the driver
#
# The order of elements returned by collect() after subtract/union is not
# guaranteed, so the results are sorted to keep the printed output stable.

# in A but not in B
diff = sorted(ls_a.subtract(ls_b).collect())
print(diff)


# symmetric difference: in A but not in B, or in B but not in A
not_in_B = ls_a.subtract(ls_b)
not_in_A = ls_b.subtract(ls_a)


print(sorted(not_in_B.union(not_in_A).collect()))

[1, 2, 3]
[1, 2, 3, 6, 7, 8]


In [19]:
# Approximate quantiles of a numeric column

data = [
    ("A", 10), ("B", 20), ("C", 30), ("D", 40), ("E", 50),
    ("F", 15), ("G", 28), ("H", 54), ("I", 41), ("J", 86),
]
df = spark.createDataFrame(data, ["Name", "Age"])

# approxQuantile takes the column, the quantile probabilities wanted, and a
# relative error; an error of 0 asks for exact quantiles, which is expensive.
probabilities = [0.0, 0.25, 0.5, 0.75, 1.0]
relative_error = 0.01
quantiles = df.approxQuantile("Age", probabilities, relative_error)

print(quantiles)

[10.0, 20.0, 30.0, 50.0, 86.0]


In [23]:
# Frequency of each distinct value in a column
from pyspark.sql import Row
from pyspark.sql.functions import count

# One Row per observation, in the order a, b, b, a, c, c, c.
data = [Row(name=letter) for letter in "abbaccc"]

df = spark.createDataFrame(data)

# Group by the value and count its occurrences under the column name "freq".
name_freq = df.groupBy("name").agg(count("name").alias("freq"))
name_freq.show()

+----+----+
|name|freq|
+----+----+
|   a|   2|
|   b|   2|
|   c|   3|
+----+----+

