In [3]:
from lib.session import get_spark_session

# Obtain the Spark session used by every cell below through the project helper.
# The string is passed through unchanged (presumably the application name —
# confirm in lib.session).
session_label = "challenge 0 - "
spark = get_spark_session(session_label)

# Bare expression so the notebook renders the session object as the cell output.
spark

In [6]:
# create auto increment column
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, monotonically_increasing_id

# Three-row sample frame that receives a sequential, 1-based "index" column.
sample_rows = [("Alice", 1), ("Bob", 2), ("Charlie", 3)]
df = spark.createDataFrame(sample_rows, ["Name", "Value"])

# row_number() must run over an ordered window; ordering by
# monotonically_increasing_id() supplies that ordering. No partitionBy is
# set, so a single window spans the whole frame.
wind = Window.orderBy(monotonically_increasing_id())

indexed_df = df.withColumn("index", row_number().over(wind))
indexed_df.show()

+-------+-----+-----+
|   Name|Value|index|
+-------+-----+-----+
|  Alice|    1|    1|
|    Bob|    2|    2|
|Charlie|    3|    3|
+-------+-----+-----+



In [8]:
# lists to dataframe
# Two parallel lists: the i-th entries of each are paired to form one row.
list1 = ["a", "b", "c", "d"]
list2 = [1, 2, 3, 4]

# Pair the lists element-wise, distribute the pairs as an RDD, then name the
# two resulting columns.
paired_rows = list(zip(list1, list2))
rdd_from_list = spark.sparkContext.parallelize(paired_rows)
df_from_list = rdd_from_list.toDF(["col1", "col2"])
df_from_list.show()

+----+----+
|col1|col2|
+----+----+
|   a|   1|
|   b|   2|
|   c|   3|
|   d|   4|
+----+----+



In [18]:
# Elements of A that do not exist in B, plus the symmetric difference of A and B
list_A = [1, 2, 3, 4, 5]
list_B = [4, 5, 6, 7, 8]

ls_a = spark.sparkContext.parallelize(list_A)
ls_b = spark.sparkContext.parallelize(list_B)

# RDD operations used below:
#   subtract: elements that exist in this RDD but not in the other one
#   union:    all elements of both RDDs
#   collect:  returns the RDD's elements to the driver as a Python list
#             (on a DataFrame it returns a list of Row objects)

# in A not in B
not_in_B = ls_a.subtract(ls_b)
# The order of collected elements after subtract depends on how the data is
# partitioned, so sort on the driver to keep the printed result stable.
diff = sorted(not_in_B.collect())
print(diff)


# in A not in B or in B not in A (symmetric difference)
not_in_A = ls_b.subtract(ls_a)

print(sorted(not_in_B.union(not_in_A).collect()))

[1, 2, 3]
[1, 2, 3, 6, 7, 8]


In [19]:
# quantiles

# (Name, Age) sample rows, built by pairing the ten names with their ages.
data = list(zip("ABCDEFGHIJ", [10, 20, 30, 40, 50, 15, 28, 54, 41, 86]))
df = spark.createDataFrame(data, ["Name", "Age"])

# approxQuantile(column, probabilities, relativeError):
# a relativeError of 0 asks for exact quantiles, which is the expensive case.
probabilities = [0.0, 0.25, 0.5, 0.75, 1.0]
quantiles = df.approxQuantile("Age", probabilities, 0.01)

print(quantiles)

[10.0, 20.0, 30.0, 50.0, 86.0]


In [23]:
# frequency
from pyspark.sql import Row
from pyspark.sql.functions import count

# One Row per observation; the list holds the observed "name" values in order.
data = [Row(name=letter) for letter in ["a", "b", "b", "a", "c", "c", "c"]]

df = spark.createDataFrame(data)

# Count how many times each distinct name occurs and expose it as "freq".
name_freq = count("name").alias("freq")
df.groupBy("name").agg(name_freq).show()

+----+----+
|name|freq|
+----+----+
|   a|   2|
|   b|   2|
|   c|   3|
+----+----+



In [15]:
# keep only top 2 most frequent values
from pyspark.sql import Row
from pyspark.sql.functions import count, lit, col, when

# Sample data: one Row per (name, job) observation
data = [
    Row(name=person, job=role)
    for person, role in [
        ('John', 'Engineer'),
        ('John', 'Engineer'),
        ('Mary', 'Scientist'),
        ('Bob', 'Engineer'),
        ('Bob', 'Engineer'),
        ('Bob', 'Scientist'),
        ('Sam', 'Doctor'),
    ]
]

# create DataFrame
df = spark.createDataFrame(data)

# Count rows per job, keep the two largest counts, and pull the job names
# back to the driver as a plain Python list.
job_counts = df.groupBy('job').agg(count(lit(1)).alias('freq'))
top_two_jobs = job_counts.orderBy(col('freq').desc()).limit(2).select('job')
most_2freq = top_two_jobs.rdd.flatMap(lambda row: row).collect()

# Keep the job when it is one of the two most frequent; otherwise label it 'other'.
is_frequent_job = col('job').isin(most_2freq)
relabelled_job = when(is_frequent_job, col('job')).otherwise(lit('other'))
df.withColumn('job', relabelled_job).show()

+----+---------+
|name|      job|
+----+---------+
|John| Engineer|
|John| Engineer|
|Mary|Scientist|
| Bob| Engineer|
| Bob| Engineer|
| Bob|Scientist|
| Sam|    other|
+----+---------+



In [20]:
# remove rows with null in a column
# Sample rows in which both "Value" and "id" contain missing entries.
rows_with_nulls = [
    ("A", 1, None),
    ("B", None, "123"),
    ("B", 3, "456"),
    ("D", None, None),
]
df = spark.createDataFrame(rows_with_nulls, ["Name", "Value", "id"])

# Show the frame before filtering.
df.show()

# Drop only the rows whose "id" is null; nulls in other columns are kept.
df.na.drop(subset=['id']).show()

+----+-----+----+
|Name|Value|  id|
+----+-----+----+
|   A|    1|null|
|   B| null| 123|
|   B|    3| 456|
|   D| null|null|
+----+-----+----+

+----+-----+---+
|Name|Value| id|
+----+-----+---+
|   B| null|123|
|   B|    3|456|
+----+-----+---+



In [22]:
# rename columns based on old new names lists
df = spark.createDataFrame([(1, 2, 3), (4, 5, 6)], ["col1", "col2", "col3"])

# old column names: col1, col2, col3
old_names = [f"col{position}" for position in range(1, 4)]

# new column names: each old name prefixed with "new_"
new_names = [f"new_{column}" for column in old_names]

# Pair old and new names by position and rename one column per iteration.
rename_map = dict(zip(old_names, new_names))
for old, new in rename_map.items():
    df = df.withColumnRenamed(old, new)

df.show()

+--------+--------+--------+
|new_col1|new_col2|new_col3|
+--------+--------+--------+
|       1|       2|       3|
|       4|       5|       6|
+--------+--------+--------+



In [24]:
# stack two dataframes
# First DataFrame: third column is named Col_2
rows_A = [("apple", 3, 5), ("banana", 1, 10), ("orange", 2, 8)]
df_A = spark.createDataFrame(rows_A, ["Name", "Col_1", "Col_2"])

# Second DataFrame: third column is named Col_3
rows_B = [("apple", 3, 5), ("banana", 1, 15), ("grape", 4, 6)]
df_B = spark.createDataFrame(rows_B, ["Name", "Col_1", "Col_3"])

# union stacks rows by column position and keeps df_A's column names, so
# df_B's Col_3 values end up under Col_2 in the result.
stacked_df = df_A.union(df_B)
stacked_df.show()

+------+-----+-----+
|  Name|Col_1|Col_2|
+------+-----+-----+
| apple|    3|    5|
|banana|    1|   10|
|orange|    2|    8|
| apple|    3|    5|
|banana|    1|   15|
| grape|    4|    6|
+------+-----+-----+



In [29]:
# capitalize the first letter of each name and add a column with the name's length
from pyspark.sql.functions import initcap, length

# Single-column rows holding lowercase names.
data = [(first_name,) for first_name in ("john", "alice", "bob")]
df = spark.createDataFrame(data, ["name"])

# Capitalize the name first, then measure the length of the resulting value.
capitalized_df = df.withColumn('name', initcap('name'))
capitalized_df.withColumn('length', length('name')).show()

+-----+------+
| name|length|
+-----+------+
| John|     4|
|Alice|     5|
|  Bob|     3|
+-----+------+



In [34]:
# difference with previous salary
from pyspark.sql.window import Window
# Import every function this cell uses (including col) so the cell does not
# depend on imports made by another cell.
from pyspark.sql.functions import coalesce, col, lag, lit, monotonically_increasing_id

# (name, age, salary) sample rows, in the order the differences are taken.
data = [('James', 34, 55000),
('Michael', 30, 70000),
('Robert', 37, 60000),
('Maria', 29, 80000),
('Jen', 32, 65000)]

df = spark.createDataFrame(data, ["name", "age" , "salary"])
# Tag each row with an increasing (not necessarily consecutive) id so the
# window below has a column to order by.
df = df.withColumn("id", monotonically_increasing_id())
w = Window.orderBy("id")

# Salary of the preceding row in id order; null for the first row.
df_with_prev = df.withColumn("prev_salary", lag('salary').over(w))

# The first row has no predecessor: treat the missing previous salary as 0,
# so its difference equals its own salary.
salary_diff_df = df_with_prev.withColumn(
    "diff_from_prev",
    col('salary') - coalesce(col('prev_salary'), lit(0)),
)
salary_diff_df.show()

+-------+---+------+------------+-----------+--------------+
|   name|age|salary|          id|prev_salary|diff_from_prev|
+-------+---+------+------------+-----------+--------------+
|  James| 34| 55000| 25769803776|       null|         55000|
|Michael| 30| 70000| 51539607552|      55000|         15000|
| Robert| 37| 60000| 77309411328|      70000|        -10000|
|  Maria| 29| 80000|103079215104|      60000|         20000|
|    Jen| 32| 65000|128849018880|      80000|        -15000|
+-------+---+------+------------+-----------+--------------+

