In [None]:
!pip install pyspark



In [4]:

from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.types import StructType,StructField,StringType,IntegerType
from pyspark.sql.window import Window


In [5]:
# Session handle used by every cell below. getOrCreate() hands back the
# already-active session when one exists, so re-running this cell is harmless.
spark = SparkSession.builder.appName("DineshPractice.me").getOrCreate()


In [6]:
# Explicit schema for the login records; every column is declared nullable.
# Note that login_date is declared as a plain string, not a date type.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("user_id", IntegerType()),
        ("kit_id", IntegerType()),
        ("login_date", StringType()),
        ("sessions_count", IntegerType()),
    )
])

In [7]:
# Sample login records, one tuple per row, in the column order
# (user_id, kit_id, login_date, sessions_count).
data = [
    # user 1
    (1, 2, "2016-03-01", 5),
    (1, 2, "2016-03-02", 6),
    # user 2
    (2, 3, "2017-06-25", 1),
    # user 3
    (3, 1, "2016-03-02", 0),
    (3, 4, "2018-07-03", 5),
]

In [8]:
# Materialise the sample rows as a DataFrame using the explicit schema,
# then print it to confirm the rows and column names.
input_df = spark.createDataFrame(data, schema)
input_df.show()

+-------+------+----------+--------------+
|user_id|kit_id|login_date|sessions_count|
+-------+------+----------+--------------+
|      1|     2|2016-03-01|             5|
|      1|     2|2016-03-02|             6|
|      2|     3|2017-06-25|             1|
|      3|     1|2016-03-02|             0|
|      3|     4|2018-07-03|             5|
+-------+------+----------+--------------+



In [9]:
# Approach 1: earliest login per user via a plain aggregation.
# login_date is a string column, so min() compares lexicographically; that
# coincides with chronological order for the zero-padded yyyy-MM-dd values used here.
GroupedDF = (
    input_df
    .groupBy("user_id")
    .agg(f.min(f.col("login_date")).alias("first_login"))
)
GroupedDF.show()

+-------+-----------+
|user_id|first_login|
+-------+-----------+
|      1| 2016-03-01|
|      2| 2017-06-25|
|      3| 2016-03-02|
+-------+-----------+



In [10]:
# Approach 2: rank each user's logins by date so the other columns of the
# earliest row (e.g. kit_id) stay available, unlike with groupBy/min.
# Ordering is on the string login_date column (lexicographic).
windowSpec = Window.partitionBy(f.col("user_id")).orderBy(f.col("login_date"))

# rank() gives equal login_date values the same rank, so a user with several
# rows on their earliest date would end up with several rank-1 rows.
rankDf = input_df.withColumn("rank", f.rank().over(windowSpec))
rankDf.show()


+-------+------+----------+--------------+----+
|user_id|kit_id|login_date|sessions_count|rank|
+-------+------+----------+--------------+----+
|      1|     2|2016-03-01|             5|   1|
|      1|     2|2016-03-02|             6|   2|
|      2|     3|2017-06-25|             1|   1|
|      3|     1|2016-03-02|             0|   1|
|      3|     4|2018-07-03|             5|   2|
+-------+------+----------+--------------+----+



In [12]:
# Keep only each user's rank-1 (earliest) login row and expose its date
# under the name first_login; the rank column is retained in the output.
result = (
    rankDf
    .select(
        "user_id",
        "kit_id",
        f.col("login_date").alias("first_login"),
        "rank",
    )
    .where(f.col("rank") == 1)
)
result.show()

+-------+------+-----------+----+
|user_id|kit_id|first_login|rank|
+-------+------+-----------+----+
|      1|     2| 2016-03-01|   1|
|      2|     3| 2017-06-25|   1|
|      3|     1| 2016-03-02|   1|
+-------+------+-----------+----+

