<a href="https://colab.research.google.com/github/dineshkumarDE/learnPython/blob/main/pysparkLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**PySpark learning**

# Day 1

In [2]:
!pip install pyspark



In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum,desc

In [9]:
# Obtain a Spark session for this notebook (getOrCreate returns the existing
# session when one is already running).
spark = (
    SparkSession.builder
    .appName("TopCustomers")
    .getOrCreate()
)

# One (customer_id, transaction_amount) tuple per transaction.
data = [
    ("C001", 500),
    ("C002", 1000),
    ("C003", 750),
    ("C001", 300),
    ("C002", 500),
    ("C003", 250),
    ("C004", 200),
]

columns = ["customer_id", "transaction_amount"]

# Column names are taken from `columns`, in tuple order.
df = spark.createDataFrame(data, schema=columns)
df.show()

+-----------+------------------+
|customer_id|transaction_amount|
+-----------+------------------+
|       C001|               500|
|       C002|              1000|
|       C003|               750|
|       C001|               300|
|       C002|               500|
|       C003|               250|
|       C004|               200|
+-----------+------------------+



In [18]:
# Top 3 customers by total transaction amount.
# `sum` and `desc` are the pyspark.sql.functions versions imported earlier in
# the notebook, not the Python builtins.
(
    df.groupBy("customer_id")
    .agg(sum("transaction_amount").alias("total_amt"))
    .orderBy(desc("total_amt"))
    .limit(3)
    .show()
)



+-----------+---------+
|customer_id|total_amt|
+-----------+---------+
|       C002|     1500|
|       C003|     1000|
|       C001|      800|
+-----------+---------+



# Day 2

In [25]:
from pyspark.sql.functions import min,max

In [19]:


# One (user_id, login_time) tuple per login; login_time is a
# 'yyyy-MM-dd HH:mm:ss' formatted string.
data = [
    ("U001", "2024-03-10 08:00:00"),
    ("U002", "2024-03-10 09:30:00"),
    ("U001", "2024-03-10 12:45:00"),
    ("U002", "2024-03-10 14:00:00"),
    ("U003", "2024-03-10 11:15:00"),
    ("U001", "2024-03-10 18:30:00"),
    ("U003", "2024-03-10 22:00:00"),
]

# Column names, in the same order as the tuple fields above.
columns = ["user_id", "login_time"]

In [20]:
# Build the login DataFrame from the `data` tuples, naming columns per `columns`.
df1 = spark.createDataFrame(data, schema=columns)
df1.show()

+-------+-------------------+
|user_id|         login_time|
+-------+-------------------+
|   U001|2024-03-10 08:00:00|
|   U002|2024-03-10 09:30:00|
|   U001|2024-03-10 12:45:00|
|   U002|2024-03-10 14:00:00|
|   U003|2024-03-10 11:15:00|
|   U001|2024-03-10 18:30:00|
|   U003|2024-03-10 22:00:00|
+-------+-------------------+



In [29]:
# First and last login per user.
# login_time was created from Python strings, so min/max compare text; that
# matches chronological order for the zero-padded 'yyyy-MM-dd HH:mm:ss' values
# used here.
(
    df1.groupBy("user_id")
    .agg(
        min("login_time").alias("first_login"),
        max("login_time").alias("last_login"),
    )
    .show()
)

+-------+-------------------+-------------------+
|user_id|        first_login|         last_login|
+-------+-------------------+-------------------+
|   U001|2024-03-10 08:00:00|2024-03-10 18:30:00|
|   U002|2024-03-10 09:30:00|2024-03-10 14:00:00|
|   U003|2024-03-10 11:15:00|2024-03-10 22:00:00|
+-------+-------------------+-------------------+



# Day 4

In [82]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank,lag,lead,col,to_timestamp,to_unix_timestamp,coalesce,when

In [30]:
# One (user_id, event, event_timestamp) tuple per user event; the timestamp is
# a 'yyyy-MM-dd HH:mm:ss' formatted string at this stage.
data = [
    ("U001", "login",    "2024-03-18 10:00:00"),
    ("U001", "click",    "2024-03-18 10:00:05"),
    ("U001", "purchase", "2024-03-18 10:10:00"),
    ("U002", "login",    "2024-03-18 10:15:00"),
    ("U002", "click",    "2024-03-18 10:15:30"),
    ("U003", "login",    "2024-03-18 10:20:00"),
    ("U003", "click",    "2024-03-18 10:20:08"),
]
df2 = spark.createDataFrame(
    data,
    schema=["user_id", "event", "event_timestamp"],
)

In [61]:
# Convert event_timestamp from a 'yyyy-MM-dd HH:mm:ss' string to Unix epoch
# seconds so later cells can subtract timestamps directly.
#
# This cell overwrites df2 in place, so it is guarded on the column's current
# type: without the guard, running the cell a second time would feed the
# already-converted long values back through to_unix_timestamp.
if dict(df2.dtypes)["event_timestamp"] == "string":
    df2 = df2.withColumn("event_timestamp", to_unix_timestamp("event_timestamp"))
df2.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- event: string (nullable = true)
 |-- event_timestamp: long (nullable = true)



In [92]:
# Find events that happened less than MAX_GAP_SECONDS after the same user's
# previous event. event_timestamp is expected to hold epoch seconds here (see
# the conversion cell above).
MAX_GAP_SECONDS = 10

# Per-user window ordered by time, so lag/lead give the neighbouring events.
window_spec = Window.partitionBy("user_id").orderBy("event_timestamp")
prev_ts = lag("event_timestamp", 1).over(window_spec)
next_ts = lead("event_timestamp", 1).over(window_spec)

df3 = df2.select(
    "user_id",
    prev_ts.alias("prev_event_timestamp"),
    "event_timestamp",
    next_ts.alias("next_event_timestamp"),
    # Subtract the lag expression itself rather than the alias defined in this
    # same select, so the query does not depend on lateral column alias
    # resolution. For a user's first event the lag is null, so timediff is
    # null (no previous event) instead of the raw epoch value that
    # substituting 0 would produce.
    (col("event_timestamp") - prev_ts).alias("timediff"),
)

# A null timediff never satisfies the comparison, so first events are excluded.
df3.filter(col("timediff") < MAX_GAP_SECONDS).select("user_id", "timediff").show()

+-------+--------+
|user_id|timediff|
+-------+--------+
|   U001|       5|
|   U003|       8|
+-------+--------+

