In [1]:
import time
import pandas as pd

from pyspark.sql import functions as F
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Pyspark demo") \
    .master("local[*]").getOrCreate()
sc = spark.sparkContext
rdd = sc.textFile("logs.txt")
for line in rdd.collect():
    print(line)


In [2]:
rdd = sc.textFile("customer.json")

In [3]:
rdd.collect()

['{"cust_id":101,"name":"Amit","city":"Delhi"}',
 '{"cust_id":102,"name":"Ravi","city":"Mumbai"}',
 '{"cust_id":103,"name":"Neha","city":"Pune"}']

In [4]:
rdd = sc.textFile("logs.txt")

In [6]:
for line in rdd.collect():
    print(line)

2026-02-07 10:15:20 INFO user=101 action=login status=success
2026-02-07 10:16:05 ERROR user=102 action=payment status=failed
2026-02-07 10:17:11 WARN user=103 action=upload status=slow
2026-02-07 10:18:50 INFO user=104 action=logout status=success
2026-02-07 10:19:33 ERROR user=105 action=payment status=failed


In [7]:
type(rdd)

pyspark.core.rdd.RDD

In [8]:
rdd.cache

<bound method RDD.cache of logs.txt MapPartitionsRDD[3] at textFile at NativeMethodAccessorImpl.java:0>

In [13]:
df = spark.read.option("header", "true") \
               .option("inferSchema", "true") \
               .option("delimiter", ",") \
               .csv("emp_details_source.csv")


In [14]:
df.show()

+----+--------------+----------+------+------+
| eno|         ename|       doj|deptno|salary|
+----+--------------+----------+------+------+
|1001|  Arjun Sharma|2022-01-15|    10|  1000|
|1002|   Priya Patel|2023-03-22|    20| 67387|
|1003|    Ravi Kumar|2021-05-10|    30| 70421|
|1004|   Sneha Gupta|2020-08-25|    40| 68902|
|1005|    Anil Reddy|2024-02-13|    50| 78005|
|1006|    Neha Verma|2019-11-30|    60| 59843|
|1007|   Vijay Singh|2022-07-08|    10|  1000|
|1008|   Ananya Iyer|2021-03-15|    20| 74625|
|1009|  Manish Joshi|2020-12-19|    30| 67560|
|1010|   Simran Kaur|2023-01-11|    40| 68902|
|1011|  Rajesh Mehta|2021-09-01|    50| 62497|
|1012| Deepika Desai|2020-06-10|    60| 70179|
|1013|Shivani Sharma|2022-09-13|    10| 74363|
|1014|   Harish Nair|2023-04-01|    20| 61487|
|1015|   Komal Yadav|2021-11-23|    30| 61894|
|1016|    Ajay Bhatt|2020-04-25|    40| 70109|
|1017|   Pooja Malik|2022-02-28|    50| 77450|
|1018|  Raghav Joshi|2021-06-14|    60| 62873|
|1019|    Mee

In [18]:
df.orderBy(df.salary.asc()).show()

+----+---------------+----------+------+------+
| eno|          ename|       doj|deptno|salary|
+----+---------------+----------+------+------+
|1001|   Arjun Sharma|2022-01-15|    10|  1000|
|1007|    Vijay Singh|2022-07-08|    10|  1000|
|1019|     Meera Nair|2020-09-17|    10|  1000|
|1020|    Krishna Rao|2023-10-02|    20|  2000|
|1034|     Sonal Shah|2023-08-13|    20|  2000|
|1006|     Neha Verma|2019-11-30|    60| 59843|
|1050|  Sandeep Yadav|2021-10-12|    60| 60921|
|1023|     Vikas Soni|2020-11-15|    30| 60933|
|1014|    Harish Nair|2023-04-01|    20| 61487|
|1015|    Komal Yadav|2021-11-23|    30| 61894|
|1011|   Rajesh Mehta|2021-09-01|    50| 62497|
|1018|   Raghav Joshi|2021-06-14|    60| 62873|
|1036|   Shweta Yadav|2021-06-03|    40| 63559|
|1032|     Geeta Rani|2022-09-07|    60| 63916|
|1040|Pratibha Sharma|2021-09-05|    20| 65072|
|1048|  Meenal Pandey|2021-07-03|    40| 66228|
|1027|  Harshit Kumar|2022-08-16|    10| 66535|
|1025|  Nitin Chauhan|2020-03-30|    50|

In [19]:
df.groupBy('deptno').agg(F.sum('salary').alias('sum_Values')).show()

+------+----------+
|deptno|sum_Values|
+------+----------+
|    20|    477820|
|    40|    545880|
|    10|    426545|
|    50|    570135|
|    60|    525908|
|    30|    536809|
+------+----------+



In [21]:
df.groupBy('deptno').agg( F.sum('salary').alias('sum'), F.mean('salary').alias('mean'), F.avg('salary').alias('avg') ).show()

+------+------+-----------------+-----------------+
|deptno|   sum|             mean|              avg|
+------+------+-----------------+-----------------+
|    20|477820|53091.11111111111|53091.11111111111|
|    40|545880|          68235.0|          68235.0|
|    10|426545|47393.88888888889|47393.88888888889|
|    50|570135|        71266.875|        71266.875|
|    60|525908|          65738.5|          65738.5|
|    30|536809|        67101.125|        67101.125|
+------+------+-----------------+-----------------+



In [22]:
filtered = df.filter(df.salary > 60000)

In [23]:
filtered.show()

+----+------------------+----------+------+------+
| eno|             ename|       doj|deptno|salary|
+----+------------------+----------+------+------+
|1002|       Priya Patel|2023-03-22|    20| 67387|
|1003|        Ravi Kumar|2021-05-10|    30| 70421|
|1004|       Sneha Gupta|2020-08-25|    40| 68902|
|1005|        Anil Reddy|2024-02-13|    50| 78005|
|1008|       Ananya Iyer|2021-03-15|    20| 74625|
|1009|      Manish Joshi|2020-12-19|    30| 67560|
|1010|       Simran Kaur|2023-01-11|    40| 68902|
|1011|      Rajesh Mehta|2021-09-01|    50| 62497|
|1012|     Deepika Desai|2020-06-10|    60| 70179|
|1013|    Shivani Sharma|2022-09-13|    10| 74363|
|1014|       Harish Nair|2023-04-01|    20| 61487|
|1015|       Komal Yadav|2021-11-23|    30| 61894|
|1016|        Ajay Bhatt|2020-04-25|    40| 70109|
|1017|       Pooja Malik|2022-02-28|    50| 77450|
|1018|      Raghav Joshi|2021-06-14|    60| 62873|
|1021|    Prakash Pillai|2022-12-11|    10| 73569|
|1022|Aishwarya Deshmukh|2021-0

In [26]:
filtered.groupBy('deptno').agg(F.count('salary').alias('total_salary')).show()

+------+------------+
|deptno|total_salary|
+------+------------+
|    20|           7|
|    40|           8|
|    10|           6|
|    50|           8|
|    60|           7|
|    30|           8|
+------+------------+

