In [None]:
import os

# Environment variables for the local PySpark setup, assigned in one step
# ahead of the cell that creates the SparkSession.
os.environ.update({
    # Machine-specific JDK 8 install location.
    'JAVA_HOME': 'C:\\Program Files\\Java\\jdk1.8.0_202',
    # Interpreter command used for the PySpark workers and the driver.
    "PYSPARK_PYTHON": "python",
    "PYSPARK_DRIVER_PYTHON": "python",
    # Limit Python worker ports to avoid conflicts.
    # NOTE(review): confirm that Spark actually reads these two variables.
    "PYSPARK_WORKER_PORT": "40000",
    "PYSPARK_WORKER_PORT_RANGE": "40000-50000",
})

In [58]:
from pyspark.sql import SparkSession
# Obtain the SparkSession via the builder's getOrCreate(), using default settings.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [None]:
# Sample (name, relation) records for the first DataFrame
data = [
    ("Dilip", "Husband"),
    ("Charu", "Wife"),
]

In [None]:
# Build a DataFrame from `data`; the DDL-style schema string declares
# both columns as strings.
df = spark.createDataFrame(
    data,
    schema='name string, relation string',
)

In [51]:
# Print the rows of df as a text table.
df.show()

+-----+--------+
| name|relation|
+-----+--------+
|Dilip| Husband|
|Charu|    Wife|
+-----+--------+



In [62]:
from pyspark.sql import Row
from datetime import date

In [None]:
# Two sample people built from Row objects; no schema is passed,
# so column types come from the Row field values.
test = spark.createDataFrame(
    [
        Row(name="dilip", age=23, dob=date(2002, 5, 8)),
        Row(name="Charu", age=24, dob=date(2000, 11, 15)),
    ]
)

In [53]:
# Print the rows of test as a text table.
test.show()

+-----+---+----------+
| name|age|       dob|
+-----+---+----------+
|dilip| 23|2002-05-08|
|Charu| 24|2000-11-15|
+-----+---+----------+



In [54]:
# Print each column's name, type and nullability as a tree.
test.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- dob: date (nullable = true)



In [55]:
# Turn on eager evaluation so that a DataFrame left as the last expression
# of a cell is displayed as a table.
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

In [56]:
# Bare expression: with eager evaluation enabled this displays the rows of df.
df

name,relation
Dilip,Husband
Charu,Wife


In [57]:
# Vertical layout: one block per record, one line per column.
df.show(vertical=True)

-RECORD 0-----------
 name     | Dilip   
 relation | Husband 
-RECORD 1-----------
 name     | Charu   
 relation | Wife    



In [None]:
from pyspark.sql.types import StructType, StructField, StringType, DateType
from datetime import date

# Explicit schema: a string id plus a date column; both fields are nullable.
schema = StructType([
    StructField("id", StringType(), True),
    StructField("event_date", DateType(), True)
])

# The last record has no date, which the nullable event_date field permits.
data = [
    ("A1", date(2023, 10, 26)),
    ("B2", date(2023, 11, 15)),
    ("C3", None)
]

# NOTE: this rebinds the names `data` and `df` used by earlier cells.
df = spark.createDataFrame(data, schema)
# show() prints the table explicitly; print(df) only renders rows while
# spark.sql.repl.eagerEval.enabled is true and otherwise prints the schema repr.
df.show()
df.printSchema()

+---+----------+
| id|event_date|
+---+----------+
| A1|2023-10-26|
| B2|2023-11-15|
| C3|      NULL|
+---+----------+

root
 |-- id: string (nullable = true)
 |-- event_date: date (nullable = true)



In [67]:
# Project only the name and age columns.
test.select("name","age")

name,age
dilip,23
Charu,24


In [68]:
# Summary statistics (count, mean, stddev, min, max) for name and age.
test.select("name","age").describe()

summary,name,age
count,2,2.0
mean,,23.5
stddev,,0.7071067811865476
min,Charu,23.0
max,dilip,24.0


In [None]:
# Turn eager evaluation back off; leaving it on was found to take too much time.
spark.conf.set('spark.sql.repl.eagerEval.enabled', False)

In [71]:
# Return all rows as a Python list of Row objects.
test.collect()

[Row(name='dilip', age=23, dob=datetime.date(2002, 5, 8)),
 Row(name='Charu', age=24, dob=datetime.date(2000, 11, 15))]

In [74]:
pip install pandas

Collecting pandas
  Downloading pandas-2.0.3-cp38-cp38-win_amd64.whl.metadata (18 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.1 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting numpy>=1.20.3 (from pandas)
  Downloading numpy-1.24.4-cp38-cp38-win_amd64.whl.metadata (5.6 kB)
Downloading pandas-2.0.3-cp38-cp38-win_amd64.whl (10.8 MB)
   ---------------------------------------- 0.0/10.8 MB ? eta -:--:--
   - -------------------------------------- 0.5/10.8 MB 5.6 MB/s eta 0:00:02
   ------ --------------------------------- 1.8/10.8 MB 5.0 MB/s eta 0:00:02
   ------------ --------------------------- 3.4/10.8 MB 5.9 MB/s eta 0:00:02
   ---------------- ----------------------- 4.5/10.8 MB 5.6 MB/s eta 0:00:02
   --------------------- ------------------ 5.8/10.8 MB 5.7 MB/s eta 0:00:01
   ------------------------- -------------- 6.8/10.8 MB 5.7 MB/s eta 0:00:01
   -

In [75]:
# Convert the Spark DataFrame to a pandas DataFrame for display.
test.toPandas()

Unnamed: 0,name,age,dob
0,dilip,23,2002-05-08
1,Charu,24,2000-11-15


In [None]:
# Attribute access gives a Column object referring to "name", not its values.
test.name

Column<'name'>

In [77]:
import pyspark.sql.functions as func

In [79]:
# Upper-case the name column, keep age unchanged, and show the result via pandas.
(
    test
    .select(func.upper("name"), "age")
    .toPandas()
)

Unnamed: 0,upper(name),age
0,DILIP,23
1,CHARU,24


In [81]:
# Keep only the rows whose age is below 24 and show the result via pandas.
(
    test
    .filter(test.age < 24)
    .toPandas()
)

Unnamed: 0,name,age,dob
0,dilip,23,2002-05-08
