In [1]:
import os
from pyspark.sql import SparkSession

# Point PySpark at the local Spark install and Python interpreters.
# These values are hard-coded and machine-specific; each key is overwritten
# unconditionally, replacing whatever the surrounding environment had set.
os.environ.update(
    {
        "SPARK_HOME": "/mnt/d/Ubuntu/Programs/spark-3.1.1-bin-hadoop3.2",
        "PYSPARK_PYTHON": "/usr/bin/python3",
        "PYSPARK_DRIVER_PYTHON": "python3",
        "PYSPARK_SUBMIT_ARGS": "pyspark-shell",
    }
)

In [2]:
# Location of the sample CSV that is loaded into a DataFrame later in the notebook.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
FILE_PATH = '/mnt/d/Projects/BigData/notebooks/data/test.csv'

In [3]:
# Master URL handed to the session builder; "local" keeps everything on this machine.
master = "local"

# Build the session used by every following cell (or pick up one that already exists).
spark = (
    SparkSession.builder
    .master(master)
    .appName("spark_test")
    .getOrCreate()
)

[Open Spark jobs](http://localhost:4040/jobs/)

In [4]:
# Read the CSV at FILE_PATH into a DataFrame:
#   header='true'      -> the first line supplies the column names
#   inferSchema='true' -> column types are inferred from the data
df = (
    spark.read
    .format('csv')
    .options(inferSchema='true', header='true')
    .load(FILE_PATH)
)

# Print the loaded rows as a text table.
df.show()

+---------+--------+------+------+
|firstname|lastname|gender|salary|
+---------+--------+------+------+
|    James|   Smith|     M|    30|
|     Anna|    Rose|     F|    41|
|   Robert|Williams|     M|    62|
+---------+--------+------+------+



In [5]:
def _describe_row(x):
    """Render one row as a bracketed text summary with its salary doubled."""
    return f'[{x.firstname} {x.lastname}, gender={x.gender}, salary={x.salary * 2}]'


# Apply the formatter to every row through the underlying RDD and bring the
# resulting strings back to the driver as a Python list.
df.rdd.map(_describe_row).collect()

['[James Smith, gender=M, salary=60]',
 '[Anna Rose, gender=F, salary=82]',
 '[Robert Williams, gender=M, salary=124]']

In [6]:
# Print the DataFrame schema as a tree: each column's name, type and nullability.
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [7]:
# Column names of the DataFrame, as a plain Python list.
df.columns

['firstname', 'lastname', 'gender', 'salary']

In [8]:
# Summary statistics (count, mean, stddev, min, max) for every column;
# mean and stddev come back as null for the string columns.
df.describe().show()

+-------+---------+--------+------+------------------+
|summary|firstname|lastname|gender|            salary|
+-------+---------+--------+------+------------------+
|  count|        3|       3|     3|                 3|
|   mean|     null|    null|  null|44.333333333333336|
| stddev|     null|    null|  null|16.258331197676263|
|    min|     Anna|    Rose|     F|                30|
|    max|   Robert|Williams|     M|                62|
+-------+---------+--------+------+------------------+



In [9]:
# Project just two columns. No action (e.g. show()) is called, so the cell
# displays only the resulting DataFrame's repr, not its rows.
df.select('firstname', 'salary')

DataFrame[firstname: string, salary: int]

In [10]:
# Columns that remain after dropping the name columns. The result is not
# assigned, so `df` itself still has all of its columns.
df.drop('firstname', 'lastname').columns

['gender', 'salary']

In [11]:
# Column names after renaming 'gender' to 'sex'. The renamed DataFrame is not
# assigned, so `df` is left as it was.
df.withColumnRenamed('gender','sex').columns

['firstname', 'lastname', 'sex', 'salary']

In [12]:
# Keep only the name columns, then the rows whose first name starts with "An",
# and print the result. The predicate is a SQL expression string.
(
    df.select('firstname', 'lastname')
    .where('firstname LIKE "An%"')
    .show()
)

+---------+--------+
|firstname|lastname|
+---------+--------+
|     Anna|    Rose|
+---------+--------+



In [14]:
# Shut down the Spark session created earlier in the notebook.
spark.stop()