**Apache Spark** is a unified analytics engine for large-scale data processing.It is a fast and general purpose cluster computing system.It provides high-level APIs in Scala,Java and Python that make parallel jobs easy to write, and an optimized engine that supports general computation graphs.

- **Speed** - Runs workloads 100x faster
- **Ease of Use** - Write applications quickly in Java,Scala,Python,R and SQL
- **Generality** - Combine SQL,streaming, and complex analytics
- **Runs Everywhere** - Spark runs on Hadoop,Apache Mesos,Kubernetes,standalone or in the cloud.It can access diverse data sources

Spark 3.1.1

In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting py4j==0.10.9.7 (from pyspark)
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425347 sha256=26f06e0a0d5a259698393b22e49969259546cd6d3387f93ee4a364cd9c8c010d
  Stored in directory: /Users/priyathamkrishnakodeboyina/Library/Caches/pip/wheels/38/df/61/8c121f50c3cffd77f8178180dd232d90b3b99d1bd61fb6d6be
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j

In [1]:
import pyspark

In [2]:
import pandas as pd
type(pd.read_csv('test1.csv'))

pandas.core.frame.DataFrame

In [3]:
from pyspark.sql import SparkSession

In [4]:
spark=SparkSession.builder.appName('Practice').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/01 14:47:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
spark

In [7]:
df_pyspark=spark.read.csv('test1.csv')

In [21]:
## read the dataset
df_pyspark=spark.read.option('header','true').csv('test1.csv',inferSchema=True)

In [15]:
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [22]:
## Check the Schema
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [6]:
## read the dataset option 2
df_pyspark=spark.read.csv('test1.csv',header=True,inferSchema=True)
df_pyspark.show()

+-------+----+----------+------+
|   Name| age|Experience|Salary|
+-------+----+----------+------+
|Michael|  25|        10|100000|
|    Tom|  26|         8|150000|
| Antony|  27|         4|175000|
| George|  28|         2|200000|
|   Bill|  29|         5|250000|
|   Mark|  30|         3|300000|
|Patrick|  29|         4|350000|
|   Zhan|NULL|      NULL|400000|
|   NULL|  34|        10|450000|
|   NULL|  36|      NULL|  NULL|
+-------+----+----------+------+



In [25]:
df_pyspark.head(3)

[Row(Name='Michael', age=25, Experience=10),
 Row(Name='Tom', age=26, Experience=8),
 Row(Name='Antony', age=27, Experience=4)]

In [26]:
df_pyspark.show()

+-------+---+----------+
|   Name|age|Experience|
+-------+---+----------+
|Michael| 25|        10|
|    Tom| 26|         8|
| Antony| 27|         4|
+-------+---+----------+



In [31]:
df_pyspark.select(['Name','Experience']).show()

+-------+----------+
|   Name|Experience|
+-------+----------+
|Michael|        10|
|    Tom|         8|
| Antony|         4|
+-------+----------+



In [32]:
df_pyspark['Name']

Column<'Name'>

In [33]:
df_pyspark.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int')]

In [35]:
df_pyspark.describe().show()

+-------+------+----+-----------------+
|summary|  Name| age|       Experience|
+-------+------+----+-----------------+
|  count|     3|   3|                3|
|   mean|  NULL|26.0|7.333333333333333|
| stddev|  NULL| 1.0|3.055050463303893|
|    min|Antony|  25|                4|
|    max|   Tom|  27|               10|
+-------+------+----+-----------------+



In [38]:
## Adding Columns in data frame
df_pyspark=df_pyspark.withColumn('Experience in Python',df_pyspark['Experience']-2)

In [39]:
df_pyspark.show()

+-------+---+----------+--------------------+
|   Name|age|Experience|Experience in Python|
+-------+---+----------+--------------------+
|Michael| 25|        10|                   8|
|    Tom| 26|         8|                   6|
| Antony| 27|         4|                   2|
+-------+---+----------+--------------------+



In [41]:
## Drop the columns
df_pyspark=df_pyspark.drop('Experience in Python')

In [42]:
df_pyspark.show()

+-------+---+----------+
|   Name|age|Experience|
+-------+---+----------+
|Michael| 25|        10|
|    Tom| 26|         8|
| Antony| 27|         4|
+-------+---+----------+



In [44]:
### Rename the columns
df_pyspark.withColumnRenamed('Name','Employee Name').show()

+-------------+---+----------+
|Employee Name|age|Experience|
+-------------+---+----------+
|      Michael| 25|        10|
|          Tom| 26|         8|
|       Antony| 27|         4|
+-------------+---+----------+



In [7]:
## drop the columns
df_pyspark.drop('Name').show()

+----+----------+------+
| age|Experience|Salary|
+----+----------+------+
|  25|        10|100000|
|  26|         8|150000|
|  27|         4|175000|
|  28|         2|200000|
|  29|         5|250000|
|  30|         3|300000|
|  29|         4|350000|
|NULL|      NULL|400000|
|  34|        10|450000|
|  36|      NULL|  NULL|
+----+----------+------+



In [8]:
df_pyspark.show()

+-------+----+----------+------+
|   Name| age|Experience|Salary|
+-------+----+----------+------+
|Michael|  25|        10|100000|
|    Tom|  26|         8|150000|
| Antony|  27|         4|175000|
| George|  28|         2|200000|
|   Bill|  29|         5|250000|
|   Mark|  30|         3|300000|
|Patrick|  29|         4|350000|
|   Zhan|NULL|      NULL|400000|
|   NULL|  34|        10|450000|
|   NULL|  36|      NULL|  NULL|
+-------+----+----------+------+



In [9]:
df_pyspark.na.drop().show()

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|Michael| 25|        10|100000|
|    Tom| 26|         8|150000|
| Antony| 27|         4|175000|
| George| 28|         2|200000|
|   Bill| 29|         5|250000|
|   Mark| 30|         3|300000|
|Patrick| 29|         4|350000|
+-------+---+----------+------+



In [11]:
# any==how
df_pyspark.na.drop(how='any').show()

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|Michael| 25|        10|100000|
|    Tom| 26|         8|150000|
| Antony| 27|         4|175000|
| George| 28|         2|200000|
|   Bill| 29|         5|250000|
|   Mark| 30|         3|300000|
|Patrick| 29|         4|350000|
+-------+---+----------+------+



In [12]:
# threshold
df_pyspark.na.drop(how="any",thresh=2).show()

+-------+----+----------+------+
|   Name| age|Experience|Salary|
+-------+----+----------+------+
|Michael|  25|        10|100000|
|    Tom|  26|         8|150000|
| Antony|  27|         4|175000|
| George|  28|         2|200000|
|   Bill|  29|         5|250000|
|   Mark|  30|         3|300000|
|Patrick|  29|         4|350000|
|   Zhan|NULL|      NULL|400000|
|   NULL|  34|        10|450000|
+-------+----+----------+------+



In [13]:
# Subset
df_pyspark.na.drop(how="any",subset=['Experience']).show()

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|Michael| 25|        10|100000|
|    Tom| 26|         8|150000|
| Antony| 27|         4|175000|
| George| 28|         2|200000|
|   Bill| 29|         5|250000|
|   Mark| 30|         3|300000|
|Patrick| 29|         4|350000|
|   NULL| 34|        10|450000|
+-------+---+----------+------+



In [16]:
# Fill the missing value
df_pyspark.na.fill('Missing Values',['Experience','age','Salary']).show()

+-------+----+----------+------+
|   Name| age|Experience|Salary|
+-------+----+----------+------+
|Michael|  25|        10|100000|
|    Tom|  26|         8|150000|
| Antony|  27|         4|175000|
| George|  28|         2|200000|
|   Bill|  29|         5|250000|
|   Mark|  30|         3|300000|
|Patrick|  29|         4|350000|
|   Zhan|NULL|      NULL|400000|
|   NULL|  34|        10|450000|
|   NULL|  36|      NULL|  NULL|
+-------+----+----------+------+



In [19]:
from pyspark.sql.functions import col
df_pyspark.filter(col("age").isNull()).show()

+----+----+----------+------+
|Name| age|Experience|Salary|
+----+----+----------+------+
|Zhan|NULL|      NULL|400000|
+----+----+----------+------+



In [27]:
filled_df = df_pyspark.na.fill(0, subset=["age"])

In [28]:
filled_df.show()

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|Michael| 25|        10|100000|
|    Tom| 26|         8|150000|
| Antony| 27|         4|175000|
| George| 28|         2|200000|
|   Bill| 29|         5|250000|
|   Mark| 30|         3|300000|
|Patrick| 29|         4|350000|
|   Zhan|  0|      NULL|400000|
|   NULL| 34|        10|450000|
|   NULL| 36|      NULL|  NULL|
+-------+---+----------+------+



In [29]:
df_pyspark.show()

+-------+----+----------+------+
|   Name| age|Experience|Salary|
+-------+----+----------+------+
|Michael|  25|        10|100000|
|    Tom|  26|         8|150000|
| Antony|  27|         4|175000|
| George|  28|         2|200000|
|   Bill|  29|         5|250000|
|   Mark|  30|         3|300000|
|Patrick|  29|         4|350000|
|   Zhan|NULL|      NULL|400000|
|   NULL|  34|        10|450000|
|   NULL|  36|      NULL|  NULL|
+-------+----+----------+------+



In [30]:
from pyspark.ml.feature import Imputer

imputer = Imputer(
    inputCols=['age', 'Experience', 'Salary'],
    outputCols=["{}_imputed".format(c) for c in ['age','Experience','Salary']]).setStrategy("mean")

In [31]:
# Add imputation cols to df
imputer.fit(df_pyspark).transform(df_pyspark).show()

+-------+----+----------+------+-----------+------------------+--------------+
|   Name| age|Experience|Salary|age_imputed|Experience_imputed|Salary_imputed|
+-------+----+----------+------+-----------+------------------+--------------+
|Michael|  25|        10|100000|         25|                10|        100000|
|    Tom|  26|         8|150000|         26|                 8|        150000|
| Antony|  27|         4|175000|         27|                 4|        175000|
| George|  28|         2|200000|         28|                 2|        200000|
|   Bill|  29|         5|250000|         29|                 5|        250000|
|   Mark|  30|         3|300000|         30|                 3|        300000|
|Patrick|  29|         4|350000|         29|                 4|        350000|
|   Zhan|NULL|      NULL|400000|         29|                 5|        400000|
|   NULL|  34|        10|450000|         34|                10|        450000|
|   NULL|  36|      NULL|  NULL|         36|        

**Filter Operations**

In [33]:
# Salary of the people less than or equal to 20000
df_pyspark.filter("Salary<=200000").show()

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|Michael| 25|        10|100000|
|    Tom| 26|         8|150000|
| Antony| 27|         4|175000|
| George| 28|         2|200000|
+-------+---+----------+------+



In [34]:
df_pyspark.filter("Salary<=200000").select(['Name','age']).show()

+-------+---+
|   Name|age|
+-------+---+
|Michael| 25|
|    Tom| 26|
| Antony| 27|
| George| 28|
+-------+---+



In [42]:
df_pyspark.filter((df_pyspark['Salary']<=200000) | 
                  (df_pyspark['Salary']>=150000)).show()

+-------+----+----------+------+
|   Name| age|Experience|Salary|
+-------+----+----------+------+
|Michael|  25|        10|100000|
|    Tom|  26|         8|150000|
| Antony|  27|         4|175000|
| George|  28|         2|200000|
|   Bill|  29|         5|250000|
|   Mark|  30|         3|300000|
|Patrick|  29|         4|350000|
|   Zhan|NULL|      NULL|400000|
|   NULL|  34|        10|450000|
+-------+----+----------+------+



In [44]:
df_pyspark.filter(~(df_pyspark['Salary']<=200000)).show()

+-------+----+----------+------+
|   Name| age|Experience|Salary|
+-------+----+----------+------+
|   Bill|  29|         5|250000|
|   Mark|  30|         3|300000|
|Patrick|  29|         4|350000|
|   Zhan|NULL|      NULL|400000|
|   NULL|  34|        10|450000|
+-------+----+----------+------+

