In [1]:
from pyspark.sql import SparkSession

# Obtain the Spark session used by every cell below. getOrCreate() reuses a
# session that is already running and only starts a new one when none exists.
spark = (
    SparkSession.builder
    .appName('Practise')
    .getOrCreate()
)

In [2]:
# Load the CSV a single time: the first line is treated as the header and the
# column types are inferred from the data. Reading once matters because
# inferSchema=True costs an extra pass over the file on every read.
#
# Equivalent reader-option form (this variant does not set inferSchema):
#   spark.read.option('header', 'true').csv("test3.csv")
df_pyspark = spark.read.csv("test3.csv", header=True, inferSchema=True)

# Display the frame that was just loaded instead of re-reading the file.
df_pyspark.show()

+--------+----+----------+------+
|    Name| age|Experience|Salary|
+--------+----+----------+------+
|   Krish|31.0|         5| 30000|
|Sudhansh|30.0|        10| 10000|
|   Sunny|29.0|         2| 15000|
|  Pierre|35.0|        10| 40000|
|  Benoit|55.0|        25| 75000|
|   Lucas|45.0|        20| 64000|
| Patrick|22.0|         2| 35000|
|    NULL|21.0|         5| 40000|
| Francis|NULL|      NULL| 75000|
|    NULL|NULL|      NULL|  NULL|
|    NULL|45.0|      NULL|  NULL|
+--------+----+----------+------+



## Différents types de suppression de NULL

In [3]:
# Drop every row that contains ONE OR MORE nulls.
# how="any" is the default; it is spelled out here to contrast with how="all".
df_pyspark.dropna(how="any").show()

+--------+----+----------+------+
|    Name| age|Experience|Salary|
+--------+----+----------+------+
|   Krish|31.0|         5| 30000|
|Sudhansh|30.0|        10| 10000|
|   Sunny|29.0|         2| 15000|
|  Pierre|35.0|        10| 40000|
|  Benoit|55.0|        25| 75000|
|   Lucas|45.0|        20| 64000|
| Patrick|22.0|         2| 35000|
+--------+----+----------+------+



In [4]:
# Drop a row only when ALL of its columns are null; rows that still hold at
# least one value are kept.
df_pyspark.dropna(how="all").show()

+--------+----+----------+------+
|    Name| age|Experience|Salary|
+--------+----+----------+------+
|   Krish|31.0|         5| 30000|
|Sudhansh|30.0|        10| 10000|
|   Sunny|29.0|         2| 15000|
|  Pierre|35.0|        10| 40000|
|  Benoit|55.0|        25| 75000|
|   Lucas|45.0|        20| 64000|
| Patrick|22.0|         2| 35000|
|    NULL|21.0|         5| 40000|
| Francis|NULL|      NULL| 75000|
|    NULL|45.0|      NULL|  NULL|
+--------+----+----------+------+



In [5]:
# Restrict the null check to the listed columns: a row is dropped when every
# column of the subset (here only 'Experience') is null, whatever the other
# columns contain.
(
    df_pyspark
    .dropna(how="all", subset=['Experience'])
    .show()
)

+--------+----+----------+------+
|    Name| age|Experience|Salary|
+--------+----+----------+------+
|   Krish|31.0|         5| 30000|
|Sudhansh|30.0|        10| 10000|
|   Sunny|29.0|         2| 15000|
|  Pierre|35.0|        10| 40000|
|  Benoit|55.0|        25| 75000|
|   Lucas|45.0|        20| 64000|
| Patrick|22.0|         2| 35000|
|    NULL|21.0|         5| 40000|
+--------+----+----------+------+



## Remplacement des valeurs NULL

In [6]:
# Replace nulls with a constant. The fill only applies to columns whose type
# matches the replacement value: the string fills the string column (Name),
# while the integer 0 fills the numeric columns and leaves Name untouched.
df_pyspark.fillna("Missing value").show()
df_pyspark.fillna(0).show()

+-------------+----+----------+------+
|         Name| age|Experience|Salary|
+-------------+----+----------+------+
|        Krish|31.0|         5| 30000|
|     Sudhansh|30.0|        10| 10000|
|        Sunny|29.0|         2| 15000|
|       Pierre|35.0|        10| 40000|
|       Benoit|55.0|        25| 75000|
|        Lucas|45.0|        20| 64000|
|      Patrick|22.0|         2| 35000|
|Missing value|21.0|         5| 40000|
|      Francis|NULL|      NULL| 75000|
|Missing value|NULL|      NULL|  NULL|
|Missing value|45.0|      NULL|  NULL|
+-------------+----+----------+------+

+--------+----+----------+------+
|    Name| age|Experience|Salary|
+--------+----+----------+------+
|   Krish|31.0|         5| 30000|
|Sudhansh|30.0|        10| 10000|
|   Sunny|29.0|         2| 15000|
|  Pierre|35.0|        10| 40000|
|  Benoit|55.0|        25| 75000|
|   Lucas|45.0|        20| 64000|
| Patrick|22.0|         2| 35000|
|    NULL|21.0|         5| 40000|
| Francis| 0.0|         0| 75000|
|    N

In [7]:
# Replace nulls with the integer 0, but only in the "age" and "Experience"
# columns; nulls in every other column are left as they are.
df_pyspark.fillna(0, subset=['age', "Experience"]).show()

+--------+----+----------+------+
|    Name| age|Experience|Salary|
+--------+----+----------+------+
|   Krish|31.0|         5| 30000|
|Sudhansh|30.0|        10| 10000|
|   Sunny|29.0|         2| 15000|
|  Pierre|35.0|        10| 40000|
|  Benoit|55.0|        25| 75000|
|   Lucas|45.0|        20| 64000|
| Patrick|22.0|         2| 35000|
|    NULL|21.0|         5| 40000|
| Francis| 0.0|         0| 75000|
|    NULL| 0.0|         0|  NULL|
|    NULL|45.0|         0|  NULL|
+--------+----+----------+------+



## Remplacement des valeurs NULL par imputation à la moyenne

In [8]:
from pyspark.ml.feature import Imputer

# Mean imputation over the three numeric columns.
imputer = Imputer(strategy='mean', inputCols=['age', 'Experience', 'Salary'])

# Each input column gets a matching "<column>_imputed" output column; the
# names are derived from the configured inputs so the two lists stay aligned.
# The result is re-assigned so the cell does not echo the estimator.
imputer = imputer.setOutputCols(
    ["{}_imputed".format(name) for name in imputer.getInputCols()]
)

In [9]:
# fit() computes the replacement value (the mean) of each input column;
# transform() then appends the *_imputed columns, in which nulls are replaced
# by that value while the original columns are left unchanged.
(
    imputer
    .fit(df_pyspark)
    .transform(df_pyspark)
    .show()
)

+--------+----+----------+------+-----------------+------------------+--------------+
|    Name| age|Experience|Salary|      age_imputed|Experience_imputed|Salary_imputed|
+--------+----+----------+------+-----------------+------------------+--------------+
|   Krish|31.0|         5| 30000|             31.0|                 5|         30000|
|Sudhansh|30.0|        10| 10000|             30.0|                10|         10000|
|   Sunny|29.0|         2| 15000|             29.0|                 2|         15000|
|  Pierre|35.0|        10| 40000|             35.0|                10|         40000|
|  Benoit|55.0|        25| 75000|             55.0|                25|         75000|
|   Lucas|45.0|        20| 64000|             45.0|                20|         64000|
| Patrick|22.0|         2| 35000|             22.0|                 2|         35000|
|    NULL|21.0|         5| 40000|             21.0|                 5|         40000|
| Francis|NULL|      NULL| 75000|34.77777777777778|   