# Cleaning DataFrames

* Much of the data that we manage needs to be continually cleaned
* This chapter is dedicated to techniques for cleaning our data for tasks like:
  * Reporting
  * Machine Learning
* Please open a tab that shows [the API for dealing with `null` values](https://spark.apache.org/docs/2.3.0/api/scala/index.html?org/apache/spark/sql/Dataset.html#org.apache.spark.sql.DataFrameNaFunctions)

## Bringing in a small `DataFrame`

Here we will bring in a small `DataFrame` of book data loaded from `books.csv`

In [180]:
import org.apache.spark.sql.types._
// Explicit schema for books.csv, one StructField per CSV column.
// Every column is declared with nullable = false, yet the schema printed below
// reports each of them as nullable = true once the file has been read.
val bookSchema = StructType(Seq(
  StructField("bookID", IntegerType, nullable = false),
  StructField("title", StringType, nullable = false),
  StructField("authors", StringType, nullable = false),
  StructField("average_rating", FloatType, nullable = false),
  StructField("isbn", StringType, nullable = false),
  StructField("isbn13", StringType, nullable = false),
  StructField("language_code", StringType, nullable = false),
  StructField("num_pages", IntegerType, nullable = false),
  StructField("ratings_count", IntegerType, nullable = false),
  StructField("text_reviews_count", IntegerType, nullable = false)
))

// Read the CSV with the schema above; the first line of the file is a header row.
val booksDF = spark.read
  .schema(bookSchema)
  .option("header", "true")
  .format("csv")
  .load("../data/books.csv")
booksDF.printSchema()

root
 |-- bookID: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- average_rating: float (nullable = true)
 |-- isbn: string (nullable = true)
 |-- isbn13: string (nullable = true)
 |-- language_code: string (nullable = true)
 |-- num_pages: integer (nullable = true)
 |-- ratings_count: integer (nullable = true)
 |-- text_reviews_count: integer (nullable = true)



import org.apache.spark.sql.types._
bookSchema: org.apache.spark.sql.types.StructType = StructType(StructField(bookID,IntegerType,false), StructField(title,StringType,false), StructField(authors,StringType,false), StructField(average_rating,FloatType,false), StructField(isbn,StringType,false), StructField(isbn13,StringType,false), StructField(language_code,StringType,false), StructField(num_pages,IntegerType,false), StructField(ratings_count,IntegerType,false), StructField(text_reviews_count,IntegerType,false))
booksDF: org.apache.spark.sql.DataFrame = [bookID: int, title: string ... 8 more fields]


In [181]:
// Show (up to 1000) rows that contain a null in at least one column.
booksDF.filter(row => row.anyNull).show(1000)

+------+-----+-------+--------------+----+------+-------------+---------+-------------+------------------+
|bookID|title|authors|average_rating|isbn|isbn13|language_code|num_pages|ratings_count|text_reviews_count|
+------+-----+-------+--------------+----+------+-------------+---------+-------------+------------------+
|  null| null|   null|          null|null|  null|         null|     null|         null|              null|
|  null| null|   null|          null|null|  null|         null|     null|         null|              null|
|  null| null|   null|          null|null|  null|         null|     null|         null|              null|
|  null| null|   null|          null|null|  null|         null|     null|         null|              null|
|  null| null|   null|          null|null|  null|         null|     null|         null|              null|
+------+-----+-------+--------------+----+------+-------------+---------+-------------+------------------+



In [182]:
// "any" drops a row as soon as one of its columns is null.
val cleanBooksDF = booksDF.na.drop("any")
// Display the first 100 remaining rows.
cleanBooksDF.show(100)

+------+--------------------+--------------------+--------------+----------+-------------+-------------+---------+-------------+------------------+
|bookID|               title|             authors|average_rating|      isbn|       isbn13|language_code|num_pages|ratings_count|text_reviews_count|
+------+--------------------+--------------------+--------------+----------+-------------+-------------+---------+-------------+------------------+
|     1|Harry Potter and ...|J.K. Rowling-Mary...|          4.56|0439785960|9780439785969|          eng|      652|      1944099|             26249|
|     2|Harry Potter and ...|J.K. Rowling-Mary...|          4.49|0439358078|9780439358071|          eng|      870|      1996446|             27613|
|     3|Harry Potter and ...|J.K. Rowling-Mary...|          4.47|0439554934|9780439554930|          eng|      320|      5629932|             70390|
|     4|Harry Potter and ...|        J.K. Rowling|          4.41|0439554896|9780439554893|          eng|      35




cleanBooksDF: org.apache.spark.sql.DataFrame = [bookID: int, title: string ... 8 more fields]


In [187]:
// Count the books per language code and print up to 100 groups.
cleanBooksDF
  .groupBy(cleanBooksDF("language_code"))
  .count()
  .show(100)

+-------------+-----+
|language_code|count|
+-------------+-----+
|          fre|  209|
|          zho|   16|
|          glg|    4|
|        en-CA|    9|
|          rus|    7|
|          nor|    1|
|          ale|    1|
|          cat|    3|
|          ara|    2|
|          por|   27|
|          lat|    3|
|          swe|    6|
|          gla|    1|
|          mul|   21|
|          eng|10594|
|          jpn|   64|
|           nl|    7|
|          grc|   12|
|          dan|    1|
|          srp|    1|
|        en-GB|  341|
|          heb|    1|
|          tur|    3|
|          enm|    3|
|          msa|    1|
|          wel|    1|
|          ita|   19|
|        en-US| 1699|
|          spa|  419|
|          ger|  238|
+-------------+-----+



In [195]:
// Fold the regional English codes into the plain "eng" code, touching only the
// language_code column.
val engCleanDF = cleanBooksDF.na.replace(
  "language_code",
  Map(
    "en-US" -> "eng",
    "en-CA" -> "eng",
    "en-GB" -> "eng"
  )
)
// Re-count per language code to confirm the regional variants were merged.
engCleanDF
  .groupBy("language_code")
  .count()
  .show(100)

+-------------+-----+
|language_code|count|
+-------------+-----+
|          fre|  209|
|          zho|   16|
|          glg|    4|
|          rus|    7|
|          nor|    1|
|          ale|    1|
|          cat|    3|
|          ara|    2|
|          por|   27|
|          lat|    3|
|          swe|    6|
|          gla|    1|
|          mul|   21|
|          eng|12643|
|          jpn|   64|
|           nl|    7|
|          grc|   12|
|          dan|    1|
|          srp|    1|
|          heb|    1|
|          tur|    3|
|          enm|    3|
|          msa|    1|
|          wel|    1|
|          ita|   19|
|          spa|  419|
|          ger|  238|
+-------------+-----+



engCleanDF: org.apache.spark.sql.DataFrame = [bookID: int, title: string ... 8 more fields]
