In [40]:
import pyspark

from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_unixtime, to_timestamp

In [2]:
spark = SparkSession.builder.appName('earthquake-data-cleaning').getOrCreate()

25/07/26 11:44:42 WARN Utils: Your hostname, recurSe resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/07/26 11:44:43 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/07/26 11:44:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
df = spark.read.option('header', 'true').option('inferSchema', 'true').csv('earthquake-data.csv')

                                                                                

In [5]:
df.show()

+--------------------+-------------+---------+----------+------------+-----+-----+-------+----+----------+
|               place|         time|magnitude|       lat|        long|depth|alert|tsunami|  tz|      type|
+--------------------+-------------+---------+----------+------------+-----+-----+-------+----+----------+
|8km SSW of Lytle ...|-631157391770|     2.58|34.1911667|    -117.522| 4.49| null|      0|null|earthquake|
|24km WNW of Searl...|-631215832260|     2.01|35.8593333|-117.6506667|  0.0| null|      0|null|earthquake|
|28km N of El Sauz...|-631241139690|      3.3|32.1433333|-116.6288333|  6.0| null|      0|null|earthquake|
|1km SSW of Artesi...|-631251141040|     1.83|33.8561667|-118.0893333| 0.25| null|      0|null|earthquake|
|16km SE of Primo ...|-631284369930|     3.02|    32.113|-116.8063333|  6.0| null|      0|null|earthquake|
|south of the Fiji...|-631286334600|     null|   -26.927|    -176.566| 15.0| null|      0|null|earthquake|
|10 km SSW of Sawa...|-631292588380| 

In [6]:
df.printSchema()

root
 |-- place: string (nullable = true)
 |-- time: long (nullable = true)
 |-- magnitude: double (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- depth: double (nullable = true)
 |-- alert: string (nullable = true)
 |-- tsunami: integer (nullable = true)
 |-- tz: double (nullable = true)
 |-- type: string (nullable = true)



I have targetted the data cleaning procedures needed base on the data exploration I've done in pandas. First, let's convert the time column into a useful and readable format. Also, let's check if there are no invalid time or out of range time.

In [50]:
df_fmt_time = df.withColumn('earthquake_datetime', from_unixtime(col('time')/1000)). \
    withColumn('earthquake_datetime', to_timestamp('earthquake_datetime'))

In [51]:
df_fmt_time.printSchema()

root
 |-- place: string (nullable = true)
 |-- time: long (nullable = true)
 |-- magnitude: double (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- depth: double (nullable = true)
 |-- alert: string (nullable = true)
 |-- tsunami: integer (nullable = true)
 |-- tz: double (nullable = true)
 |-- type: string (nullable = true)
 |-- earthquake_datetime: timestamp (nullable = true)



In [53]:
df_fmt_time.filter((df_fmt_time.earthquake_datetime > datetime.fromisoformat('1900-01-01')) & (df_fmt_time.earthquake_datetime < datetime.now())).count()

                                                                                

4608354

The filtered dataframe has still the same counts as the original dataframe. The column is also timestamp. Let's now proceed to data cleaning.

In [54]:
df = df_fmt_time

In [57]:
df_clean = df.withColumnRenamed('lat', 'latitude').withColumnRenamed('long', 'longitude'). \
    select('place', 'earthquake_datetime', 'magnitude', 'latitude', 'longitude', 'depth', 'alert', 'tsunami', 'type'). \
    filter((df.magnitude >= -1) & (df.magnitude <= 10) & (df.magnitude.isNotNull())). \
    filter((df.lat >= -90) & (df.lat <= 90)). \
    filter((df.long >= -180) & (df.long <= 180)). \
    dropDuplicates(subset=['place', 'earthquake_datetime']). \
    na.fill({'depth': 0})

In [58]:
df_clean.count()

                                                                                

4431443

In [59]:
df_clean.show()



+--------------------+-------------------+---------+----------------+-----------------+----------------+-----+-------+----------+
|               place|earthquake_datetime|magnitude|        latitude|        longitude|           depth|alert|tsunami|      type|
+--------------------+-------------------+---------+----------------+-----------------+----------------+-----+-------+----------+
|0 km  of The Geys...|2024-10-27 19:39:16|     0.79| 38.779167175293|-122.757331848145| 1.0900000333786| null|      0|earthquake|
|0 km  of The Geys...|2024-11-12 02:15:22|     0.36|           38.78|-122.757833333333|            1.36| null|      0|earthquake|
|0 km  of The Geys...|2024-12-08 06:36:23|     1.08|38.7799987792969|-122.757667541504|1.36000001430511| null|      0|earthquake|
|0 km E of Alamo, ...|2011-01-30 20:50:21|      0.9|           37.85|         -122.027|           4.077| null|      0|earthquake|
|0 km E of Aldrans...|1993-03-30 09:48:08|      0.9|          47.249|           11.458|   

                                                                                

In [60]:
df_clean.printSchema()

root
 |-- place: string (nullable = true)
 |-- earthquake_datetime: timestamp (nullable = true)
 |-- magnitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- depth: double (nullable = false)
 |-- alert: string (nullable = true)
 |-- tsunami: integer (nullable = true)
 |-- type: string (nullable = true)



For the cleaning, we have converted the time (milliseconds from the epoch) to a useful and readable datetime format. We have also removed the tz column which has very small non-null values.<br>
We have removed null magnitudes and values that are out of range. Same for depth, latitude, and longitude. We have transformed null depth values to 0 and lastly, we dropped duplicate rows base on place and time.<br>
We can now stage the data to the warehouse and do some further dbt transformations to create data models.

Note: This is just trial cleaning so we can interactively watch the process. Official script is stored in the pipeline folder.