In this notebook, we are gonna add country and region data for our raw dataset

In [1]:
import pandas as pd
import geopandas as gpd
import pyspark

from shapely.geometry import Point
from pyspark.sql import SparkSession
from tqdm import tqdm

In [2]:
df = pd.read_csv('earthquake-data.csv')

  df = pd.read_csv('earthquake-data.csv')


In [3]:
df = df.head(10000)

In [4]:
df.sample(10)

Unnamed: 0,place,time,magnitude,lat,long,depth,alert,tsunami,tz,type
9961,"34km NNW of San Miguel Is., CA",-1057836973350,3.28,34.328333,-120.469167,6.0,,0,,earthquake
6799,"129km SE of Maneadero, B.C., MX",-917133902820,4.32,30.967667,-115.534667,6.0,,0,,earthquake
8183,"174 km ESE of Iwaki, Japan",-982853045700,6.9,36.487,142.709,15.0,,0,,earthquake
4359,"8km WSW of Frazier Park, CA",-782526308910,2.94,34.7945,-119.027667,6.0,,0,,earthquake
2652,"40km E of Barstow, CA",-713454613850,2.74,34.909333,-116.585667,6.0,,0,,earthquake
4164,"84 km NNW of Claveria, Philippines",-770450871140,6.07,19.353,120.9,35.0,,0,,earthquake
9839,Carlsberg Ridge,-1052622837730,,-3.309,67.374,15.0,,0,,earthquake
55,"12km N of Big Bear City, CA",-632585466690,3.09,34.369667,-116.862833,5.62,,0,,earthquake
1599,"27km E of Lake Isabella, CA",-678490933310,2.65,35.614667,-118.1765,6.0,,0,,earthquake
7156,"9km ENE of Joshua Tree, CA",-933613531910,4.37,34.152333,-116.216,6.0,,0,,earthquake


We will get our countries data in ne_10m_admin_0_countries.shp file

In [6]:
geometry = [Point(xy) for xy in zip(df['long'], df['lat'])]
gdf_points = gpd.GeoDataFrame(df, geometry=geometry, crs='EPSG:4326')

world = gpd.read_file('world-boundaries/ne_10m_admin_0_countries.shp') 

gdf_with_country = gpd.sjoin(gdf_points, world[['geometry', 'ADMIN', 'REGION_UN']].rename(columns={
    'ADMIN': 'country',
    'REGION_UN': 'region'
}), how='left', predicate='within')

In [7]:
gdf_with_country.sample(10)

Unnamed: 0,place,time,magnitude,lat,long,depth,alert,tsunami,tz,type,geometry,index_right,country,region
7371,"10km N of Lake View Terrace, CA",-943775871980,2.57,34.361,-118.354167,0.01,,0,,quarry blast,POINT (-118.35417 34.361),154.0,United States of America,Americas
7825,"12 km SW of Noshiro, Japan",-967917583440,,40.124,139.927,15.0,,0,,earthquake,POINT (139.927 40.124),,,
8201,"116 km E of Namie, Japan",-983113564700,7.7,37.382,142.305,30.0,,0,,earthquake,POINT (142.305 37.382),,,
547,"233 km NE of Saipan, Northern Mariana Islands",-646891365700,7.1,16.529,147.463,15.0,,0,,earthquake,POINT (147.463 16.529),,,
8519,"271 km SSW of Severo-Kuril’sk, Russia",-996885108950,5.98,48.328,155.082,35.0,,0,,earthquake,POINT (155.082 48.328),,,
8483,"5 km NNE of Manhattan, Montana",-995718600000,,45.9,-111.3,,,0,,earthquake,POINT (-111.3 45.9),154.0,United States of America,Americas
5915,"11km ESE of Cabazon, CA",-867375516770,3.07,33.871167,-116.678833,6.0,,0,,earthquake,POINT (-116.67883 33.87117),154.0,United States of America,Americas
4438,"Bonin Islands, Japan region",-786823137040,,28.13,140.263,385.0,,0,,earthquake,POINT (140.263 28.13),,,
2624,"35km WNW of Ludlow, CA",-712628214110,2.82,34.883333,-116.484167,6.0,,0,,earthquake,POINT (-116.48417 34.88333),154.0,United States of America,Americas
875,"27km N of Santa Barbara Is., CA",-656542238210,2.69,33.713667,-119.078833,6.0,,0,,earthquake,POINT (-119.07883 33.71367),,,


In [23]:
gdf_with_country.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   place        10000 non-null  object  
 1   time         10000 non-null  int64   
 2   magnitude    7635 non-null   float64 
 3   lat          10000 non-null  float64 
 4   long         10000 non-null  float64 
 5   depth        9837 non-null   float64 
 6   alert        0 non-null      object  
 7   tsunami      10000 non-null  int64   
 8   tz           0 non-null      float64 
 9   type         10000 non-null  object  
 10  geometry     10000 non-null  geometry
 11  index_right  6178 non-null   float64 
 12  country      6178 non-null   object  
 13  region       6178 non-null   object  
dtypes: float64(6), geometry(1), int64(2), object(5)
memory usage: 1.1+ MB


In [25]:
study_df = gdf_with_country[gdf_with_country.country.isnull()]

In [28]:
study_df

Unnamed: 0,place,time,magnitude,lat,long,depth,alert,tsunami,tz,type,geometry,index_right,country,region
5,south of the Fiji Islands,-631286334600,,-26.927,-176.566,15.0,,0,,earthquake,POINT (-176.566 -26.927),,,
7,"91 km NNE of Lasem, Indonesia",-631331600890,,-5.923,111.759,605.0,,0,,earthquake,POINT (111.759 -5.923),,,
8,Kermadec Islands region,-631351025130,,-27.889,-177.108,135.0,,0,,earthquake,POINT (-177.108 -27.889),,,
14,"33 km SSE of Adak, Alaska",-631448307640,,51.584,-176.509,35.0,,0,,earthquake,POINT (-176.509 51.584),,,
15,east of the South Sandwich Islands,-631460084000,,-59.169,-20.328,15.0,,0,,earthquake,POINT (-20.328 -59.169),,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9990,"139 km NNW of Finschhafen, Papua New Guinea",-1059060996230,,-5.443,147.253,150.0,,0,,earthquake,POINT (147.253 -5.443),,,
9992,"144 km WNW of Pariaman, Indonesia",-1059117808160,6.03,-0.172,98.898,35.0,,0,,earthquake,POINT (98.898 -0.172),,,
9993,Greenland Sea,-1059333702770,5.68,73.128,6.098,15.0,,0,,earthquake,POINT (6.098 73.128),,,
9994,Norwegian Sea,-1059336071260,5.44,72.678,1.764,15.0,,0,,earthquake,POINT (1.764 72.678),,,


In [27]:
study_df.isnull().sum()

place             0
time              0
magnitude      1501
lat               0
long              0
depth            41
alert          3822
tsunami           0
tz             3822
type              0
geometry          0
index_right    3822
country        3822
region         3822
dtype: int64

As we can see, there are still null country values even though there is no null latitude and longitude. From a small search, this is due to unclaimed territories or discrepancy in our reference file. We can also see that we can fill some as the country is written in their place column.

Generated country and region column in gdf_with_country dataframe. But a lot of null countries. Let us try to fill some up.

In [8]:
world[['ADMIN', 'REGION_UN']]

Unnamed: 0,ADMIN,REGION_UN
0,Indonesia,Asia
1,Malaysia,Asia
2,Chile,Americas
3,Bolivia,Americas
4,Peru,Americas
...,...,...
253,Macao S.A.R,Asia
254,Ashmore and Cartier Islands,Oceania
255,Bajo Nuevo Bank (Petrel Is.),Americas
256,Serranilla Bank,Americas


In [10]:
world_data = world[['ADMIN', 'REGION_UN']].to_json()
print(type(world_data))
print(world_data)

<class 'str'>
{"ADMIN":{"0":"Indonesia","1":"Malaysia","2":"Chile","3":"Bolivia","4":"Peru","5":"Argentina","6":"Dhekelia Sovereign Base Area","7":"Cyprus","8":"India","9":"China","10":"Israel","11":"Palestine","12":"Lebanon","13":"Ethiopia","14":"South Sudan","15":"Somalia","16":"Kenya","17":"Malawi","18":"United Republic of Tanzania","19":"Syria","20":"Somaliland","21":"France","22":"Suriname","23":"Guyana","24":"South Korea","25":"North Korea","26":"Morocco","27":"Western Sahara","28":"Costa Rica","29":"Nicaragua","30":"Republic of the Congo","31":"Democratic Republic of the Congo","32":"Bhutan","33":"Ukraine","34":"Belarus","35":"Namibia","36":"South Africa","37":"Saint Martin","38":"Sint Maarten","39":"Oman","40":"Uzbekistan","41":"Kazakhstan","42":"Tajikistan","43":"Lithuania","44":"Brazil","45":"Uruguay","46":"Mongolia","47":"Russia","48":"Czechia","49":"Germany","50":"Estonia","51":"Latvia","52":"Norway","53":"Sweden","54":"Finland","55":"Vietnam","56":"Cambodia","57":"Luxembou

In [11]:
import json
world_dict = json.loads(world_data)

In [12]:
world_df = pd.DataFrame.from_dict(world_dict, orient='columns')

In [14]:
world_df.columns = ['place_country', 'region']

In [15]:
world_df

Unnamed: 0,place_country,region
0,Indonesia,Asia
1,Malaysia,Asia
2,Chile,Americas
3,Bolivia,Americas
4,Peru,Americas
...,...,...
253,Macao S.A.R,Asia
254,Ashmore and Cartier Islands,Oceania
255,Bajo Nuevo Bank (Petrel Is.),Americas
256,Serranilla Bank,Americas


In [81]:
world_df.to_csv('world_data.csv', index='False')

Generated world_df dataframe that contains list of countries and region. Let's search for countries in the place column of gdf_with_country

In [16]:
world_df['country_lower'] = world_df['place_country'].str.lower()

In [17]:
def get_country_from_place(place, country_list):
    place = str(place).lower()
    for country in country_list:
        if country in place:
            return country
    return None

Let us copy a dataframe containing null values while keeping the index

In [56]:
with_null_df = gdf_with_country[gdf_with_country['country'].isnull()].copy()

In [57]:
with_null_df

Unnamed: 0,place,time,magnitude,lat,long,depth,alert,tsunami,tz,type,geometry,index_right,country,region
5,south of the Fiji Islands,-631286334600,,-26.927,-176.566,15.0,,0,,earthquake,POINT (-176.566 -26.927),,,
7,"91 km NNE of Lasem, Indonesia",-631331600890,,-5.923,111.759,605.0,,0,,earthquake,POINT (111.759 -5.923),,,
8,Kermadec Islands region,-631351025130,,-27.889,-177.108,135.0,,0,,earthquake,POINT (-177.108 -27.889),,,
14,"33 km SSE of Adak, Alaska",-631448307640,,51.584,-176.509,35.0,,0,,earthquake,POINT (-176.509 51.584),,,
15,east of the South Sandwich Islands,-631460084000,,-59.169,-20.328,15.0,,0,,earthquake,POINT (-20.328 -59.169),,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9990,"139 km NNW of Finschhafen, Papua New Guinea",-1059060996230,,-5.443,147.253,150.0,,0,,earthquake,POINT (147.253 -5.443),,,
9992,"144 km WNW of Pariaman, Indonesia",-1059117808160,6.03,-0.172,98.898,35.0,,0,,earthquake,POINT (98.898 -0.172),,,
9993,Greenland Sea,-1059333702770,5.68,73.128,6.098,15.0,,0,,earthquake,POINT (6.098 73.128),,,
9994,Norwegian Sea,-1059336071260,5.44,72.678,1.764,15.0,,0,,earthquake,POINT (1.764 72.678),,,


In [58]:
with_null_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 3822 entries, 5 to 9997
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   place        3822 non-null   object  
 1   time         3822 non-null   int64   
 2   magnitude    2321 non-null   float64 
 3   lat          3822 non-null   float64 
 4   long         3822 non-null   float64 
 5   depth        3781 non-null   float64 
 6   alert        0 non-null      object  
 7   tsunami      3822 non-null   int64   
 8   tz           0 non-null      float64 
 9   type         3822 non-null   object  
 10  geometry     3822 non-null   geometry
 11  index_right  0 non-null      float64 
 12  country      0 non-null      object  
 13  region       0 non-null      object  
dtypes: float64(6), geometry(1), int64(2), object(5)
memory usage: 447.9+ KB


In [59]:
country_list = world_df['country_lower'].tolist()
with_null_df['guess_country'] = with_null_df['place'].apply(lambda x: get_country_from_place(x, country_list))

with_null_df = with_null_df.reset_index().merge(world_df, how='left', left_on='guess_country', right_on='country_lower').set_index('index')

In [60]:
with_null_df

Unnamed: 0_level_0,place,time,magnitude,lat,long,depth,alert,tsunami,tz,type,geometry,index_right,country,region_x,guess_country,place_country,region_y,country_lower
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
5,south of the Fiji Islands,-631286334600,,-26.927,-176.566,15.0,,0,,earthquake,POINT (-176.566 -26.927),,,,fiji,Fiji,Oceania,fiji
7,"91 km NNE of Lasem, Indonesia",-631331600890,,-5.923,111.759,605.0,,0,,earthquake,POINT (111.759 -5.923),,,,indonesia,Indonesia,Asia,indonesia
8,Kermadec Islands region,-631351025130,,-27.889,-177.108,135.0,,0,,earthquake,POINT (-177.108 -27.889),,,,,,,
14,"33 km SSE of Adak, Alaska",-631448307640,,51.584,-176.509,35.0,,0,,earthquake,POINT (-176.509 51.584),,,,,,,
15,east of the South Sandwich Islands,-631460084000,,-59.169,-20.328,15.0,,0,,earthquake,POINT (-20.328 -59.169),,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9990,"139 km NNW of Finschhafen, Papua New Guinea",-1059060996230,,-5.443,147.253,150.0,,0,,earthquake,POINT (147.253 -5.443),,,,guinea,Guinea,Africa,guinea
9992,"144 km WNW of Pariaman, Indonesia",-1059117808160,6.03,-0.172,98.898,35.0,,0,,earthquake,POINT (98.898 -0.172),,,,indonesia,Indonesia,Asia,indonesia
9993,Greenland Sea,-1059333702770,5.68,73.128,6.098,15.0,,0,,earthquake,POINT (6.098 73.128),,,,greenland,Greenland,Americas,greenland
9994,Norwegian Sea,-1059336071260,5.44,72.678,1.764,15.0,,0,,earthquake,POINT (1.764 72.678),,,,,,,


In [61]:
with_null_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 3822 entries, 5 to 9997
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   place          3822 non-null   object  
 1   time           3822 non-null   int64   
 2   magnitude      2321 non-null   float64 
 3   lat            3822 non-null   float64 
 4   long           3822 non-null   float64 
 5   depth          3781 non-null   float64 
 6   alert          0 non-null      object  
 7   tsunami        3822 non-null   int64   
 8   tz             0 non-null      float64 
 9   type           3822 non-null   object  
 10  geometry       3822 non-null   geometry
 11  index_right    0 non-null      float64 
 12  country        0 non-null      object  
 13  region_x       0 non-null      object  
 14  guess_country  2460 non-null   object  
 15  place_country  2460 non-null   object  
 16  region_y       2460 non-null   object  
 17  country_lower  2460 non-null  

In [62]:
with_null_df.to_csv('with_null_df.csv')

Fill the generated dataframe into the gdf_with_country dataframe

In [64]:
gdf_with_country['country'] = gdf_with_country['country'].fillna(
    with_null_df['place_country'])

gdf_with_country['region'] = gdf_with_country['region'].fillna(
    with_null_df['region_y'])

In [65]:
gdf_with_country = gdf_with_country.drop(columns=['geometry', 'index_right'])

In [66]:
gdf_with_country

Unnamed: 0,place,time,magnitude,lat,long,depth,alert,tsunami,tz,type,country,region
0,"8km SSW of Lytle Creek, CA",-631157391770,2.58,34.191167,-117.522000,4.49,,0,,earthquake,United States of America,Americas
1,"24km WNW of Searles Valley, CA",-631215832260,2.01,35.859333,-117.650667,0.00,,0,,earthquake,United States of America,Americas
2,"28km N of El Sauzal, B.C., MX",-631241139690,3.30,32.143333,-116.628833,6.00,,0,,earthquake,Mexico,Americas
3,"1km SSW of Artesia, CA",-631251141040,1.83,33.856167,-118.089333,0.25,,0,,earthquake,United States of America,Americas
4,"16km SE of Primo Tapia, B.C., MX",-631284369930,3.02,32.113000,-116.806333,6.00,,0,,earthquake,Mexico,Americas
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,"4km SE of Marina del Rey, CA",-1059339147180,1.89,33.953333,-118.426667,6.00,,0,,earthquake,United States of America,Americas
9996,"12km SE of Idyllwild, CA",-1059352614360,2.28,33.657167,-116.644167,6.00,,0,,earthquake,United States of America,Americas
9997,northern Mid-Atlantic Ridge,-1059377901680,5.65,26.923000,-43.335000,15.00,,0,,earthquake,,
9998,"14km WNW of Castaic, CA",-1059395737860,2.40,34.543000,-118.757167,6.00,,0,,earthquake,United States of America,Americas


In [68]:
gdf_with_country.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   place      10000 non-null  object 
 1   time       10000 non-null  int64  
 2   magnitude  7635 non-null   float64
 3   lat        10000 non-null  float64
 4   long       10000 non-null  float64
 5   depth      9837 non-null   float64
 6   alert      0 non-null      object 
 7   tsunami    10000 non-null  int64  
 8   tz         0 non-null      float64
 9   type       10000 non-null  object 
 10  country    8638 non-null   object 
 11  region     8638 non-null   object 
dtypes: float64(5), int64(2), object(5)
memory usage: 1015.6+ KB


In [69]:
gdf_with_country.to_csv('earthquake-data-wth-countries.csv', index=False)

As we can see, the non-null values for country and region is increased from 6178 to 8638