In this notebook, we are gonna add country and region data for our raw dataset

In [1]:
import pandas as pd
import geopandas as gpd
import pyspark
import re

from shapely.geometry import Point
from pyspark.sql import SparkSession
from tqdm import tqdm

In [2]:
df = pd.read_csv('earthquake-data.csv')

  df = pd.read_csv('earthquake-data.csv')


In [3]:
df.shape

(4608354, 10)

In [4]:
df = df.head(10000)

In [5]:
df.shape

(10000, 10)

In [6]:
df.sample(10)

Unnamed: 0,place,time,magnitude,lat,long,depth,alert,tsunami,tz,type
54,Kermadec Islands region,-632483599770,,-27.349,-177.154,35.0,,0,,earthquake
5417,"21km ESE of Olancha, CA",-841320100590,3.19,36.234667,-117.776833,6.0,,0,,earthquake
6083,"240 km WNW of Port McNeill, Canada",-876916830440,6.24,51.384,-130.267,15.0,,0,,earthquake
5843,"4 km NW of Cedar City, Utah",-862707120000,,37.7,-113.1,,,0,,earthquake
3303,"129 km ENE of Mawlaik, Myanmar",-735381755760,7.07,24.104,95.575,15.0,,0,,earthquake
8579,"73 km E of Iwaki, Japan",-999940493570,5.76,36.982,141.707,25.0,,0,,earthquake
4277,"32km SE of Avalon, CA",-778377795050,3.36,33.153,-118.062,6.0,,0,,earthquake
805,south of Tonga,-654048698830,,-25.083,-175.391,15.0,,0,,earthquake
6566,"27km ENE of Independence, CA",-901489320130,3.27,36.861833,-117.906833,6.0,,0,,earthquake
1860,"10 km SE of Naama, Algeria",-688060433980,5.41,33.194,-0.253,15.0,,0,,earthquake


We will get our countries data in ne_10m_admin_0_countries.shp file

In [7]:
geometry = [Point(xy) for xy in zip(df['long'], df['lat'])]
gdf_points = gpd.GeoDataFrame(df, geometry=geometry, crs='EPSG:4326')

world = gpd.read_file('world-boundaries/ne_10m_admin_0_countries.shp') 

gdf_with_country = gpd.sjoin(gdf_points, world[['geometry', 'ADMIN', 'REGION_UN']].rename(columns={
    'ADMIN': 'country',
    'REGION_UN': 'region'
}), how='left', predicate='within')

In [8]:
gdf_with_country.sample(10)

Unnamed: 0,place,time,magnitude,lat,long,depth,alert,tsunami,tz,type,geometry,index_right,country,region
1818,"2 km NNE of Kingsbury, Nevada",-686727220000,4.6,39.0,-119.9,,,0,,earthquake,POINT (-119.9 39),154.0,United States of America,Americas
9535,"34km W of San Miguel Is., CA",-1044497303300,2.89,34.066833,-120.735,6.0,,0,,earthquake,POINT (-120.735 34.06683),,,
7668,"63 km NNW of Kuqa, China",-962222878450,5.58,42.208,82.53,15.0,,0,,earthquake,POINT (82.53 42.208),9.0,China,Asia
5643,"115 km W of Hihifo, Tonga",-852925516220,6.3,-15.854,-174.874,15.0,,0,,earthquake,POINT (-174.874 -15.854),,,
3623,"18km WNW of Inyokern, CA",-746739288000,2.8,35.7,-118.0,6.0,,0,,earthquake,POINT (-118 35.7),154.0,United States of America,Americas
4054,"171 km W of Abepura, Indonesia",-764866039720,,-2.338,139.11,35.0,,0,,earthquake,POINT (139.11 -2.338),0.0,Indonesia,Asia
3239,"126 km S of Ierápetra, Greece",-732681321080,5.67,33.898,26.011,15.0,,0,,earthquake,POINT (26.011 33.898),,,
3972,"86 km NNW of Alianza Cristiana, Peru",-758874789430,,-2.75,-76.734,117.8,,0,,earthquake,POINT (-76.734 -2.75),4.0,Peru,Americas
7514,"9km E of Big Bear City, CA",-952740784070,2.29,34.2555,-116.746833,6.0,,0,,earthquake,POINT (-116.74683 34.2555),154.0,United States of America,Americas
8890,"4km WNW of Mead Valley, CA",-1015713782300,2.43,33.842167,-117.335167,6.0,,0,,earthquake,POINT (-117.33517 33.84217),154.0,United States of America,Americas


In [9]:
gdf_with_country.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   place        10000 non-null  object  
 1   time         10000 non-null  int64   
 2   magnitude    7635 non-null   float64 
 3   lat          10000 non-null  float64 
 4   long         10000 non-null  float64 
 5   depth        9837 non-null   float64 
 6   alert        0 non-null      object  
 7   tsunami      10000 non-null  int64   
 8   tz           0 non-null      float64 
 9   type         10000 non-null  object  
 10  geometry     10000 non-null  geometry
 11  index_right  6178 non-null   float64 
 12  country      6178 non-null   object  
 13  region       6178 non-null   object  
dtypes: float64(6), geometry(1), int64(2), object(5)
memory usage: 1.1+ MB


In [10]:
study_df = gdf_with_country[gdf_with_country.country.isnull()]

In [11]:
study_df

Unnamed: 0,place,time,magnitude,lat,long,depth,alert,tsunami,tz,type,geometry,index_right,country,region
5,south of the Fiji Islands,-631286334600,,-26.927,-176.566,15.0,,0,,earthquake,POINT (-176.566 -26.927),,,
7,"91 km NNE of Lasem, Indonesia",-631331600890,,-5.923,111.759,605.0,,0,,earthquake,POINT (111.759 -5.923),,,
8,Kermadec Islands region,-631351025130,,-27.889,-177.108,135.0,,0,,earthquake,POINT (-177.108 -27.889),,,
14,"33 km SSE of Adak, Alaska",-631448307640,,51.584,-176.509,35.0,,0,,earthquake,POINT (-176.509 51.584),,,
15,east of the South Sandwich Islands,-631460084000,,-59.169,-20.328,15.0,,0,,earthquake,POINT (-20.328 -59.169),,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9990,"139 km NNW of Finschhafen, Papua New Guinea",-1059060996230,,-5.443,147.253,150.0,,0,,earthquake,POINT (147.253 -5.443),,,
9992,"144 km WNW of Pariaman, Indonesia",-1059117808160,6.03,-0.172,98.898,35.0,,0,,earthquake,POINT (98.898 -0.172),,,
9993,Greenland Sea,-1059333702770,5.68,73.128,6.098,15.0,,0,,earthquake,POINT (6.098 73.128),,,
9994,Norwegian Sea,-1059336071260,5.44,72.678,1.764,15.0,,0,,earthquake,POINT (1.764 72.678),,,


In [12]:
study_df.isnull().sum()

place             0
time              0
magnitude      1501
lat               0
long              0
depth            41
alert          3822
tsunami           0
tz             3822
type              0
geometry          0
index_right    3822
country        3822
region         3822
dtype: int64

As we can see, there are still null country values even though there is no null latitude and longitude. From a small search, this is due to unclaimed territories or discrepancy in our reference file. We can also see that we can fill some as the country is written in their place column.

Generated country and region column in gdf_with_country dataframe. But a lot of null countries. Let us try to fill some up.

In [13]:
world[['ADMIN', 'REGION_UN']]

Unnamed: 0,ADMIN,REGION_UN
0,Indonesia,Asia
1,Malaysia,Asia
2,Chile,Americas
3,Bolivia,Americas
4,Peru,Americas
...,...,...
253,Macao S.A.R,Asia
254,Ashmore and Cartier Islands,Oceania
255,Bajo Nuevo Bank (Petrel Is.),Americas
256,Serranilla Bank,Americas


In [14]:
world_data = world[['ADMIN', 'REGION_UN']].to_json()
print(type(world_data))
print(world_data)

<class 'str'>
{"ADMIN":{"0":"Indonesia","1":"Malaysia","2":"Chile","3":"Bolivia","4":"Peru","5":"Argentina","6":"Dhekelia Sovereign Base Area","7":"Cyprus","8":"India","9":"China","10":"Israel","11":"Palestine","12":"Lebanon","13":"Ethiopia","14":"South Sudan","15":"Somalia","16":"Kenya","17":"Malawi","18":"United Republic of Tanzania","19":"Syria","20":"Somaliland","21":"France","22":"Suriname","23":"Guyana","24":"South Korea","25":"North Korea","26":"Morocco","27":"Western Sahara","28":"Costa Rica","29":"Nicaragua","30":"Republic of the Congo","31":"Democratic Republic of the Congo","32":"Bhutan","33":"Ukraine","34":"Belarus","35":"Namibia","36":"South Africa","37":"Saint Martin","38":"Sint Maarten","39":"Oman","40":"Uzbekistan","41":"Kazakhstan","42":"Tajikistan","43":"Lithuania","44":"Brazil","45":"Uruguay","46":"Mongolia","47":"Russia","48":"Czechia","49":"Germany","50":"Estonia","51":"Latvia","52":"Norway","53":"Sweden","54":"Finland","55":"Vietnam","56":"Cambodia","57":"Luxembou

In [15]:
import json
world_dict = json.loads(world_data)

In [16]:
world_df = pd.DataFrame.from_dict(world_dict, orient='columns')
world_df.loc[len(world_df)] = ['Alaska', 'Americas']

In [17]:
world_df.columns = ['place_country', 'region']

In [18]:
world_df

Unnamed: 0,place_country,region
0,Indonesia,Asia
1,Malaysia,Asia
2,Chile,Americas
3,Bolivia,Americas
4,Peru,Americas
...,...,...
254,Ashmore and Cartier Islands,Oceania
255,Bajo Nuevo Bank (Petrel Is.),Americas
256,Serranilla Bank,Americas
257,Scarborough Reef,Asia


In [12]:
world_df.to_csv('world_data.csv', index='False')

Generated world_df dataframe that contains list of countries and region. Let's search for countries in the place column of gdf_with_country

In [19]:
world_df['country_lower'] = world_df['place_country'].str.lower()

In [20]:
def get_country_from_place(place, country_list):
    place = str(place).lower()
    for country in country_list:
        pattern = r"\b" + re.escape(country.lower()) + r"\b"
        if re.search(pattern, place):
            return country
    return None

Let us copy a dataframe containing null values while keeping the index

In [21]:
with_null_df = gdf_with_country[gdf_with_country['country'].isnull()].copy()

In [22]:
with_null_df

Unnamed: 0,place,time,magnitude,lat,long,depth,alert,tsunami,tz,type,geometry,index_right,country,region
5,south of the Fiji Islands,-631286334600,,-26.927,-176.566,15.0,,0,,earthquake,POINT (-176.566 -26.927),,,
7,"91 km NNE of Lasem, Indonesia",-631331600890,,-5.923,111.759,605.0,,0,,earthquake,POINT (111.759 -5.923),,,
8,Kermadec Islands region,-631351025130,,-27.889,-177.108,135.0,,0,,earthquake,POINT (-177.108 -27.889),,,
14,"33 km SSE of Adak, Alaska",-631448307640,,51.584,-176.509,35.0,,0,,earthquake,POINT (-176.509 51.584),,,
15,east of the South Sandwich Islands,-631460084000,,-59.169,-20.328,15.0,,0,,earthquake,POINT (-20.328 -59.169),,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9990,"139 km NNW of Finschhafen, Papua New Guinea",-1059060996230,,-5.443,147.253,150.0,,0,,earthquake,POINT (147.253 -5.443),,,
9992,"144 km WNW of Pariaman, Indonesia",-1059117808160,6.03,-0.172,98.898,35.0,,0,,earthquake,POINT (98.898 -0.172),,,
9993,Greenland Sea,-1059333702770,5.68,73.128,6.098,15.0,,0,,earthquake,POINT (6.098 73.128),,,
9994,Norwegian Sea,-1059336071260,5.44,72.678,1.764,15.0,,0,,earthquake,POINT (1.764 72.678),,,


In [23]:
with_null_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 3822 entries, 5 to 9997
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   place        3822 non-null   object  
 1   time         3822 non-null   int64   
 2   magnitude    2321 non-null   float64 
 3   lat          3822 non-null   float64 
 4   long         3822 non-null   float64 
 5   depth        3781 non-null   float64 
 6   alert        0 non-null      object  
 7   tsunami      3822 non-null   int64   
 8   tz           0 non-null      float64 
 9   type         3822 non-null   object  
 10  geometry     3822 non-null   geometry
 11  index_right  0 non-null      float64 
 12  country      0 non-null      object  
 13  region       0 non-null      object  
dtypes: float64(6), geometry(1), int64(2), object(5)
memory usage: 447.9+ KB


In [24]:
country_list = world_df['country_lower'].tolist()
with_null_df['guess_country'] = with_null_df['place'].apply(lambda x: get_country_from_place(x, country_list))

with_null_df = with_null_df.reset_index().merge(world_df, how='left', left_on='guess_country', right_on='country_lower').set_index('index')

In [25]:
with_null_df

Unnamed: 0_level_0,place,time,magnitude,lat,long,depth,alert,tsunami,tz,type,geometry,index_right,country,region_x,guess_country,place_country,region_y,country_lower
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
5,south of the Fiji Islands,-631286334600,,-26.927,-176.566,15.0,,0,,earthquake,POINT (-176.566 -26.927),,,,fiji,Fiji,Oceania,fiji
7,"91 km NNE of Lasem, Indonesia",-631331600890,,-5.923,111.759,605.0,,0,,earthquake,POINT (111.759 -5.923),,,,indonesia,Indonesia,Asia,indonesia
8,Kermadec Islands region,-631351025130,,-27.889,-177.108,135.0,,0,,earthquake,POINT (-177.108 -27.889),,,,,,,
14,"33 km SSE of Adak, Alaska",-631448307640,,51.584,-176.509,35.0,,0,,earthquake,POINT (-176.509 51.584),,,,alaska,Alaska,Americas,alaska
15,east of the South Sandwich Islands,-631460084000,,-59.169,-20.328,15.0,,0,,earthquake,POINT (-20.328 -59.169),,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9990,"139 km NNW of Finschhafen, Papua New Guinea",-1059060996230,,-5.443,147.253,150.0,,0,,earthquake,POINT (147.253 -5.443),,,,guinea,Guinea,Africa,guinea
9992,"144 km WNW of Pariaman, Indonesia",-1059117808160,6.03,-0.172,98.898,35.0,,0,,earthquake,POINT (98.898 -0.172),,,,indonesia,Indonesia,Asia,indonesia
9993,Greenland Sea,-1059333702770,5.68,73.128,6.098,15.0,,0,,earthquake,POINT (6.098 73.128),,,,greenland,Greenland,Americas,greenland
9994,Norwegian Sea,-1059336071260,5.44,72.678,1.764,15.0,,0,,earthquake,POINT (1.764 72.678),,,,,,,


In [26]:
with_null_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 3822 entries, 5 to 9997
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   place          3822 non-null   object  
 1   time           3822 non-null   int64   
 2   magnitude      2321 non-null   float64 
 3   lat            3822 non-null   float64 
 4   long           3822 non-null   float64 
 5   depth          3781 non-null   float64 
 6   alert          0 non-null      object  
 7   tsunami        3822 non-null   int64   
 8   tz             0 non-null      float64 
 9   type           3822 non-null   object  
 10  geometry       3822 non-null   geometry
 11  index_right    0 non-null      float64 
 12  country        0 non-null      object  
 13  region_x       0 non-null      object  
 14  guess_country  2571 non-null   object  
 15  place_country  2571 non-null   object  
 16  region_y       2571 non-null   object  
 17  country_lower  2571 non-null  

In [27]:
with_null_df.to_csv('with_null_df.csv')

Fill the generated dataframe into the gdf_with_country dataframe

In [28]:
gdf_with_country['country'] = gdf_with_country['country'].fillna(
    with_null_df['place_country'])

gdf_with_country['region'] = gdf_with_country['region'].fillna(
    with_null_df['region_y'])

In [29]:
gdf_with_country = gdf_with_country.drop(columns=['geometry', 'index_right'])

In [30]:
gdf_with_country

Unnamed: 0,place,time,magnitude,lat,long,depth,alert,tsunami,tz,type,country,region
0,"8km SSW of Lytle Creek, CA",-631157391770,2.58,34.191167,-117.522000,4.49,,0,,earthquake,United States of America,Americas
1,"24km WNW of Searles Valley, CA",-631215832260,2.01,35.859333,-117.650667,0.00,,0,,earthquake,United States of America,Americas
2,"28km N of El Sauzal, B.C., MX",-631241139690,3.30,32.143333,-116.628833,6.00,,0,,earthquake,Mexico,Americas
3,"1km SSW of Artesia, CA",-631251141040,1.83,33.856167,-118.089333,0.25,,0,,earthquake,United States of America,Americas
4,"16km SE of Primo Tapia, B.C., MX",-631284369930,3.02,32.113000,-116.806333,6.00,,0,,earthquake,Mexico,Americas
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,"4km SE of Marina del Rey, CA",-1059339147180,1.89,33.953333,-118.426667,6.00,,0,,earthquake,United States of America,Americas
9996,"12km SE of Idyllwild, CA",-1059352614360,2.28,33.657167,-116.644167,6.00,,0,,earthquake,United States of America,Americas
9997,northern Mid-Atlantic Ridge,-1059377901680,5.65,26.923000,-43.335000,15.00,,0,,earthquake,,
9998,"14km WNW of Castaic, CA",-1059395737860,2.40,34.543000,-118.757167,6.00,,0,,earthquake,United States of America,Americas


In [31]:
gdf_with_country.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   place      10000 non-null  object 
 1   time       10000 non-null  int64  
 2   magnitude  7635 non-null   float64
 3   lat        10000 non-null  float64
 4   long       10000 non-null  float64
 5   depth      9837 non-null   float64
 6   alert      0 non-null      object 
 7   tsunami    10000 non-null  int64  
 8   tz         0 non-null      float64
 9   type       10000 non-null  object 
 10  country    8749 non-null   object 
 11  region     8749 non-null   object 
dtypes: float64(5), int64(2), object(5)
memory usage: 1015.6+ KB


In [32]:
gdf_with_country.to_csv('earthquake-data-wth-countries.csv', index=False)

As we can see, the non-null values for country and region is increased from 6178 to 8749