In [7]:
import sys
import pandas as pd
import numpy as np
import seaborn as sns

from matplotlib import pyplot as plt
%matplotlib inline
# Show wide/long frames in full: raise both display limits to 500.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, 500)

In [8]:
# Record the interpreter and library versions this notebook was run with.
version_report = [
    ("python version is: ", sys.version),
    ("pandas version is: ", pd.__version__),
    ("numpy version is: ", np.__version__),
    ("seaborn version is: ", sns.__version__),
]
for label, version in version_report:
    print(label, version)

python version is:  3.6.3 |Anaconda custom (64-bit)| (default, Oct 13 2017, 12:02:49) 
[GCC 7.2.0]
pandas version is:  0.22.0
numpy version is:  1.14.0
seaborn version is:  0.8.0


In [9]:
# Load the earthquake catalogue.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The saved output of this cell shows a warning
# being raised on load -- presumably pandas' mixed-type DtypeWarning (the
# message itself is truncated; confirm on re-run).
df = pd.read_csv("all_earthquakes.csv", low_memory=False)
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,depth,depthError,dmin,gap,horizontalError,id,latitude,locationSource,longitude,mag,magError,magNst,magSource,magType,source_network,num_of_nw_stations,place,rms,status,time,type,updated,country
0,28.9,0.3,,,0.2,ak11715681,61.3325,ak,-147.9983,1.2,,,ak,ml,ak,,"59km ESE of Butte, Alaska",0.45,reviewed,2015-09-19 23:59:29.000,earthquake,2015-09-24T23:57:41.501Z,United States of America
1,16.36,0.88,0.05199,46.0,0.36,ci37245567,34.392667,ci,-118.980833,1.36,0.052,15.0,ci,ml,ci,38.0,"6km W of Fillmore, CA",0.29,reviewed,2015-09-19 23:57:55.660,earthquake,2016-03-11T06:37:02.297Z,United States of America
2,12.3,0.4,,,0.3,ak11715669,61.6439,ak,-151.352,0.8,,,ak,ml,ak,,"70km W of Willow, Alaska",0.51,reviewed,2015-09-19 23:55:06.000,earthquake,2015-09-28T17:09:49.203Z,United States of America
3,7.087,0.45,0.07658,38.0,0.17,nc72523765,39.235,nc,-123.209333,1.83,0.145,27.0,nc,md,nc,29.0,"3km S of Redwood Valley, California",0.06,reviewed,2015-09-19 23:35:53.740,earthquake,2017-02-08T11:43:43.779Z,United States of America
4,10.92,0.52,0.05772,96.0,0.3,ci37245559,33.495667,ci,-116.486333,0.96,0.166,26.0,ci,ml,ci,31.0,"19km ESE of Anza, CA",0.2,reviewed,2015-09-19 23:33:17.180,earthquake,2016-03-11T02:57:36.685Z,United States of America


`time` - Time when the event occurred. Times are reported in milliseconds since the epoch ( 1970-01-01T00:00:00.000Z), and do not include leap seconds. In certain output formats, the date is formatted for readability.

`latitude` - Decimal degrees latitude. Negative values for southern latitudes.

`longitude` - Decimal degrees longitude. Negative values for western longitudes.

`depth` - Depth of the event in kilometers.

`mag` - magnitude of earthquake

`magType` - The method or algorithm used to calculate the preferred magnitude for the event.

`nst` - The total number of seismic stations that reported P- and S-arrival times for this earthquake.

`gap` - gap between azimuthally adjacent stations (in degrees). In general, the smaller this number, the more reliable is the calculated horizontal position of the earthquake.

`dmin` - Horizontal distance from the epicenter to the nearest station (in degrees). 1 degree is approximately 111.2 kilometers. In general, the smaller this number, the more reliable is the calculated depth of the earthquake.

`rms` - This parameter provides a measure of the fit of the observed arrival times to the predicted arrival times for this location. Smaller numbers reflect a better fit of the data.

`net` - Identifies the network considered to be the preferred source of information for this event.

`id` - A code consisting of source, type, code, updateTime. Eg: us20002wt7

`updated` - Time when the event was most recently updated.

`place` - Textual description of named geographic region near to the event

`type` - Type of seismic event

### Takeaways

- earthquake epicenters are estimated by a form of signal triangulation. https://www.youtube.com/watch?v=dx4OqT0PYnU
- The more stations that record an earthquake, the more accurate the estimates of its coordinates and depth will be.
`nst` captures this information.

#### Cleaning up the country names

In [10]:
from config import *
if 'country' not in df:
    # Derive a `country` column for every event:
    #   1. point-in-polygon lookup of (longitude, latitude) against country
    #      outlines downloaded as GeoJSON;
    #   2. for events inside no polygon ("unknown"), parse the region name out
    #      of the free-text `place` column;
    #   3. normalise the resulting names through `place_mapping`.
    # The whole cell is skipped when the loaded frame already has `country`.
    # source - https://stackoverflow.com/a/46589405/170005
    from shapely.geometry import mapping, shape
    from shapely.prepared import prep
    from shapely.geometry import Point
    import requests

    # Fail fast on a hung connection or an HTTP error page instead of blocking
    # forever or trying to JSON-decode an error response.
    response = requests.get(
        "https://raw.githubusercontent.com/datasets/geo-countries/master/data/countries.geojson",
        timeout=60,
    )
    response.raise_for_status()
    data = response.json()

    # Country name -> prepared geometry (prepared geometries speed up the
    # repeated `contains` tests below).
    countries = {
        feature["properties"]["ADMIN"]: prep(shape(feature["geometry"]))
        for feature in data["features"]
    }

    print(len(countries))

    def get_country(lon, lat):
        """Return the name of the first country polygon containing the point.

        Returns the string "unknown" when no polygon contains (lon, lat).
        """
        point = Point(lon, lat)
        for country, geom in countries.items():
            if geom.contains(point):
                return country

        return "unknown"

    print("This takes a long time to execute. You've been warned.")
    # Iterate over the two coordinate columns directly rather than building a
    # full row Series per event with df.apply(axis=1).
    df['country'] = [
        get_country(lon, lat)
        for lon, lat in zip(df['longitude'], df['latitude'])
    ]

    # Phrases stripped from the region name, applied in this order. Longer,
    # more specific phrases must precede their shorter prefixes
    # (e.g. 'Off the coast of southern ' before 'Off the coast of ').
    # NOTE(review): the original list was missing the comma after 'South ',
    # so Python fused it with the next literal into 'South West of ', which
    # can never match a capitalize()d string. With the comma restored,
    # 'South ' is now actually stripped -- re-check `place_mapping` entries
    # for names that begin with "South ".
    region_noise_phrases = [
        ' region', 'North of the ', 'Southern ',
        'Offshore ', 'Off the coast of central ',
        'Off the coast of southern ', 'South of the ',
        'Off the coast of ', 'Northern', 'South ',
        'West of ', ' peninsula', 'East of the ',
        'Near the coast of ', ' - reunion',
        'Southwest of ', 'South of ',
        'Near the east coast of ', 'Central ',
        'North of ', 'Southern ', 'West of ',
        'Southeastern ', 'Northwest of the '
    ]

    def extract_region(v):
        """Reduce a `place` description to a title-cased region name.

        Takes the text after the last comma, capitalizes it (first letter
        upper, rest lower), removes the noise phrases above, drops the words
        'of', 'the' and 'in', and capitalizes each remaining word.
        """
        place = v.split(',')[-1].strip().capitalize()
        for phrase in region_noise_phrases:
            place = place.replace(phrase, '').strip()
        return ' '.join([p.capitalize() for p in place.split()
                         if p not in ('of', 'the', 'in')])

    # Only events that matched no country polygon fall back to `place` parsing.
    mask = (df.country == 'unknown')
    df.loc[mask, 'country'] = df.loc[mask, 'place'].fillna('').apply(extract_region)
    # `place_mapping` is not defined in this cell; presumably it is provided by
    # the star import from `config` -- verify.
    df.loc[:, 'country'] = df.country.str.strip().replace(place_mapping).str.strip()

In [11]:
# Preview the frame, including the `country` column.
# (`DataFrame.set_index()` requires a `keys` argument and raises TypeError
# when called with none; the saved output of this cell is a plain preview.)
df.head()

Unnamed: 0,depth,depthError,dmin,gap,horizontalError,id,latitude,locationSource,longitude,mag,magError,magNst,magSource,magType,source_network,num_of_nw_stations,place,rms,status,time,type,updated,country
0,28.9,0.3,,,0.2,ak11715681,61.3325,ak,-147.9983,1.2,,,ak,ml,ak,,"59km ESE of Butte, Alaska",0.45,reviewed,2015-09-19 23:59:29.000,earthquake,2015-09-24T23:57:41.501Z,United States of America
1,16.36,0.88,0.05199,46.0,0.36,ci37245567,34.392667,ci,-118.980833,1.36,0.052,15.0,ci,ml,ci,38.0,"6km W of Fillmore, CA",0.29,reviewed,2015-09-19 23:57:55.660,earthquake,2016-03-11T06:37:02.297Z,United States of America
2,12.3,0.4,,,0.3,ak11715669,61.6439,ak,-151.352,0.8,,,ak,ml,ak,,"70km W of Willow, Alaska",0.51,reviewed,2015-09-19 23:55:06.000,earthquake,2015-09-28T17:09:49.203Z,United States of America
3,7.087,0.45,0.07658,38.0,0.17,nc72523765,39.235,nc,-123.209333,1.83,0.145,27.0,nc,md,nc,29.0,"3km S of Redwood Valley, California",0.06,reviewed,2015-09-19 23:35:53.740,earthquake,2017-02-08T11:43:43.779Z,United States of America
4,10.92,0.52,0.05772,96.0,0.3,ci37245559,33.495667,ci,-116.486333,0.96,0.166,26.0,ci,ml,ci,31.0,"19km ESE of Anza, CA",0.2,reviewed,2015-09-19 23:33:17.180,earthquake,2016-03-11T02:57:36.685Z,United States of America
