## Work with a sample JSON for quake data in WA and OR 

Learn how to work with the USGS ComCat formats

In [53]:

import json
import pandas as pd
from pprint import pprint

In [2]:
with open("WA-OR-catalog.json") as json_file:
    json_data = json.load(json_file)

In [3]:
type(json_data)

dict

In [51]:
json_data

{u'bbox': [-126.8669, 42.3306, 3.64, -118.5953333, 48.7741667, 56.45],
 u'features': [{u'geometry': {u'coordinates': [-122.6071667,
     48.7741667,
     16.27],
    u'type': u'Point'},
   u'id': u'uw61297016',
   u'properties': {u'alert': None,
    u'cdi': 3.8,
    u'code': u'61297016',
    u'detail': u'https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=uw61297016&format=geojson',
    u'dmin': 0.1771,
    u'felt': 286,
    u'gap': 99,
    u'ids': u',uw61297016,us1000955h,',
    u'mag': 3.1,
    u'magType': u'ml',
    u'mmi': 2.9,
    u'net': u'uw',
    u'nst': 31,
    u'place': u'4km S of Marietta, Washington',
    u'rms': 0.21,
    u'sig': 257,
    u'sources': u',uw,us,',
    u'status': u'reviewed',
    u'time': 1498725603370,
    u'title': u'M 3.1 - 4km S of Marietta, Washington',
    u'tsunami': 0,
    u'type': u'earthquake',
    u'types': u',dyfi,geoserve,origin,phase-data,shakemap,',
    u'tz': -480,
    u'updated': 1500447547040,
    u'url': u'https://earthquake.usgs.gov/e

In [12]:
json_data['bbox']

[-126.8669, 42.3306, 3.64, -118.5953333, 48.7741667, 56.45]

In [28]:
json_data['metadata']['count']

30

In [61]:
event_list = []
for i in range(json_data['metadata']['count']):
    event_line = []
    event_line.append(json_data['features'][i]['id'])
    event_line += json_data['features'][i]['geometry']['coordinates']
    event_line.append(json_data['features'][i]['properties']['alert']),
    event_line.append(json_data['features'][i]['properties']['cdi']),
    event_line.append(json_data['features'][i]['properties']['code']),
    event_line.append(json_data['features'][i]['properties']['felt']),
    event_line.append(json_data['features'][i]['properties']['gap']),
    event_line.append(json_data['features'][i]['properties']['ids'][1:-1]),
    event_line.append(json_data['features'][i]['properties']['mag']),
    event_line.append(json_data['features'][i]['properties']['magType']),
    event_line.append(json_data['features'][i]['properties']['mmi']),
    event_line.append(json_data['features'][i]['properties']['net']),
    event_line.append(json_data['features'][i]['properties']['nst']),
    event_line.append(json_data['features'][i]['properties']['place']),
    event_line.append(json_data['features'][i]['properties']['rms']),
    event_line.append(json_data['features'][i]['properties']['sig']),
    event_line.append(json_data['features'][i]['properties']['sources']),
    event_line.append(json_data['features'][i]['properties']['status']),
    event_line.append(json_data['features'][i]['properties']['time']),
    event_line.append(json_data['features'][i]['properties']['title']),
    event_line.append(json_data['features'][i]['properties']['tsunami']),
    event_line.append(json_data['features'][i]['properties']['type']),
    event_line.append(json_data['features'][i]['properties']['types'][1:-1]),
    event_line.append(json_data['features'][i]['properties']['tz']),
    event_line.append(json_data['features'][i]['properties']['updated']),
    event_line.append(json_data['features'][i]['properties']['url']),

    event_list.append(event_line)
event_list    

[[u'uw61297016',
  -122.6071667,
  48.7741667,
  16.27,
  None,
  3.8,
  u'61297016',
  286,
  99,
  u'uw61297016,us1000955h',
  3.1,
  u'ml',
  2.9,
  u'uw',
  31,
  u'4km S of Marietta, Washington',
  0.21,
  257,
  u',uw,us,',
  u'reviewed',
  1498725603370,
  u'M 3.1 - 4km S of Marietta, Washington',
  0,
  u'earthquake',
  u'dyfi,geoserve,origin,phase-data,shakemap',
  -480,
  1500447547040,
  u'https://earthquake.usgs.gov/earthquakes/eventpage/uw61297016'],
 [u'uw61276967',
  -121.6675,
  48.2565,
  8.47,
  None,
  3.8,
  u'61276967',
  31,
  74,
  u'uw61276967,us20009pki',
  2.92,
  u'ml',
  None,
  u'uw',
  24,
  u'4km W of Darrington, Washington',
  0.32,
  143,
  u',uw,us,',
  u'reviewed',
  1498361114610,
  u'M 2.9 - 4km W of Darrington, Washington',
  0,
  u'earthquake',
  u'dyfi,geoserve,origin,phase-data',
  -480,
  1501131675040,
  u'https://earthquake.usgs.gov/earthquakes/eventpage/uw61276967'],
 [u'uw61293181',
  -122.8856667,
  47.8403333,
  56.45,
  None,
  3.8,
  u'

In [62]:
cols = ['id', 'lat', 'long', 'depth', 'alert', 'cdi', 'code', 'felt', 'gap', 'ids', 'mag', 'magType', \
        'mmi', 'net', 'nst', 'place', 'rms', 'sig', 'sources', 'status', 'time', 'title', 'tsunami', \
        'type', 'types', 'tz', 'updated', 'url']
quakes = pd.DataFrame(event_list, columns = cols)

In [64]:
quakes.types

0              dyfi,geoserve,origin,phase-data,shakemap
1                       dyfi,geoserve,origin,phase-data
2     dyfi,geoserve,moment-tensor,origin,phase-data,...
3              dyfi,geoserve,origin,phase-data,shakemap
4                       dyfi,geoserve,origin,phase-data
5                       dyfi,geoserve,origin,phase-data
6              dyfi,geoserve,origin,phase-data,shakemap
7                       dyfi,geoserve,origin,phase-data
8              dyfi,geoserve,origin,phase-data,shakemap
9              dyfi,geoserve,origin,phase-data,shakemap
10             dyfi,geoserve,origin,phase-data,shakemap
11             dyfi,geoserve,origin,phase-data,shakemap
12                      dyfi,geoserve,origin,phase-data
13             dyfi,geoserve,origin,phase-data,shakemap
14                      dyfi,geoserve,origin,phase-data
15             dyfi,geoserve,origin,phase-data,shakemap
16                      dyfi,geoserve,origin,phase-data
17             dyfi,geoserve,origin,phase-data,s

In [65]:
quakes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 28 columns):
id         30 non-null object
lat        30 non-null float64
long       30 non-null float64
depth      30 non-null float64
alert      1 non-null object
cdi        30 non-null float64
code       30 non-null object
felt       30 non-null int64
gap        30 non-null int64
ids        30 non-null object
mag        30 non-null float64
magType    30 non-null object
mmi        17 non-null float64
net        30 non-null object
nst        25 non-null float64
place      30 non-null object
rms        30 non-null float64
sig        30 non-null int64
sources    30 non-null object
status     30 non-null object
time       30 non-null int64
title      30 non-null object
tsunami    30 non-null int64
type       30 non-null object
types      30 non-null object
tz         30 non-null int64
updated    30 non-null int64
url        30 non-null object
dtypes: float64(8), int64(7), object(13)
memory usage: 6.

In [67]:
quakes.types.unique()


array([u'dyfi,geoserve,origin,phase-data,shakemap',
       u'dyfi,geoserve,origin,phase-data',
       u'dyfi,geoserve,moment-tensor,origin,phase-data,shakemap',
       u'dyfi,geoserve,impact-link,losspager,moment-tensor,origin,phase-data,shakemap'], dtype=object)

In [68]:
quakes

Unnamed: 0,id,lat,long,depth,alert,cdi,code,felt,gap,ids,...,sources,status,time,title,tsunami,type,types,tz,updated,url
0,uw61297016,-122.607167,48.774167,16.27,,3.8,61297016,286,99,"uw61297016,us1000955h",...,",uw,us,",reviewed,1498725603370,"M 3.1 - 4km S of Marietta, Washington",0,earthquake,"dyfi,geoserve,origin,phase-data,shakemap",-480,1500447547040,https://earthquake.usgs.gov/earthquakes/eventp...
1,uw61276967,-121.6675,48.2565,8.47,,3.8,61276967,31,74,"uw61276967,us20009pki",...,",uw,us,",reviewed,1498361114610,"M 2.9 - 4km W of Darrington, Washington",0,earthquake,"dyfi,geoserve,origin,phase-data",-480,1501131675040,https://earthquake.usgs.gov/earthquakes/eventp...
2,uw61293181,-122.885667,47.840333,56.45,,3.8,61293181,156,31,"us20009mws,uw61293181",...,",us,uw,",reviewed,1497585325800,"M 3.7 - 17km WSW of Port Ludlow, Washington",0,earthquake,"dyfi,geoserve,moment-tensor,origin,phase-data,...",-480,1500163839749,https://earthquake.usgs.gov/earthquakes/eventp...
3,uw61272097,-123.0955,45.057667,22.48,,4.1,61272097,118,60,"uw61272097,us10008x4t",...,",uw,us,",reviewed,1496332977740,"M 3.0 - 9km NW of Keizer, Oregon",0,earthquake,"dyfi,geoserve,origin,phase-data,shakemap",-480,1498847650040,https://earthquake.usgs.gov/earthquakes/eventp...
4,uw61268327,-122.581833,47.5815,24.03,,3.4,61268327,18,37,"uw61268327,us10008u20",...,",uw,us,",reviewed,1495399431280,"M 2.8 - 1km ESE of Enetai, Washington",0,earthquake,"dyfi,geoserve,origin,phase-data",-480,1498314661040,https://earthquake.usgs.gov/earthquakes/eventp...
5,uw61267487,-122.589167,47.593167,25.11,,2.7,61267487,8,36,"uw61267487,us10008tdr",...,",uw,us,",reviewed,1495131332560,"M 2.5 - 1km NE of Enetai, Washington",0,earthquake,"dyfi,geoserve,origin,phase-data",-480,1497540178040,https://earthquake.usgs.gov/earthquakes/eventp...
6,uw61265222,-122.466333,47.961667,23.67,,4.1,61265222,594,55,"uw61265222,us10008rit",...,",uw,us,",reviewed,1494525395950,"M 3.4 - 6km SE of Freeland, Washington",0,earthquake,"dyfi,geoserve,origin,phase-data,shakemap",-480,1496027875574,https://earthquake.usgs.gov/earthquakes/eventp...
7,uw61265082,-122.564667,47.589667,21.55,,3.4,61265082,31,37,"uw61265082,us10008rgc",...,",uw,us,",reviewed,1494495396720,"M 2.6 - 2km ENE of Enetai, Washington",0,earthquake,"dyfi,geoserve,origin,phase-data",-480,1497340038040,https://earthquake.usgs.gov/earthquakes/eventp...
8,uw61265017,-122.584167,47.588667,24.06,,4.5,61265017,946,37,"us10008rey,uw61265017",...,",us,uw,",reviewed,1494488124720,"M 3.5 - 1km ENE of Enetai, Washington",0,earthquake,"dyfi,geoserve,origin,phase-data,shakemap",-480,1496219077040,https://earthquake.usgs.gov/earthquakes/eventp...
9,uw61264612,-122.584,47.579167,24.5,,3.6,61264612,795,37,"uw61264612,us10008r2n",...,",uw,us,",reviewed,1494404072090,"M 3.4 - 1km ESE of Enetai, Washington",0,earthquake,"dyfi,geoserve,origin,phase-data,shakemap",-480,1497337558040,https://earthquake.usgs.gov/earthquakes/eventp...


## Pull actual data for continental US using USGS API.  

This was done in sections, by editing and re-running the code below

In [76]:
import requests

In [5]:
# This is the ComCat reqeust to return earthquakes in WA and OR by lat long box, with magnitude > 2.5 
# I did not limit it to dyfi (did you feel it) events, because I want to also examine any changes in 
# quake frenqency.  I pulled all data from 1970 to end of June 2017.


In [239]:
wc_url = "https://earthquake.usgs.gov/fdsnws/event/1/query?format=geojson&starttime=1970-01-01&endtime=1974-12-31 \
&minlatitude=25&maxlatitude=49&minlongitude=-130&maxlongitude=-100\
&eventtype=earthquake&minmagnitude=2.5"

In [147]:
ec_url = "https://earthquake.usgs.gov/fdsnws/event/1/query?format=geojson&starttime=1970-01-01&endtime=1999-12-31\
&minlatitude=25&maxlatitude=49&minlongitude=-99.9999&maxlongitude=-67\
&eventtype=earthquake&minmagnitude=2.5"

In [241]:
wc_url

'https://earthquake.usgs.gov/fdsnws/event/1/query?format=geojson&starttime=1970-01-01&endtime=1974-12-31 &minlatitude=25&maxlatitude=49&minlongitude=-130&maxlongitude=-100&eventtype=earthquake&minmagnitude=2.5'

In [242]:
# Please do not re-run this section, it pulls a lot of data from the USGS comcat server.   I can help minimize
# the request if you'd like to see a demo

# # Package the request, send the request and catch the response: r
# r = requests.get(wc_url)

# # Decode the JSON data into a dictionary: json_data
# json_data = r.json()

In [243]:
r

<Response [200]>

In [244]:
json_data

{u'bbox': [-129.651, 25.781, -1.832, -100.693, 48.952, 62.234],
 u'features': [{u'geometry': {u'coordinates': [-121.8735, 36.593, 4.946],
    u'type': u'Point'},
   u'id': u'nc1022389',
   u'properties': {u'alert': None,
    u'cdi': None,
    u'code': u'1022389',
    u'detail': u'https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=nc1022389&format=geojson',
    u'dmin': 0.03694,
    u'felt': None,
    u'gap': 167,
    u'ids': u',nc1022389,',
    u'mag': 3.39,
    u'magType': u'md',
    u'mmi': None,
    u'net': u'nc',
    u'nst': 63,
    u'place': u'Central California',
    u'rms': 0.11,
    u'sig': 177,
    u'sources': u',nc,',
    u'status': u'reviewed',
    u'time': 157660096830,
    u'title': u'M 3.4 - Central California',
    u'tsunami': 0,
    u'type': u'earthquake',
    u'types': u',focal-mechanism,nearby-cities,origin,phase-data,',
    u'tz': None,
    u'updated': 1481756564940,
    u'url': u'https://earthquake.usgs.gov/earthquakes/eventpage/nc1022389'},
   u'type': u'Feat

In [245]:
event_list = []
for i in range(json_data['metadata']['count']):
    event_line = []
    event_line.append(json_data['features'][i]['id'])
    event_line += json_data['features'][i]['geometry']['coordinates']
    event_line.append(json_data['features'][i]['properties']['alert']),
    event_line.append(json_data['features'][i]['properties']['cdi']),
    event_line.append(json_data['features'][i]['properties']['code']),
    event_line.append(json_data['features'][i]['properties']['detail']),
    event_line.append(json_data['features'][i]['properties']['dmin']),
    event_line.append(json_data['features'][i]['properties']['felt']),
    event_line.append(json_data['features'][i]['properties']['gap']),
    event_line.append(json_data['features'][i]['properties']['ids'][1:-1]),
    event_line.append(json_data['features'][i]['properties']['mag']),
    event_line.append(json_data['features'][i]['properties']['magType']),
    event_line.append(json_data['features'][i]['properties']['mmi']),
    event_line.append(json_data['features'][i]['properties']['net']),
    event_line.append(json_data['features'][i]['properties']['nst']),
    event_line.append(json_data['features'][i]['properties']['place']),
    event_line.append(json_data['features'][i]['properties']['rms']),
    event_line.append(json_data['features'][i]['properties']['sig']),
    event_line.append(json_data['features'][i]['properties']['sources']),
    event_line.append(json_data['features'][i]['properties']['status']),
    event_line.append(json_data['features'][i]['properties']['time']),
    event_line.append(json_data['features'][i]['properties']['title']),
    event_line.append(json_data['features'][i]['properties']['tsunami']),
    event_line.append(json_data['features'][i]['properties']['type']),
    event_line.append(json_data['features'][i]['properties']['types'][1:-1]),
    event_line.append(json_data['features'][i]['properties']['tz']),
    event_line.append(json_data['features'][i]['properties']['updated']),
    event_line.append(json_data['features'][i]['properties']['url']),

    event_list.append(event_line)
   

In [246]:
cols = ['id', 'lat', 'long', 'depth', 'alert', 'cdi', 'code', 'detail', 'dmin', 'felt', 'gap', 'ids', \
        'mag', 'magType', 'mmi', 'net', 'nst', 'place', 'rms', 'sig', 'sources', 'status', 'time', \
        'title', 'tsunami', 'type', 'types', 'tz', 'updated', 'url']
quakes = pd.DataFrame(event_list, columns = cols)

In [247]:
quakes.shape

(4010, 30)

In [248]:
quakes.to_csv("./US-west-half_1Q70-4Q74.csv", index=False),

(None,)

In [205]:
quakes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13866 entries, 0 to 13865
Data columns (total 30 columns):
id         13866 non-null object
lat        13866 non-null float64
long       13866 non-null float64
depth      13866 non-null float64
alert      0 non-null object
cdi        4 non-null float64
code       13866 non-null object
detail     13866 non-null object
dmin       5561 non-null float64
felt       4 non-null float64
gap        12669 non-null float64
ids        13866 non-null object
mag        13866 non-null float64
magType    13866 non-null object
mmi        21 non-null float64
net        13866 non-null object
nst        12669 non-null float64
place      13866 non-null object
rms        13549 non-null float64
sig        13866 non-null int64
sources    13866 non-null object
status     13866 non-null object
time       13866 non-null int64
title      13866 non-null object
tsunami    13866 non-null int64
type       13866 non-null object
types      13866 non-null object
tz      

## Download complete - now to examine and clean it.

After downloading the separate files for east and west coast, and for smaller year spans,  I recombined everything into one US-quake.csv file, 45MB in size.  This can be loaded and used as the dataset for this capstone project

In [74]:
import pandas as pd

df = pd.read_csv('US-quake.csv')
df.shape

(114503, 30)

In [75]:
df.head(3)

Unnamed: 0,id,lat,long,depth,alert,cdi,code,detail,dmin,felt,...,sources,status,time,title,tsunami,type,types,tz,updated,url
0,nc1022389,-121.8735,36.593,4.946,,,1022389,https://earthquake.usgs.gov/fdsnws/event/1/que...,0.03694,,...,",nc,",reviewed,157660000000.0,M 3.4 - Central California,0,earthquake,"focal-mechanism,nearby-cities,origin,phase-data",,1481760000000.0,https://earthquake.usgs.gov/earthquakes/eventp...
1,nc1022388,-121.4645,36.929,3.946,,,1022388,https://earthquake.usgs.gov/fdsnws/event/1/que...,0.04144,,...,",nc,",reviewed,157647000000.0,M 3.0 - Central California,0,earthquake,"nearby-cities,origin,phase-data",,1481760000000.0,https://earthquake.usgs.gov/earthquakes/eventp...
2,ci3319041,-116.128833,29.907667,6.0,,,3319041,https://earthquake.usgs.gov/fdsnws/event/1/que...,2.734,,...,",ci,",reviewed,157641000000.0,"M 4.6 - 206km SSE of Maneadero, B.C., MX",0,earthquake,"origin,phase-data",,1454030000000.0,https://earthquake.usgs.gov/earthquakes/eventp...


In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114503 entries, 0 to 114502
Data columns (total 30 columns):
id         114503 non-null object
lat        114503 non-null float64
long       114503 non-null float64
depth      114498 non-null float64
alert      693 non-null object
cdi        14130 non-null float64
code       114503 non-null object
detail     114503 non-null object
dmin       68118 non-null float64
felt       14130 non-null float64
gap        105961 non-null float64
ids        114503 non-null object
mag        114503 non-null float64
magType    114417 non-null object
mmi        2955 non-null float64
net        114503 non-null object
nst        98355 non-null float64
place      114503 non-null object
rms        108641 non-null float64
sig        114503 non-null int64
sources    114503 non-null object
status     114503 non-null object
time       114503 non-null float64
title      114503 non-null object
tsunami    114503 non-null int64
type       114503 non-null object
type

## Discussion

Everything is currently text, and there is missing data.   This is expected, as not all information is recorded for every quake.  

A data dictionary describing this data is availabe at https://earthquake.usgs.gov/data/comcat/data-eventterms.php   

Of most importance to this project will be **time, lat, long, depth, magnitude, mmi** and **cdi.**  

time       114514 non-null object  
id         114514 non-null object  
lat        114514 non-null object  
long       114514 non-null object  
depth      114509 non-null object  
mag        114514 non-null object  
mmi        2966 non-null object  
cdi        14141 non-null object  

All but **mmi** and **cdi** are compete.   

The Modified Mercali Intensity (mmi) is only computed for quakes that cause damage, and cdi is computed from citizen reports beginning around 2005.  The full dataset will be used to examine any changes in earthquake frequency over time, and the smaller dataset of mmi/cdi will be used to calibrate the cdi intensity data.  With that done, the slighty larger cdi dataset can be used to explore the relationship between magnitude and intensity, and to validate the current formulas, and to develop new, more regionally focused, formulas.  
  
My evolving list of questions to attempt to answer with this data include ...

- have there been any increase or decrease in # of quakes in specific regions?
   - by depth, by magnitude, by intensity  
   - can we group by state, by zip code, by geologic unit, by fault zone …  
     - need to geocode, and somehow classify geologic characteristics  
   - can it be correlated to human activity, especially fracking or injection (these are separate)   
   
- Are there more discrete regional differences in magnitude/intensity relationship   
  
- Is citizen reported DYFI data, converted to CDI, accurate.  Does it correspond with MMI 


In [77]:
# Continuous Data
#    Convert to float
#       lat, long, depth, cdi, dmin, gap, mag, mmi, rms
df[['lat','long', 'depth', 'cdi', 'dmin', 'gap', 'mag', 'mmi', 'rms']] = \
df[['lat','long', 'depth', 'cdi', 'dmin', 'gap', 'mag', 'mmi', 'rms']].apply(pd.to_numeric, errors='coerce')

#    Convert to int
#       felt, nst, sig, tz(need to learn what this is)
df[['felt', 'nst', 'sig', 'tz']] = df[['felt', 'nst', 'sig', 'tz']].apply(pd.to_numeric, errors='coerce')
# Need to deal with the NaNs before casting as integer 
# df['felt'] = df['felt'].astype(int)
# df['nst'] = df['nst'].astype(int)
# df['sig'] = df['sig'].astype(int)
# df['tz'] = df['tz'].astype(int)

# Convert to boolian 
#   tsunami
df['tsunami'] = df['tsunami'].apply(pd.to_numeric, errors='coerce')
df['tsunami'] = df['tsunami'].astype(bool)

# Convert to date/time  
#           see: https://stackoverflow.com/questions/21787496/converting-epoch-time-with-milliseconds-to-datetime
#           example code given in cell below
#   time, updated

import datetime

df['time'] = df['time'].apply(pd.to_numeric)
df['updated'] = df['updated'].apply(pd.to_numeric)

df['time'] = df['time'].astype(int)
df['updated'] = df['updated'].astype(int)

df['time'] = df['time'].apply(lambda x: datetime.datetime.fromtimestamp(int(x)/1000.0) )
df['updated'] = df['updated'].apply(lambda x: datetime.datetime.fromtimestamp(int(x)/1000.0) )


# Convert to ordinal 
#   alert
#
# This code will convert, but how to handle blanks, which mean no alert issued? 
# Is it OK to make them 0, or does that give some unintended importance?
def ordinize(strval, ordered_list, start_idx, idx_skip):
    i = 0
    for val in ordered_list:
        if strval == val:
            return i*idx_skip + start_idx
        i += 1

df['alert'] = df['alert'].apply(lambda x: ordinize(x, ['green', 'yellow', 'red'], 1, 1))
df['alert'] = df['alert'].fillna(value=0)

# dummy (if used - not sure they will be needed for any of the project goals)
#   magType, net, sources, status

# leave as string (informational and reference only, not needed for model learning or prediction)
#   id, code detail, ids, place, title, type, types, url

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114503 entries, 0 to 114502
Data columns (total 30 columns):
id         114503 non-null object
lat        114503 non-null float64
long       114503 non-null float64
depth      114498 non-null float64
alert      114503 non-null float64
cdi        14130 non-null float64
code       114503 non-null object
detail     114503 non-null object
dmin       68118 non-null float64
felt       14130 non-null float64
gap        105961 non-null float64
ids        114503 non-null object
mag        114503 non-null float64
magType    114417 non-null object
mmi        2955 non-null float64
net        114503 non-null object
nst        98355 non-null float64
place      114503 non-null object
rms        108641 non-null float64
sig        114503 non-null int64
sources    114503 non-null object
status     114503 non-null object
time       114503 non-null datetime64[ns]
title      114503 non-null object
tsunami    114503 non-null bool
type       114503 non-null o

In [82]:
# Deal with missing values

# Depth: 5 missing - delete these rows
df = df.dropna(subset=['depth'])

# magType: 86 missing,  fill with 'Unknown'  (there is one existing with Unknown)
df['magType'] = df['magType'].fillna(value='Unknown')


# cdi, mmi, felt:  Leave as NaN, analysis of mmi cdi/felt relationship will be done for existing values only
# dmin, gap, nst, rms, tz: Leave as NaN, no planned analysis or modeling uses these features.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [83]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 114498 entries, 0 to 114502
Data columns (total 30 columns):
id         114498 non-null object
lat        114498 non-null float64
long       114498 non-null float64
depth      114498 non-null float64
alert      114498 non-null float64
cdi        14130 non-null float64
code       114498 non-null object
detail     114498 non-null object
dmin       68118 non-null float64
felt       14130 non-null float64
gap        105961 non-null float64
ids        114498 non-null object
mag        114498 non-null float64
magType    114498 non-null object
mmi        2955 non-null float64
net        114498 non-null object
nst        98355 non-null float64
place      114498 non-null object
rms        108641 non-null float64
sig        114498 non-null int64
sources    114498 non-null object
status     114498 non-null object
time       114498 non-null datetime64[ns]
title      114498 non-null object
tsunami    114498 non-null bool
type       114498 non-null o

In [36]:
df.head()

Unnamed: 0,id,lat,long,depth,alert,cdi,code,detail,dmin,felt,...,sources,status,time,title,tsunami,type,types,tz,updated,url
0,nc1022389,-121.8735,36.593,4.946,,,1022389,https://earthquake.usgs.gov/fdsnws/event/1/que...,0.03694,,...,",nc,",reviewed,157660096830,M 3.4 - Central California,False,earthquake,"focal-mechanism,nearby-cities,origin,phase-data",,1481756564940,https://earthquake.usgs.gov/earthquakes/eventp...
1,nc1022388,-121.4645,36.929,3.946,,,1022388,https://earthquake.usgs.gov/fdsnws/event/1/que...,0.04144,,...,",nc,",reviewed,157646814820,M 3.0 - Central California,False,earthquake,"nearby-cities,origin,phase-data",,1481756553974,https://earthquake.usgs.gov/earthquakes/eventp...
2,ci3319041,-116.128833,29.907667,6.0,,,3319041,https://earthquake.usgs.gov/fdsnws/event/1/que...,2.734,,...,",ci,",reviewed,157641167870,"M 4.6 - 206km SSE of Maneadero, B.C., MX",False,earthquake,"origin,phase-data",,1454032083640,https://earthquake.usgs.gov/earthquakes/eventp...
3,usp00009ad,-116.402,30.424,33.0,,,p00009ad,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,...,",us,",reviewed,157641135300,"M 4.0 - offshore Baja California, Mexico",False,earthquake,origin,,1415316088071,https://earthquake.usgs.gov/earthquakes/eventp...
4,usp000099y,-116.185,30.757,33.0,,,p000099y,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,...,",ci,us,",reviewed,157550821500,"M 4.2 - offshore Baja California, Mexico",False,earthquake,"origin,phase-data",,1454030304990,https://earthquake.usgs.gov/earthquakes/eventp...


In [31]:
# Here are the methods for dealing with the two time in milliseconds since the epoch (1970).  I'm leaving times
# In the dataframe, I'm leaving the times as 11 digit integors, and for the moment plan to deal with the 
# conversions when the times are needed for something.  The milliseconds are important in seismology for
# accurate compuation of location and magnitude, it's not clear to me yet whether that will need to be maintained
# for display

import time

ttime = 1481756553974   # example time value for test
s, ms = divmod(ttime, 1000) 
print '{}.{:03d}'.format(time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(s)), ms)

2016-12-14 23:02:33.974


In [2]:
import datetime
import time

ttime = 1481756553974/1000   # example time value for test

print 'ttime:', ttime
print 'fromtimestamp(ttime):', datetime.date.fromtimestamp(ttime)



 ttime: 1481756553
fromtimestamp(ttime): 2016-12-14


In [5]:
print 'Now    :', datetime.datetime.now()
print 'Today  :', datetime.datetime.today()
print 'UTC Now:', datetime.datetime.utcnow()

d = datetime.datetime.now()
for attr in [ 'year', 'month', 'day', 'hour', 'minute', 'second', 'microsecond']:
    print attr, ':', getattr(d, attr)


 Now    : 2017-08-20 13:04:44.009927
Today  : 2017-08-20 13:04:44.010349
UTC Now: 2017-08-20 17:04:44.010535
year : 2017
month : 8
day : 20
hour : 13
minute : 4
second : 44
microsecond : 10849


In [14]:
import datetime
ttime = 1481756553974/float(1000)   # example time value for test
print ttime

dt = datetime.datetime.fromtimestamp(ttime)

for attr in [ 'year', 'month', 'day', 'hour', 'minute', 'second', 'microsecond']:
    print attr, ':', getattr(dt, attr)


1481756553.97
1975-12-14 18:02:33.974000
year : 2016
month : 12
day : 14
hour : 18
minute : 2
second : 33
microsecond : 974000


In [None]:
# One of first steps will be to geocode the lat long and find location by township, zipcode, county, state
# I'd also like to tie it to geologic unit or at least some indicator of crustal and surface physical properties
# but this is difficult and I haven't found a suitable reference DB.  ** For the record, this problem has been 
# difficult since the beginning of seismology - knowing the material properties of the rock through which
# the seismic waves travel, and is one reason why we have only east coast and west coast equations for relating
# magnitude and intensity.


## EDA
Visualization of the quake dataset
 - Map showing location and itensity, maybe interactive , maybe dot size variations for magnitude
   ( this is like an interactive variant of the standard seismic risk maps )
     - with a slider bar for year,  # this would show the "bloom" in Oklahoma quakes over last 10 years
     - and or a slider bar for magnitide
 - Frequency bar charts per decade (or maybe half decade) divisions, grouped/binned by magnitude, with bars green to red for increasing magnitude  
 - Frequency by depth - still considering if this is useful or interesting
 - Felt area by magnitide showing geographic differences, maybe just example quakes in various areas
 - location of reporting agencies/labs with indicator of number of contributions to the dataset
 - magnitude calculation types by distance of seismograph from epicenter
 - Scatter plot of relationship between mmi and cdi  intensity assessments, possibly grouped by region

## Analysis
What can be said with statistical signifcance
 - Hypothesis test of equivalence of mmi and cdi 
 - Hypothesis test of changing quake frequency over time
 - Hypothesis test of Oklahoma quake magnitude / intensity increasing over time
 - Characterization of variance in magnitude cacluation methods
 - Characterization of variance in cdi ?
 - Characterization of variance among reporting agencies

## Modeling

 - Validate existing USGS magnitude / intensity equations for east and west coast
 - New equations for magnitude and intensity relationship
 - Not sure about stepping it the EQ prediction morass, but maybe something based on frequency of mag 7 or larger?
 - Need to generate more ideas here