Title: DSC350 Week 3 Exercises  
Author: Stefanie Molin  
Date: 13 September 2024  
Modified By: Caleb Trimble  
Description: This program ingests a csv related to earthquakes and tsunamis. It expresses various different statistical elements related to different locations impacted by events and creates fluidity through parsing and additional columns.

Codes in this program have been adapted from Hands-On Data Analysis with Pandas - Second Edition (Molin S., 2021)

In [25]:
import pandas as pd

df = pd.read_csv('data\\parsed.csv')  # Directs Pandas to read the csv for analysis. 
df.columns


Index(['alert', 'cdi', 'code', 'detail', 'dmin', 'felt', 'gap', 'ids', 'mag',
       'magType', 'mmi', 'net', 'nst', 'place', 'rms', 'sig', 'sources',
       'status', 'time', 'title', 'tsunami', 'type', 'types', 'tz', 'updated',
       'url', 'parsed_place'],
      dtype='object')

In [12]:
# Subsets the dataframe using label-based lookups. 
japan_mb_df = df.loc[
    (df.place.str.contains('Japan'))  # Defines the label to look up in the 'place' column.
    & (df.magType == 'mb')  # Defines the type of magnitude to look up.
]
percentile_95 = japan_mb_df['mag'].quantile(0.95)  # Directs the program to find the 95th percentile of earthquakes in Japan with a magType of 'mb'
print(f"The 95th percentile of earthquake magnitude in Japan (mb) is: {percentile_95}")
print()

The 95th percentile of earthquake magnitude in Japan (mb) is: 4.9



In [13]:
indo_df = df.loc[(df.place.str.contains('Indonesia'))]
total_earthquakes = len(indo_df)  # Looks up total number of entries in Indonesia.
earthquakes_with_tsunamis = len(indo_df[indo_df.tsunami == 1])  # Looks up the total number for Indonesia that also have tsunamis.
percentage = (earthquakes_with_tsunamis/total_earthquakes) * 100  # Calculates the percentage of earthquakes that coincided with tsunamis.
print(f"The percentage of earthquakes with tsunamis is: {percentage: .2f}")
print()

The percentage of earthquakes with tsunamis is:  23.13



In [14]:
nevada_df = df.loc[df.place.str.contains('Nevada')]
sum_stats = nevada_df.describe()  # .describe provides summary statistics for the defined place.
print(sum_stats)
print()

             cdi        dmin       felt         gap         mag   mmi  \
count  15.000000  677.000000  15.000000  677.000000  677.000000  1.00   
mean    2.440000    0.166982   2.400000  154.029527    0.491728  2.84   
std     0.501142    0.166400   4.626013   68.769713    0.689560   NaN   
min     2.000000    0.001000   1.000000   29.140000   -0.500000  2.84   
25%     2.000000    0.054000   1.000000   97.670000   -0.100000  2.84   
50%     2.200000    0.113000   1.000000  149.550000    0.400000  2.84   
75%     2.900000    0.234000   1.000000  200.470000    0.900000  2.84   
max     3.300000    1.414000  19.000000  355.910000    2.900000  2.84   

              nst         rms         sig          time  tsunami     tz  \
count  677.000000  677.000000  677.000000  6.770000e+02    677.0  677.0   
mean    12.608567    0.151909   10.688331  1.538314e+12      0.0 -480.0   
std      9.890620    0.084742   19.252727  5.954070e+08      0.0    0.0   
min      3.000000    0.000500    0.000000 

In [15]:
df.place.str.extract(r', (.*$)')[0].sort_values().unique()  # Uses a regular expression to extract everything after the comma in the 'place' column.
#  Replaces patterns in the 'place' column with uniform place names in 'parsed_place'.
df['parsed_place'] = df.place.str.replace(
    r'.* of ', '', regex=True 
).str.replace(
    'the ', '' 
).str.replace(
    r'CA$', 'California', regex=True 
).str.replace(
    r'NV$', 'Nevada', regex=True 
).str.replace(
    r'MX$', 'Mexico', regex=True 
).str.replace(
    r' region$', '', regex=True 
).str.replace(
    'northern ', '' 
).str.replace(
    'Fiji Islands', 'Fiji' 
).str.replace(
    r'^.*, ', '', regex=True 
).str.strip() # remove any extra spaces
df.parsed_place.sort_values().unique()


array(['Afghanistan', 'Alaska', 'Argentina', 'Arizona', 'Arkansas',
       'Ascension Island', 'Australia', 'Azerbaijan', 'Balleny Islands',
       'Barbuda', 'Bolivia', 'British Virgin Islands', 'Burma',
       'California', 'Canada', 'Carlsberg Ridge',
       'Central East Pacific Rise', 'Central Mid-Atlantic Ridge', 'Chile',
       'China', 'Christmas Island', 'Colombia', 'Colorado', 'Costa Rica',
       'Dominican Republic', 'East Timor', 'Ecuador', 'El Salvador',
       'Fiji', 'Greece', 'Greenland', 'Guam', 'Guatemala', 'Haiti',
       'Hawaii', 'Honduras', 'Idaho', 'Illinois', 'India',
       'Indian Ocean Triple Junction', 'Indonesia', 'Iran', 'Iraq',
       'Italy', 'Jamaica', 'Japan', 'Kansas', 'Kentucky',
       'Kermadec Islands', 'Kuril Islands', 'Kyrgyzstan', 'Martinique',
       'Mauritius', 'Mayotte', 'Mexico', 'Mid-Indian Ridge', 'Missouri',
       'Montana', 'Nevada', 'New Caledonia', 'New Hampshire',
       'New Mexico', 'New Zealand', 'Nicaragua', 'North Carolina',


While not required for the assignment, without creating a parsed place column we could have significant issues when attempting to achieve accurate results for Ring of Fire statistics.

In [28]:
# Defines the places within the ring of fire.
ring_of_fire_locations = [
    'Alaska', 'Antarctic', 'Bolivia', 'California', 'Canada', 'Chile', 'Costa Rica', 'Ecuador',
    'Fiji', 'Guatemala', 'Indonesia', 'Japan', 'Kermadec Islands', 'Mexico', 'New Zealand',
    'Peru', 'Philippines', 'Russia', 'Taiwan', 'Tonga', 'Washington'
]
def is_ring_of_fire(parsed_place):
# Searches 'parsed_place' column and compares it to the list of places in the Ring of Fire.
# Returns TRUE if the place is in the Ring of Fire and FALSE if it's not found in the list.
    return any(location in parsed_place for location in ring_of_fire_locations)

# Creates ring_of_fire column by applying the is_ring_of_fire function to the 'parsed_place' column.
df['ring_of_fire'] = df['parsed_place'].apply(is_ring_of_fire) 
print(df.head())

  alert  cdi      code                                             detail  \
0   NaN  NaN  37389218  https://earthquake.usgs.gov/fdsnws/event/1/que...   
1   NaN  NaN  37389202  https://earthquake.usgs.gov/fdsnws/event/1/que...   
2   NaN  4.4  37389194  https://earthquake.usgs.gov/fdsnws/event/1/que...   
3   NaN  NaN  37389186  https://earthquake.usgs.gov/fdsnws/event/1/que...   
4   NaN  NaN  73096941  https://earthquake.usgs.gov/fdsnws/event/1/que...   

       dmin  felt    gap           ids   mag magType  ...           time  \
0  0.008693   NaN   85.0  ,ci37389218,  1.35      ml  ...  1539475168010   
1  0.020030   NaN   79.0  ,ci37389202,  1.29      ml  ...  1539475129610   
2  0.021370  28.0   21.0  ,ci37389194,  3.42      ml  ...  1539475062610   
3  0.026180   NaN   39.0  ,ci37389186,  0.44      ml  ...  1539474978070   
4  0.077990   NaN  192.0  ,nc73096941,  2.16      md  ...  1539474716050   

                           title  tsunami        type  \
0  M 1.4 - 9km NE of Ag

In [29]:
# Counts number of entries that are in the 'ring_of_fire' column.
rof_earthquakes = df['ring_of_fire'].sum()
# Counts total number of entries in the dataframe and subtracts the number of Ring of Fire earthquakes from it.
non_rof_earthquakes = len(df) - rof_earthquakes
# Prints results for Ring of Fire earthquakes and earthquakes outside of the Ring of Fire.
print(f"Number of Ring of Fire Earthquakes: {rof_earthquakes}")
print(f"Number of Earthquakes outside of the Ring of Fire: {non_rof_earthquakes}")


Number of Ring of Fire Earthquakes: 7189
Number of Earthquakes outside of the Ring of Fire: 2143


In [30]:
# Creates dataframe for Ring of Fire specific events.
ring_of_fire_df = df[df['ring_of_fire']]
# References Ring of Fire dataframe and counts entries that have a tsunami correlation.
rof_tsunamis = ring_of_fire_df['tsunami'].sum()  
print(f"Number of Ring of Fire Tsunamis: {rof_tsunamis}")


Number of Ring of Fire Tsunamis: 45
