# Import required libraries

In [1]:
import numpy as np
import pandas as pd
import os
import glob
import concurrent.futures
import requests
import cloudscraper
import time
from requests.exceptions import RequestException

### Earthquake dataset from USGS (United States Geological Survey)

 Data collection begins with downloading the dataset from the USGS. The USGS provides science for a changing world, which reflects and responds to society’s continuously evolving needs. As the science arm of the Department of the Interior, the USGS brings an array of earth, water, biological, and mapping data and expertise to bear in support of decision-making on environmental, resource, and public safety issues. 
url = 'https://earthquake.usgs.gov/earthquakes/search/#%7B%22currentfeatureid%22%3Anull%2C%22mapposition%22%3A%5B%5B-86.46756%2C-537.1875%5D%2C%5B86.44583%2C160.3125%5D%5D%2C%22autoUpdate%22%3A%5B%22autoUpdate%22%5D%2C%22feed%22%3A%22undefined_undefined%22%2C%22listFormat%22%3A%22default%22%2C%22restrictListToMap%22%3A%5B%5D%2C%22sort%22%3A%22newest%22%2C%22basemap%22%3A%22grayscale%22%2C%22overlays%22%3A%5B%22plates%22%5D%2C%22distanceUnit%22%3A%22km%22%2C%22timezone%22%3A%22local%22%2C%22viewModes%22%3A%5B%22settings%22%2C%22map%22%5D%2C%22event%22%3Anull%2C%22search%22%3Anull%7D'
From this source I have downloaded the dataset from the year 2011 to the present date, which is January 2024.
The website allows fewer than 20,000 entries to be downloaded at a time, and there are more than 20,000 records in a year,
so there are several datasets that I have to import and concatenate.

In [2]:
# Function to read a CSV file using pandas, where path is the location of the file in local storage.

def read_csv(path):
    """Load the CSV file found at ``path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(path)
    return frame



In [3]:
# Collect the path of every CSV file in the source-dataset folder with glob.glob.

csv_files = glob.glob('../dataset/first_dataset_from_sorce/*.csv')
csv_files
# The list displayed below holds the CSV files that are loaded into dataframes.

['../dataset/first_dataset_from_sorce/query (11).csv',
 '../dataset/first_dataset_from_sorce/query (2).csv',
 '../dataset/first_dataset_from_sorce/query (27).csv',
 '../dataset/first_dataset_from_sorce/query (26).csv',
 '../dataset/first_dataset_from_sorce/query (3).csv',
 '../dataset/first_dataset_from_sorce/query (10).csv',
 '../dataset/first_dataset_from_sorce/query (21).csv',
 '../dataset/first_dataset_from_sorce/query (8).csv',
 '../dataset/first_dataset_from_sorce/query (17).csv',
 '../dataset/first_dataset_from_sorce/query (4).csv',
 '../dataset/first_dataset_from_sorce/query (5).csv',
 '../dataset/first_dataset_from_sorce/query (16).csv',
 '../dataset/first_dataset_from_sorce/query (9).csv',
 '../dataset/first_dataset_from_sorce/query (20).csv',
 '../dataset/first_dataset_from_sorce/query (19).csv',
 '../dataset/first_dataset_from_sorce/query (23).csv',
 '../dataset/first_dataset_from_sorce/query (6).csv',
 '../dataset/first_dataset_from_sorce/query (15).csv',
 '../dataset/firs

In [4]:
# Read every collected CSV file and concatenate them into one dataframe
# with a fresh 0..n-1 index.

df = pd.concat(map(read_csv, csv_files), ignore_index=True)

In [5]:
df.columns

Index(['time', 'latitude', 'longitude', 'depth', 'mag', 'magType', 'nst',
       'gap', 'dmin', 'rms', 'net', 'id', 'updated', 'place', 'type',
       'horizontalError', 'depthError', 'magError', 'magNst', 'status',
       'locationSource', 'magSource'],
      dtype='object')

In [6]:
# The files are concatenated in arbitrary order, so the rows are re-sorted
# by the 'time' column and the index is renumbered from zero.

df = df.sort_values('time', ignore_index=True)

In [7]:
# The time column holds both the date and the time; here I take the hour, minute, day, month and year for the analysis.
df['time'].head()

0    2011-01-01T00:02:31.960Z
1    2011-01-01T00:06:59.380Z
2    2011-01-01T00:34:10.130Z
3    2011-01-01T00:46:01.400Z
4    2011-01-01T01:00:46.850Z
Name: time, dtype: object

In [8]:
# Keep only the first 19 characters ('YYYY-MM-DDTHH:MM:SS'), which drops the
# fractional seconds and the trailing 'Z', then swap the 'T' separator for a space.
df['time'] = df['time'].str.slice(stop=19).str.replace('T', ' ')
df['time'].head()

0    2011-01-01 00:02:31
1    2011-01-01 00:06:59
2    2011-01-01 00:34:10
3    2011-01-01 00:46:01
4    2011-01-01 01:00:46
Name: time, dtype: object

In [9]:
df.shape

(363898, 22)

In [10]:
# Parse the cleaned 'time' strings into pandas datetime values using the explicit
# 'YYYY-MM-DD HH:MM:SS' format, so the Moon's distance can later be looked up per timestamp.
df['time'] = pd.to_datetime(df['time'], format = '%Y-%m-%d %H:%M:%S')

In [11]:
df.columns

Index(['time', 'latitude', 'longitude', 'depth', 'mag', 'magType', 'nst',
       'gap', 'dmin', 'rms', 'net', 'id', 'updated', 'place', 'type',
       'horizontalError', 'depthError', 'magError', 'magNst', 'status',
       'locationSource', 'magSource'],
      dtype='object')

### filtering the dataset based on requirement for the research 

In [12]:
df['type'].value_counts()


type
earthquake                    360471
mining explosion                2368
explosion                        641
ice quake                        133
other event                      105
volcanic eruption                 73
rock burst                        38
quarry blast                      33
mine collapse                     13
experimental explosion             8
nuclear explosion                  4
landslide                          3
Landslide                          2
sonic boom                         2
collapse                           1
induced or triggered event         1
Ice Quake                          1
acoustic noise                     1
Name: count, dtype: int64

In [13]:
# The catalogue records several kinds of seismic events; this study works on
# earthquakes only, so rows of every other event type are removed.
df = df[df['type'] == 'earthquake']
df['type'].unique()

array(['earthquake'], dtype=object)

In [14]:
# Continue with a 0.1% random sample of the earthquakes. The seed is fixed so
# that the sampled rows, and every value derived from them below, are the same
# on each run of the notebook.
df = df.sample(frac=0.001, random_state=42)
df.shape

(360, 22)

### Webscraping Distance 

In [15]:
from skyfield.api import Loader, Topos
from datetime import datetime

import functools


@functools.lru_cache(maxsize=1)
def _load_earth_moon_vector():
    """Load the Skyfield timescale and the Earth-to-Moon vector a single time.

    Opening the DE421 ephemeris and building the timescale do not depend on
    the requested date, so the result is cached and reused by every call to
    ``get_distance_earth_moon`` instead of being rebuilt for each row.
    """
    load = Loader('~/skyfield-data')
    planets = load('de421.bsp')  # JPL DE421 ephemeris data set
    earth, moon = planets['earth'], planets['moon']
    ts = load.timescale()
    return ts, moon - earth


def get_distance_earth_moon(date_time):
    """Return the Earth-Moon distance in kilometres at ``date_time``.

    Parameters
    ----------
    date_time : datetime-like
        Any object exposing ``year``, ``month``, ``day``, ``hour``, ``minute``
        and ``second``. The fields are handed to ``ts.utc``, so they are
        interpreted as UTC.

    Returns
    -------
    The value of Skyfield's ``distance().km`` for the Moon's position
    relative to Earth at the given time.
    """
    ts, earth_to_moon = _load_earth_moon_vector()
    # Convert the provided datetime fields to Skyfield's time object.
    t = ts.utc(date_time.year, date_time.month, date_time.day,
               date_time.hour, date_time.minute, date_time.second)
    # Position of the Moon relative to Earth at time t.
    moon_position = earth_to_moon.at(t)
    return moon_position.distance().km



In [16]:
# Compute the Earth-Moon distance for every event by calling
# get_distance_earth_moon on each timestamp of the 'time' column.
df['distance'] = df['time'].apply(get_distance_earth_moon)

In [17]:
# Store the distance as whole numbers; astype(int) discards the fractional
# part instead of rounding to the nearest integer.
df['distance'] = df['distance'].astype(int)

In [18]:
df.head()


Unnamed: 0,time,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,...,place,type,horizontalError,depthError,magError,magNst,status,locationSource,magSource,distance
283505,2021-07-29 17:51:42,55.1186,-157.0809,42.5,3.1,ml,,,,0.79,...,"155 km SSE of Chignik, Alaska",earthquake,,1.1,,,reviewed,ak,ak,396592
89319,2014-11-19 06:40:52,-37.6885,179.6773,35.0,4.9,mb,,99.0,1.095,1.37,...,"181 km NE of Gisborne, New Zealand",earthquake,7.1,2.0,0.117,25.0,reviewed,us,us,394257
91934,2014-12-23 20:35:28,36.8913,-97.6911,5.554,3.0,ml,,54.0,,0.79,...,"4 km SW of Renfrow, Oklahoma",earthquake,1.9,7.0,,,reviewed,tul,tul,365253
19005,2011-10-07 17:54:54,35.365333,-92.263667,5.07,2.6,md,15.0,61.0,0.04348,0.13,...,"4 km WSW of Quitman, Arkansas",earthquake,0.6,0.8,0.178,5.0,reviewed,nm,nm,398879
237202,2020-01-16 16:40:25,52.2522,159.8762,35.0,4.1,mb,,68.0,1.074,1.03,...,"121 km SE of Petropavlovsk-Kamchatsky, Russia",earthquake,7.8,2.0,0.165,16.0,reviewed,us,us,369733


### webscraping gravity 

In [19]:
def get_url(lat, lon):
    """Build the NOAA gravity-prediction request URL for one location.

    ``lat`` and ``lon`` are inserted verbatim into the query string; the
    ``eht`` parameter is always sent as 100.0.
    """
    endpoint = 'https://geodesy.noaa.gov/api/gravd/gp'
    query = f'lat={lat}&lon={lon}&eht=100.0'
    # Tip for debugging: print the returned value to inspect the generated URL.
    return f'{endpoint}?{query}'

# Example call to the function using the first row's latitude and longitude values from the dataframe `df`
get_url(df['latitude'].iloc[0], df['longitude'].iloc[0])


'https://geodesy.noaa.gov/api/gravd/gp?lat=55.1186&lon=-157.0809&eht=100.0'

In [20]:
# Build the gravity-API URL of every row from its latitude and longitude and
# store the result in a new column named 'url'. A plain comprehension is used
# instead of np.vectorize, which is only a Python-level loop, calls the
# function one extra time to infer the output type and fails on an empty frame.

df['url'] = [get_url(lat, lon) for lat, lon in zip(df['latitude'], df['longitude'])]

In [21]:


# Fetch gravity data from multiple URLs concurrently using multithreading. The code below defines get_gravity, which retrieves the gravity
# value from a single URL, and fetch_gravity_concurrently, which issues the requests for many URLs concurrently through a thread pool.

def get_gravity(url, max_retries=3, retry_delay=180, timeout=30):
    """Fetch the ``predictedGravity`` value served at ``url``.

    Parameters
    ----------
    url : str
        Gravity-prediction API URL to request.
    max_retries : int
        Maximum number of attempts made before giving up.
    retry_delay : float
        Seconds to wait after a non-200 response before the next attempt.
        No wait happens after the final attempt.
    timeout : float
        Seconds allowed for each request, so that one stalled connection
        cannot block a worker forever.

    Returns
    -------
    The ``predictedGravity`` entry of the JSON response, or the string
    ``'none'`` when the key is missing or every attempt failed.
    """
    scraper = cloudscraper.create_scraper()
    for attempt in range(max_retries):
        try:
            response = scraper.get(url, timeout=timeout)
            if response.status_code == 200:
                data = response.json()  # Parse the JSON response
                return data.get('predictedGravity', 'none')
        except (RequestException, ValueError) as e:
            # ValueError covers JSON decoding errors raised by requests
            # versions whose decode error is not a RequestException.
            print(f"Attempt {attempt + 1} for URL {url} failed: {e}")
            continue
        print(f"Request failed with status code: {response.status_code}")
        if attempt < max_retries - 1:
            # Wait before retrying; sleeping after the last attempt would
            # only delay the 'none' result.
            time.sleep(retry_delay)
    return 'none'
            
# Function to apply multithreading for concurrent requests
def fetch_gravity_concurrently(urls, fetch=None, max_workers=None):
    """Fetch the gravity value of every URL concurrently, preserving input order.

    Parameters
    ----------
    urls : iterable of str
        URLs to request.
    fetch : callable, optional
        Function called with one URL that returns its result. Defaults to
        ``get_gravity``.
    max_workers : int, optional
        Thread-pool size, passed to ``ThreadPoolExecutor``.

    Returns
    -------
    list
        One entry per input URL, in the same order as ``urls``, so the list
        can be assigned straight to a dataframe column built from the same
        rows. A URL whose fetch raised an exception yields ``'none'``.
    """
    if fetch is None:
        fetch = get_gravity
    urls = list(urls)
    # Pre-fill with the failure marker; each slot is overwritten by its own
    # URL's result, independent of the order in which requests complete.
    results = ['none'] * len(urls)
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_index = {
            executor.submit(fetch, url): index for index, url in enumerate(urls)
        }
        completed = concurrent.futures.as_completed(future_to_index)
        for done_count, future in enumerate(completed, start=1):
            index = future_to_index[future]
            try:
                results[index] = future.result()
                print(done_count, results[index])
            except Exception as exc:
                print(f'{urls[index]} generated an exception: {exc}')
    return results

# Main execution
# if __name__ == "__main__":
#     urls = df['url'].tolist()
#     df['gravity'] = fetch_gravity_concurrently(urls)

1 981566.869
2 978504.989
3 979842.139
4 979280.662
5 981124.562
6 979725.599
7 981547.584
8 981942.19
9 982213.356
10 978491.657
11 979854.475
12 981092.456
13 981120.617
14 979918.69
15 980006.673
16 979555.106
17 979742.813
18 977948.749
19 979658.565
20 980169.566
21 982206.413
22 979578.411
23 979190.898
24 979012.271
25 978094.27
26 980692.058
27 978033.028
28 981359.314
29 982337.075
30 979762.638
31 982533.522
32 980995.959
33 981561.157
34 979886.584
35 978477.987
36 981024.579
37 978542.269
38 979954.531
39 979431.383
40 981253.966
41 983065.718
42 981621.387
43 978284.392
44 981415.168
45 978152.221
46 979748.433
47 979468.788
48 979058.863
49 978317.769
50 978528.188
51 978304.066
52 979922.273
53 978312.128
54 979847.65
55 978370.297
56 980142.432
57 979779.523
58 979428.277
59 981236.484
60 979952.278
61 980018.375
62 979918.84
63 982369.882
64 979015.606
65 978654.782
66 978823.426
67 977959.335
68 981361.283
69 979421.046
70 981499.682
71 978217.097
72 978453.511
73 978

In [22]:

df.head(2)

Unnamed: 0,time,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,...,horizontalError,depthError,magError,magNst,status,locationSource,magSource,distance,url,gravity
283505,2021-07-29 17:51:42,55.1186,-157.0809,42.5,3.1,ml,,,,0.79,...,,1.1,,,reviewed,ak,ak,396592,https://geodesy.noaa.gov/api/gravd/gp?lat=55.1...,981566.869
89319,2014-11-19 06:40:52,-37.6885,179.6773,35.0,4.9,mb,,99.0,1.095,1.37,...,7.1,2.0,0.117,25.0,reviewed,us,us,394257,https://geodesy.noaa.gov/api/gravd/gp?lat=-37....,978504.989


### calculating force between earth and moon

In [23]:
# Universal gravitational constant in N·m²/kg² (6.67 × 10⁻¹¹).
# NOTE(review): 6.67e-11 is a rounded value — confirm it is precise enough for the study.
G = 6.67e-11
# Mass of the Moon in kg.
moon_mass = 7.35e22

# Mass of the Earth in kg.
earth_mass = 5.9722e24

# Gravitational force between Earth and Moon at the time of each sampled
# earthquake, from Newton's law F = G * m1 * m2 / r**2.
# 'distance' is in kilometres, so it is multiplied by 1000 to get metres
# before squaring; the float factor also keeps the square out of integer
# arithmetic. The whole column is computed in one vectorized expression.
df.loc[:, 'force'] = (G * earth_mass * moon_mass) / (df['distance'] * 1000.0) ** 2
df['force'].head()

283505    1.861485e+20
89319     1.883600e+20
91934     2.194623e+20
19005     1.840201e+20
237202    2.141761e+20
Name: force, dtype: float64

In [33]:
# df.to_csv('raw_dataset.csv')