# Large dataset generator

This script generates 500'000 test earthquake records and retrieves the country and nearest city from the geolocation data.

In [1]:
import numpy as np
import pandas as pd
import datetime
import random

# Cities with high earthquake activity; each generated record is assigned
# to one of them at random.
cities = [
    {"City": "Tokyo", "Country": "Japan"},
    {"City": "San Francisco", "Country": "USA"},
    {"City": "Los Angeles", "Country": "USA"},
    {"City": "Mexico City", "Country": "Mexico"},
    {"City": "Anchorage", "Country": "USA"},
    {"City": "Istanbul", "Country": "Turkey"},
    {"City": "Kathmandu", "Country": "Nepal"}
]

# Number of earthquake records to generate
num_records = 500_000

# Fixed seed so that "Restart & Run All" regenerates exactly the same dataset.
# A local Generator is used so the global `random` / `np.random` state is left
# untouched.
SEED = 42
rng = np.random.default_rng(SEED)

# Evenly spaced timestamps covering the whole observation window
# (both end points included).
start_date = datetime.datetime(1974, 1, 1)
end_date = datetime.datetime(2023, 9, 30)
date_range = pd.date_range(start_date, end_date, periods=num_records)

# Draw every random column in one vectorised call instead of looping
# num_records times in Python. The draw order (city, magnitude, depth) is
# fixed so the output is fully determined by SEED.
city_names = [city["City"] for city in cities]
sampled_cities = rng.choice(city_names, size=num_records)
sampled_magnitudes = rng.uniform(3.0, 9.0, num_records)
sampled_depths = rng.uniform(1.0, 100.0, num_records)

# Column layout of the synthetic earthquake table
data = {
    "Timestamp": date_range,
    "City": sampled_cities,
    "Magnitude": sampled_magnitudes,
    "Depth (km)": sampled_depths,
}

# DataFrame holding the generated earthquake records
earthquake_data = pd.DataFrame(data)



In [2]:
# Preview the first rows of the generated records as a quick sanity check
earthquake_data.head()

Unnamed: 0,Timestamp,City,Magnitude,Depth (km)
0,1974-01-01 00:00:00.000000000,Los Angeles,8.849242,49.372167
1,1974-01-01 00:52:19.609479218,Istanbul,5.875772,24.514697
2,1974-01-01 01:44:39.218958437,Istanbul,3.866932,37.611283
3,1974-01-01 02:36:58.828437656,Kathmandu,4.844319,1.358652
4,1974-01-01 03:29:18.437916875,Los Angeles,6.56826,93.289882


In [3]:
# Summarise row count, per-column dtypes, non-null counts and memory usage
earthquake_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   Timestamp   500000 non-null  datetime64[ns]
 1   City        500000 non-null  object        
 2   Magnitude   500000 non-null  float64       
 3   Depth (km)  500000 non-null  float64       
dtypes: datetime64[ns](1), float64(2), object(1)
memory usage: 15.3+ MB


In [4]:

# Optional step: write the generated records to a CSV file.
# The DataFrame index is left out of the file.
earthquake_data.to_csv(
    '../data/earthquake_data.csv',
    index=False,
)

In [5]:
# Also write the same DataFrame to a pickle file next to the CSV
earthquake_data.to_pickle(
    '../data/earthquake_data.pkl',
)

In [12]:
import pandas as pd

# Cities with their corresponding countries
cities = [
    {"City": "Tokyo", "Country": "Japan"},
    {"City": "San Francisco", "Country": "USA"},
    {"City": "Los Angeles", "Country": "USA"},
    {"City": "Mexico City", "Country": "Mexico"},
    {"City": "Anchorage", "Country": "USA"},
    {"City": "Istanbul", "Country": "Turkey"},
    {"City": "Kathmandu", "Country": "Nepal"}
]

# Hard-coded approximate population per city, in MILLIONS of inhabitants.
# Keyed by city name so a figure cannot silently drift onto the wrong city
# if the `cities` list is reordered or extended (a missing city raises
# KeyError instead). Adjust the figures as needed.
population_millions = {
    "Tokyo": 37.0,
    "San Francisco": 0.883,
    "Los Angeles": 3.8,
    "Mexico City": 21.0,
    "Anchorage": 0.29,
    "Istanbul": 15.0,
    "Kathmandu": 1.0,
}

# Build the City/Country table directly from the records
population_data = pd.DataFrame(cities, columns=["City", "Country"])

# Convert millions to an absolute head-count. The factor is 1_000_000 (the
# previous factor of 1000 stored thousands, e.g. 883 for San Francisco), and
# round() is used instead of int() so float noise such as 289999.99999999994
# is not truncated to the wrong integer.
population_data["Population"] = [
    round(population_millions[city_name] * 1_000_000)
    for city_name in population_data["City"]
]



In [13]:
# Optional step: write the city population table to a CSV file.
# The DataFrame index is left out of the file.
population_data.to_csv(
    '../data/city_population_data.csv',
    index=False,
)


In [14]:
# Also write the city population table to a pickle file next to the CSV
population_data.to_pickle(
    '../data/city_population_data.pkl',
)