In [1]:
%matplotlib inline
import os
import numpy as np
import matplotlib.pyplot as plt

# Import the Parsers
from openquake.cat.parsers.isf_catalogue_reader import ISFReader
from openquake.cat.parsers.converters import (GenericCataloguetoISFParser, GCMTtoISFParser)
from openquake.cat.isc_homogenisor import (HomogenisorPreprocessor,
                                   DynamicHomogenisor,
                                   MagnitudeConversionRule,
                                   DuplicateFinder)
import openquake.cat.catalogue_query_tools as cqt

import unittest
import tempfile
import numpy as np
import pandas as pd
import toml

from openquake.cat.hmg import merge

import tempfile
import shutil

In [2]:
# Resolve every path relative to the directory the notebook is run from.
BASE_PATH = os.getcwd()
# Folder holding the raw input catalogues (used below for the "isf" sub-folder).
data_path = os.path.join(BASE_PATH, "inputs")

# Scratch directory for intermediate files; a new one is created on every run
# and it is removed by the last cell of the notebook.
temp_dir = tempfile.mkdtemp()

In [3]:
# This merges multiple .isf files together

def append_isf_files(source_folder, output_file):
    '''
    Append a series of .isf files that were downloaded from the ISC website.

    Every file in ``source_folder`` whose name ends with ``.isf`` is read in
    sorted (alphabetical) order and written to ``output_file``, each one
    followed by an extra newline.

    :param str source_folder:
        Folder containing the individual ``.isf`` files.
    :param str output_file:
        Path of the merged file. Its parent folder is created when missing
        and any existing file at this path is overwritten.
    :raises Exception:
        Any error met while listing, reading or writing is reported with
        ``print`` and then re-raised, so that later cells do not run against
        a missing or partially written file.
    '''
    try:
        # Make sure the destination folder exists before opening the output.
        output_folder = os.path.dirname(output_file)
        if output_folder:
            os.makedirs(output_folder, exist_ok=True)

        # os.listdir returns names in arbitrary order; sort them so that the
        # merged file is identical from one run (and one platform) to the next.
        isf_names = sorted(
            name for name in os.listdir(source_folder) if name.endswith('.isf')
        )

        output_abspath = os.path.abspath(output_file)
        with open(output_file, 'w') as output:
            for filename in isf_names:
                file_path = os.path.join(source_folder, filename)
                # Never append the output file to itself when it happens to
                # live inside the source folder.
                if os.path.abspath(file_path) == output_abspath:
                    continue
                with open(file_path, 'r') as input_file:
                    output.write(input_file.read() + '\n')

        print("Content of text files appended together successfully.")
    except Exception as e:
        print("An error occurred:", str(e))
        raise

# Folder holding the individual .isf downloads and the path of the merged file.
# Edit these two paths if the input data live elsewhere.
source_folder = os.path.join(data_path, "isf")
output_file = os.path.join(BASE_PATH, "outputs", "1900-2021-PH_ISF_Catalogue.isf")


append_isf_files(source_folder, output_file)

# Move the merged file into temp_dir; the ISF reader below loads it from there.
shutil.move(output_file, temp_dir)
print("moved to Temporary directory:", temp_dir)

Content of text files appended together successfully.
moved to Temporary directory: C:\Users\ENRICO~1.ABC\AppData\Local\Temp\tmpz68255hq


In [4]:
# Agencies retained from the ISC bulletin. The same selection is applied to
# origins and to magnitudes; each keyword gets its own copy of the list.
ISC_AGENCIES = ["ISC-GEM", "ISC-EHB", "EHB", "ISC", "IDC", "NEIC", "NEIS", "USCGS", "NIED", "GCMT", "GUTE", "PAS"]

# All input files are resolved through data_path (defined in the setup cell)
# instead of repeating the "inputs/" prefix for every catalogue.

#ISCGEM
iscgem_parser = GenericCataloguetoISFParser(os.path.join(data_path, "isc-gem-cat.csv"))
iscgem_catalogue = iscgem_parser.parse("ISC-GEM", "ISC-GEM-CAT")

#ISC (merged .isf file written to temp_dir by the previous cell)
isc_parser_1900_2021 = ISFReader(os.path.join(temp_dir, "1900-2021-PH_ISF_Catalogue.isf"),
                   selected_origin_agencies=list(ISC_AGENCIES),
                   selected_magnitude_agencies=list(ISC_AGENCIES))
isc_catalogue_1900_2021 = isc_parser_1900_2021.read_file("ISC-RB", "ISC-1900-2021")

#USGS-COMCAT
usgsComcat_parser = GenericCataloguetoISFParser(os.path.join(data_path, "usgs1900-2023.csv"))
usgsComcat_catalogue = usgsComcat_parser.parse("usgsComcat", "usgsComcat-CAT")

#GCMT
GCMT_catalogue = GCMTtoISFParser(os.path.join(data_path, "gcmt-cat-1976-2020.txt")).parse()

#Pacheco & Sykes 1992
PAS_parser = GenericCataloguetoISFParser(os.path.join(data_path, "ps1992-ph-cat.csv"))
PAS_catalogue = PAS_parser.parse("ps1992-ph", "ps1992-ph-CAT")

#PHIVOLCS
phivolcs_parser = GenericCataloguetoISFParser(os.path.join(data_path, "phivolcs-combined-hmtk-minM4pt5.csv"))
phivolcs_catalogue = phivolcs_parser.parse("phivolcs", "phivolcs-CAT")

# Report the size of every catalogue that was loaded.
# NOTE(review): the USGS label says 1977-2023 while the input file is named
# usgs1900-2023.csv - confirm which period is correct.
print("ISC-GEM Catalogue 1900-2020 contains: %d events" % iscgem_catalogue.get_number_events())
print("ISC Reviewed Bulleting Catalogue 1900-2021 contains: %d events" % isc_catalogue_1900_2021.get_number_events())
print("USGS Catalogue 1977-2023 contains: %d events" % usgsComcat_catalogue.get_number_events())
print("GCMT Catalogue 1976-2020 contains: %d events" % GCMT_catalogue.get_number_events())
print("Pacheco&Sykes1992 in PH Catalogue 1900-1989 contains: %d events" % PAS_catalogue.get_number_events())
print("PHIVOLCS Catalogue 2015-2023 contains: %d events" % phivolcs_catalogue.get_number_events())

Parsing catalogue ...
complete. Contains 56832 moment tensors
ISC-GEM Catalogue 1900-2020 contains: 3993 events
ISC Reviewed Bulleting Catalogue 1900-2021 contains: 68488 events
USGS Catalogue 1977-2023 contains: 31716 events
GCMT Catalogue 1976-2020 contains: 56832 events
Pacheco&Sykes1992 in PH Catalogue 1900-1989 contains: 53 events
PHIVOLCS Catalogue 2015-2023 contains: 1861 events


In [5]:
# Build the HDF5 database for the ISC-GEM catalogue inside temp_dir.
database_file_iscgem = os.path.join(temp_dir, "iscgem-catalogue_db.hdf5")
# Start from a clean file so that a re-run does not reuse a stale database.
if os.path.exists(database_file_iscgem):
    os.remove(database_file_iscgem)
# The return value is not needed here; the call is made for the file it writes
# (presumably the catalogue tables - confirm against the toolkit).
_ = iscgem_catalogue.build_dataframe(hdf5_file=database_file_iscgem)

# Open the database and print, per agency, the number of origins and the
# magnitude types it reports.
db = cqt.CatalogueDB(database_file_iscgem)
agency_count = cqt.get_agency_magtype_statistics(db)

Agency: ISC-GEM - 3993 Origins
Mw (3993)


In [6]:
# Build the HDF5 database for the ISC Reviewed Bulletin catalogue inside temp_dir.
database_file_iscrb = os.path.join(temp_dir, "iscrb-catalogue_db1.hdf5")
# Start from a clean file so that a re-run does not reuse a stale database.
if os.path.exists(database_file_iscrb):
    os.remove(database_file_iscrb)
# The return value is not needed here; the call is made for the file it writes.
_ = isc_catalogue_1900_2021.build_dataframe(hdf5_file=database_file_iscrb)

# Open the database and print the per-agency origin and magnitude-type counts.
# Note: agency_count is overwritten by each of the database cells.
db1 = cqt.CatalogueDB(database_file_iscrb)
agency_count = cqt.get_agency_magtype_statistics(db1)

Agency: ISC - 52964 Origins
mb (50053) | MS (10086)
Agency: IDC - 50238 Origins
mb (50235) | mbtmp (42280) | mb1 (35182) | mb1mx (35164) | MS (20385) | Ms1 (13683) | ms1mx (13664) | ML (11085)
Agency: NEIC - 31889 Origins
mb (29044) | MSZ (1281) | MS (856) | MW (444) | Mww (444) | Mwb (194) | mw (169) | Ms_20 (156) | Mwr (115) | ME (114) | Mw (46) | Me (19) | Mwc (17) | ML (5) | mb_Lg (1)
Agency: ISC-EHB - 16591 Origins
No magnitudes corresponding to this agency
Agency: EHB - 13380 Origins
No magnitudes corresponding to this agency
Agency: NEIS - 4118 Origins
mb (4075) | MSZ (280) | MS (257) | mw (26) | Mb (1)
Agency: GCMT - 3716 Origins
MW (3703) | MS (13)
Agency: NIED - 2108 Origins
Mw (1469) | MW (638)
Agency: USCGS - 1464 Origins
mb (1444) | MS (69)
Agency: GUTE - 214 Origins
UK (74)


In [7]:
# Build the HDF5 database for the USGS ComCat catalogue inside temp_dir.
database_file_usgs = os.path.join(temp_dir, "usgs-catalogue_db2.hdf5")
# Start from a clean file so that a re-run does not reuse a stale database.
if os.path.exists(database_file_usgs):
    os.remove(database_file_usgs)
# The return value is not needed here; the call is made for the file it writes.
_ = usgsComcat_catalogue.build_dataframe(hdf5_file=database_file_usgs)

# Open the database and print the per-agency origin and magnitude-type counts.
db2 = cqt.CatalogueDB(database_file_usgs)
agency_count = cqt.get_agency_magtype_statistics(db2)

Agency: usgsComcat - 31716 Origins
mb (28522) | mwc (1180) | mww (599) | mw (583) | ms (246) | mwr (176) | mwb (159) | m (135) | ml (115) | Mb (1)


In [8]:
# Build the HDF5 database for the GCMT catalogue inside temp_dir.
database_file_gcmt = os.path.join(temp_dir, "gcmt-catalogue_db3.hdf5")
# Start from a clean file so that a re-run does not reuse a stale database.
if os.path.exists(database_file_gcmt):
    os.remove(database_file_gcmt)
# The return value is not needed here; the call is made for the file it writes.
_ = GCMT_catalogue.build_dataframe(hdf5_file=database_file_gcmt)

# Open the database and print the per-agency origin and magnitude-type counts.
# In the printed statistics several agency codes are padded with trailing
# blanks (e.g. "PDE ", "ISC ").
db3 = cqt.CatalogueDB(database_file_gcmt)
agency_count = cqt.get_agency_magtype_statistics(db3)

Agency: GCMT - 56832 Origins
Mw (56832)
Agency: PDEW - 30674 Origins
Ms (22489) | mb (14423)
Agency: PDE  - 21841 Origins
mb (21826) | Ms (13506)
Agency: MLI  - 3065 Origins
mb (3065) | Ms (1850)
Agency: SWEQ - 1143 Origins
Ms (1143)
Agency: REB  - 57 Origins
mb (55) | Ms (2)
Agency: PDEQ - 33 Origins
Ms (33)
Agency: SWEM - 7 Origins
Ms (7)
Agency:      - 5 Origins
mb (4) | Ms (4)
Agency: HSW  - 4 Origins
Ms (4)
Agency: MLE  - 1 Origins
mb (1) | Ms (1)
Agency: ISC  - 1 Origins
mb (1)
Agency: DMC  - 1 Origins
mb (1) | Ms (1)


In [9]:
# Build the HDF5 database for the Pacheco & Sykes (1992) catalogue inside temp_dir.
database_file_PAS = os.path.join(temp_dir, "ps1992-catalogue_db4.hdf5")
# Start from a clean file so that a re-run does not reuse a stale database.
if os.path.exists(database_file_PAS):
    os.remove(database_file_PAS)
# The return value is not needed here; the call is made for the file it writes.
_ = PAS_catalogue.build_dataframe(hdf5_file=database_file_PAS)

# Open the database and print the per-agency origin and magnitude-type counts.
db4 = cqt.CatalogueDB(database_file_PAS)
agency_count = cqt.get_agency_magtype_statistics(db4)

Agency: ps1992-ph - 53 Origins
Ms (53)


In [10]:
# Build the HDF5 database for the PHIVOLCS catalogue inside temp_dir.
database_file_Phivolcs = os.path.join(temp_dir, "phivolcs-catalogue_db4.hdf5")
# Start from a clean file so that a re-run does not reuse a stale database.
if os.path.exists(database_file_Phivolcs):
    os.remove(database_file_Phivolcs)
# Pass the path through the hdf5_file keyword, as every other database cell does.
_ = phivolcs_catalogue.build_dataframe(hdf5_file=database_file_Phivolcs)

# Open the database and print the per-agency origin and magnitude-type counts.
# Note: db4 is rebound here; it previously referred to the Pacheco & Sykes database.
db4 = cqt.CatalogueDB(database_file_Phivolcs)
agency_count = cqt.get_agency_magtype_statistics(db4)

Agency: phivolcs - 1861 Origins
Ms (1369) | Mw (479) | ML (6) | MLv (5) | Mwp (1) | mb (1)


In [11]:
# Tolerances handed to DuplicateFinder for matching events between catalogues.
# NOTE(review): presumably seconds and kilometres - confirm against the
# DuplicateFinder documentation.
time_window = 60
distance_window = 100

# The catalogues are merged one at a time: each step uses the result of the
# previous merge as the reference catalogue and adds one more source to it.
# NOTE(review): confirm whether merge_catalogue modifies the reference
# catalogue in place; if it does, re-running this cell without re-running the
# loading cell would merge the same events twice.
print('merging ISC-RB and usgs-comcat')
merge1 = DuplicateFinder(isc_catalogue_1900_2021, time_window, distance_window, logging=True)
merged1Catalogue = merge1.merge_catalogue(usgsComcat_catalogue)

print('merging gcmt')
merge2 = DuplicateFinder(merged1Catalogue, time_window, distance_window, logging=True)
merged2Catalogue = merge2.merge_catalogue(GCMT_catalogue)

print('merging Pacheco and Sykes')
merge3 = DuplicateFinder(merged2Catalogue, time_window, distance_window, logging=True)
merged3Catalogue = merge3.merge_catalogue(PAS_catalogue)

print('merging phivolcs')
merge4 = DuplicateFinder(merged3Catalogue, time_window, distance_window, logging=True)
merged4Catalogue = merge4.merge_catalogue(phivolcs_catalogue)

# Last step: ISC-GEM is the reference and everything merged so far is added to it.
print('merge to ISCGEM')
merge5 = DuplicateFinder(iscgem_catalogue, time_window, distance_window, logging=True)
merged5Catalogue = merge5.merge_catalogue(merged4Catalogue)

# Bare expression: displays the repr of the final merged catalogue.
merged5Catalogue


merging ISC-RB and usgs-comcat
After duplicate finding: 71315 events (71315)
merging gcmt
After duplicate finding: 124511 events (124511)
merging Pacheco and Sykes
After duplicate finding: 124516 events (124516)
merging phivolcs
After duplicate finding: 124560 events (124560)
merge to ISCGEM
After duplicate finding: 124895 events (124895)


<openquake.cat.isf_catalogue.ISFCatalogue at 0x199df1b36d0>

In [12]:
# Build the HDF5 database for the fully merged catalogue inside temp_dir.
database_file = os.path.join(temp_dir, "merged-catalogue_db1.hdf5")
# Start from a clean file so that a re-run does not reuse a stale database.
if os.path.exists(database_file):
    os.remove(database_file)
# The return value is not needed here; the call is made for the file it writes.
_ = merged5Catalogue.build_dataframe(hdf5_file=database_file)

# Open the database and print the per-agency origin and magnitude-type counts.
# Note: db1 is rebound here; it previously referred to the ISC-RB database.
db1 = cqt.CatalogueDB(database_file)
agency_count = cqt.get_agency_magtype_statistics(db1)

Agency: GCMT - 60548 Origins
Mw (56832) | MW (3703) | MS (13)
Agency: ISC - 52964 Origins
mb (50053) | MS (10086)
Agency: IDC - 50238 Origins
mb (50235) | mbtmp (42280) | mb1 (35182) | mb1mx (35164) | MS (20385) | Ms1 (13683) | ms1mx (13664) | ML (11085)
Agency: NEIC - 31889 Origins
mb (29044) | MSZ (1281) | MS (856) | MW (444) | Mww (444) | Mwb (194) | mw (169) | Ms_20 (156) | Mwr (115) | ME (114) | Mw (46) | Me (19) | Mwc (17) | ML (5) | mb_Lg (1)
Agency: usgsComcat - 31716 Origins
mb (28522) | mwc (1180) | mww (599) | mw (583) | ms (246) | mwr (176) | mwb (159) | m (135) | ml (115) | Mb (1)
Agency: PDEW - 30674 Origins
Ms (22489) | mb (14423)
Agency: PDE  - 21841 Origins
mb (21826) | Ms (13506)
Agency: ISC-EHB - 16591 Origins
No magnitudes corresponding to this agency
Agency: EHB - 13380 Origins
No magnitudes corresponding to this agency
Agency: NEIS - 4118 Origins
mb (4075) | MSZ (280) | MS (257) | mw (26) | Mb (1)
Agency: ISC-GEM - 3993 Origins
Mw (3993)
Agency: MLI  - 3065 Origin

In [13]:
# Origin selection rules: one (time window, agency list) pair per period.
# Only the 1930-1959 window is handled by this notebook.
# NOTE(review): the agency list is presumably ordered by preference - confirm.
# "ISC " (with a trailing blank) is listed in addition to "ISC" on purpose.
origin_rules = [
    (
        "1930/01/01 - 1959/12/31",
        [
            "ISC-GEM",
            "ISC-EHB",
            "EHB",
            "ISC",
            "ISC ",
            "IDC",
            "NEIC",
            "NEIS",
            "USCGS",
            "NIED",
            "GCMT",
            "GUTE",
            "PAS",
            "ps1992-ph",
            "usgsComcat",
            "phivolcs",
        ],
    ),
]

In [14]:
"""
Weatherill (2015) Table 1. Mw conversions
"""
def iscgem_mw(magnitude):
    """
    Identity conversion for Mw reported by ISC-GEM.

    :param magnitude: Mw value as reported
    :returns: the same value, unchanged
    """
    return magnitude

def iscgem_mw_sigma(magnitude):
    """
    Uncertainty added when ISC-GEM Mw is used as is: none.

    :param magnitude: Mw value as reported (not used)
    :returns: 0.0
    """
    return 0.0

def gcmt_mw(magnitude):
    """
    Identity conversion for Mw reported by GCMT.

    :param magnitude: Mw value as reported
    :returns: the same value, unchanged
    """
    return magnitude

def gcmt_mw_sigma(magnitude):
    """
    Uncertainty added when GCMT Mw is used as is: none.

    :param magnitude: Mw value as reported (not used)
    :returns: 0.0
    """
    return 0.0

def neic_mw(magnitude):
    """
    Linear conversion applied to Mw reported by NEIC.

    :param magnitude: Mw value as reported
    :returns: 1.021 * magnitude - 0.091
    """
    slope, intercept = 1.021, -0.091
    return slope * magnitude + intercept

def neic_mw_sigma(magnitude):
    """
    Constant uncertainty attached to the NEIC Mw conversion.

    NOTE(review): the value returned is 0.105, whereas this docstring
    previously quoted 0.101 - confirm which one matches the source table.

    :param magnitude: Mw value as reported (not used)
    :returns: 0.105
    """
    return 0.105

def nied_mw(magnitude):
    """
    Linear conversion applied to Mw reported by NIED.

    :param magnitude: Mw value as reported
    :returns: 0.964 * magnitude + 0.248
    """
    slope, intercept = 0.964, 0.248
    return slope * magnitude + intercept

def nied_mw_sigma(magnitude):
    """
    Constant uncertainty attached to the NIED Mw conversion.

    :param magnitude: Mw value as reported (not used)
    :returns: 0.11
    """
    return 0.11

def isc_ms(magnitude):
    """
    Two-segment linear conversion of Ms reported by ISC to Mw
    (Weatherill, 2015).

    :param magnitude: Ms value as reported
    :returns: 0.994 * magnitude + 0.1 above Ms 6.0,
              0.616 * magnitude + 2.369 otherwise (Ms 6.0 included)
    """
    slope, intercept = (0.994, 0.1) if magnitude > 6.0 else (0.616, 2.369)
    return slope * magnitude + intercept

def isc_ms_sigma(magnitude):
    """
    Magnitude-dependent uncertainty attached to the ISC Ms conversion.

    :param magnitude: Ms value as reported
    :returns: 0.174 above Ms 6.0, 0.147 otherwise (Ms 6.0 included)
    """
    return 0.174 if magnitude > 6.0 else 0.147

def neic_ms(magnitude):
    """
    Two-segment linear conversion of Ms reported by NEIC to Mw
    (Weatherill, 2015).

    :param magnitude: Ms value as reported
    :returns: 1.005 * magnitude - 0.026 above Ms 6.47,
              0.723 * magnitude + 1.798 otherwise (Ms 6.47 included)
    """
    slope, intercept = (1.005, -0.026) if magnitude > 6.47 else (0.723, 1.798)
    return slope * magnitude + intercept

def neic_ms_sigma(magnitude):
    """
    Magnitude-dependent uncertainty attached to the NEIC Ms conversion.

    :param magnitude: Ms value as reported
    :returns: 0.187 above Ms 6.47, 0.159 otherwise (Ms 6.47 included)
    """
    return 0.187 if magnitude > 6.47 else 0.159
    
def neic_msz(magnitude):
    """
    Two-segment linear conversion of Msz reported by NEIC to Mw
    (Weatherill, 2015).

    :param magnitude: Msz value as reported
    :returns: 0.950 * magnitude + 0.359 above Msz 6.47,
              0.707 * magnitude + 1.933 otherwise (Msz 6.47 included)
    """
    slope, intercept = (0.950, 0.359) if magnitude > 6.47 else (0.707, 1.933)
    return slope * magnitude + intercept

def neic_msz_sigma(magnitude):
    """
    Magnitude-dependent uncertainty attached to the NEIC Msz conversion.

    :param magnitude: Msz value as reported
    :returns: 0.204 above Msz 6.47, 0.179 otherwise (Msz 6.47 included)
    """
    return 0.204 if magnitude > 6.47 else 0.179

def neic_mb(magnitude):
    """
    Linear conversion applied to mb reported by NEIC.

    :param magnitude: mb value as reported
    :returns: 1.159 * magnitude - 0.659
    """
    slope, intercept = 1.159, -0.659
    return slope * magnitude + intercept

def neic_mb_sigma(magnitude):
    """
    Constant uncertainty attached to the NEIC mb conversion.

    :param magnitude: mb value as reported (not used)
    :returns: 0.283
    """
    return 0.283

def isc_mb(magnitude):
    """
    Linear conversion applied to mb reported by ISC.

    :param magnitude: mb value as reported
    :returns: 1.084 * magnitude - 0.142
    """
    slope, intercept = 1.084, -0.142
    return slope * magnitude + intercept

def isc_mb_sigma(magnitude):
    """
    Constant uncertainty attached to the ISC mb conversion.

    :param magnitude: mb value as reported (not used)
    :returns: 0.317
    """
    return 0.317

def pas_ms(magnitude):
    """
    Identity conversion for Ms taken from Pacheco & Sykes (1992).

    Their database uses the 20-s period Ms value which, for our purposes,
    is treated as equivalent to MW in the magnitude range
    7.0 <= MW <= 8.0 (Weatherill, 2015). No range check is applied here:
    the value is returned unchanged whatever its size.

    :param magnitude: Ms value as reported
    :returns: the same value, unchanged
    """
    return magnitude

def pas_ms_sigma(magnitude):
    """
    Constant uncertainty attached to Pacheco & Sykes Ms used as Mw.

    :param magnitude: Ms value as reported (not used)
    :returns: 0.2
    """
    return 0.2

def phivolcs_ms(magnitude):
    """
    Two-segment linear conversion of Ms reported by PHIVOLCS
    (own regression).

    :param magnitude: Ms value as reported
    :returns: 0.686 * magnitude + 1.997 above Ms 6.5,
              0.857 * magnitude + 0.888 otherwise (Ms 6.5 included)
    """
    slope, intercept = (0.686, 1.997) if magnitude > 6.5 else (0.857, 0.888)
    return slope * magnitude + intercept

def phivolcs_ms_sigma(magnitude):
    """
    Magnitude-dependent uncertainty attached to the PHIVOLCS Ms conversion
    (own regression).

    :param magnitude: Ms value as reported
    :returns: 0.072 above Ms 6.5, 0.185 otherwise (Ms 6.5 included)
    """
    return 0.072 if magnitude > 6.5 else 0.185

def phivolcs_mw(magnitude):
    """
    Two-segment linear conversion of Mw reported by PHIVOLCS
    (own regression).

    :param magnitude: Mw value as reported
    :returns: 1.072 * magnitude - 0.562 above Mw 6.5,
              0.887 * magnitude + 0.640 otherwise (Mw 6.5 included)
    """
    slope, intercept = (1.072, -0.562) if magnitude > 6.5 else (0.887, 0.640)
    return slope * magnitude + intercept

def phivolcs_mw_sigma(magnitude):
    """
    Magnitude-dependent uncertainty attached to the PHIVOLCS Mw conversion
    (own regression).

    :param magnitude: Mw value as reported
    :returns: 0.114 above Mw 6.5, 0.116 otherwise (Mw 6.5 included)
    """
    return 0.114 if magnitude > 6.5 else 0.116

In [15]:
# Magnitude conversion rules for the 1930-1959 window. Each rule pairs an
# (agency, magnitude scale) with a conversion function and its sigma function.
# NOTE(review): the list order is presumably the order of preference when an
# event has several candidate magnitudes - confirm against the homogenisor.
# NOTE(review): scale labels are spelled with mixed case across the rules
# ("Ms" / "MS" / "ms", "Mb" / "mb", "Msz"), while the agency statistics
# printed earlier list e.g. "MS", "MSZ" and "mb". Confirm whether rule
# matching is case-sensitive; if it is, several rules below never apply.
rule_set_1930_1959 = [
    # Direct Mw sources, taken as is.
    MagnitudeConversionRule("ISC-GEM", "Mw", iscgem_mw, iscgem_mw_sigma),

    MagnitudeConversionRule("GCMT", "Mw", gcmt_mw, gcmt_mw_sigma),

    # Mw from NEIC / NEIS (same model for both agency codes).
    MagnitudeConversionRule("NEIC", "Mw", neic_mw, neic_mw_sigma),
    MagnitudeConversionRule("NEIS", "Mw", neic_mw, neic_mw_sigma),

    MagnitudeConversionRule("NIED", "Mw", nied_mw, nied_mw_sigma),

    # Ms from ISC; IDC reuses the ISC model.
    MagnitudeConversionRule("ISC", "Ms", isc_ms, isc_ms_sigma),
    MagnitudeConversionRule("IDC", "Ms", isc_ms, isc_ms_sigma),

    # Ms from NEIC and the agency codes treated like it.
    MagnitudeConversionRule("NEIC", "Ms", neic_ms, neic_ms_sigma),
    MagnitudeConversionRule("NEIS", "MS", neic_ms, neic_ms_sigma),
    MagnitudeConversionRule("USCGS", "ms", neic_ms, neic_ms_sigma),
    

    MagnitudeConversionRule("NEIC", "Msz", neic_msz, neic_msz_sigma),
    MagnitudeConversionRule("NEIS", "Msz", neic_msz, neic_msz_sigma),

    # Body-wave magnitudes.
    MagnitudeConversionRule("NEIC", "Mb", neic_mb, neic_mb_sigma),
    MagnitudeConversionRule("NEIS", "Mb", neic_mb, neic_mb_sigma),
    MagnitudeConversionRule("USCGS", "mb", neic_mb, neic_mb_sigma),


    MagnitudeConversionRule("ISC", "Mb", isc_mb, isc_mb_sigma),
    
    # Pacheco & Sykes (1992): once under the ISC bulletin agency code and
    # once under the code given to the stand-alone CSV catalogue.
    MagnitudeConversionRule("PAS", "Ms", pas_ms, pas_ms_sigma),
    MagnitudeConversionRule("ps1992-ph", "Ms", pas_ms, pas_ms_sigma),

    # USGS ComCat magnitudes reuse the NEIC models.
    MagnitudeConversionRule("usgsComcat", "mwc", neic_mw, neic_mw_sigma),
    MagnitudeConversionRule("usgsComcat", "mww", neic_mw, neic_mw_sigma),
    MagnitudeConversionRule("usgsComcat", "mw", neic_mw, neic_mw_sigma),
    MagnitudeConversionRule("usgsComcat", "mwr", neic_mw, neic_mw_sigma),
    MagnitudeConversionRule("usgsComcat", "mwb", neic_mw, neic_mw_sigma),
    MagnitudeConversionRule("usgsComcat", "ms", neic_ms, neic_ms_sigma),
    MagnitudeConversionRule("usgsComcat", "mb", neic_mb, neic_mb_sigma),

    # PHIVOLCS magnitudes, converted with the regressions defined above.
    MagnitudeConversionRule("phivolcs", "Mw", phivolcs_mw, phivolcs_mw_sigma),
    MagnitudeConversionRule("phivolcs", "Ms", phivolcs_ms, phivolcs_ms_sigma),

]

# One (time window, rule set) pair per period; the window string is the same
# one used in origin_rules.
magnitude_rules = [
    ("1930/01/01 - 1959/12/31", rule_set_1930_1959)
]

In [16]:
# Pre-process the merged catalogue with the time-window based rules, then run
# the homogenisor on the result. With logging=True one line per event is
# printed (event id, selected magnitude rule, origin agency | magnitude rule).
# Note the argument order: execute() takes (origin_rules, magnitude_rules)
# while homogenise() takes (magnitude_rules, origin_rules).
preprocessor = HomogenisorPreprocessor("time")
pp_catalogue = preprocessor.execute(merged5Catalogue, origin_rules, magnitude_rules)
harmonisor_1930_1959 = DynamicHomogenisor(pp_catalogue, logging=True)
homogenised_catalogue_1930_1959 = harmonisor_1930_1959.homogenise(magnitude_rules, origin_rules)
print("Merged Catalogue 1930-1959 contains: %d events" % homogenised_catalogue_1930_1959.get_number_events())

907206 ISC-GEM-Mw ISC-GEM|ISC-GEM-Mw
907275 ISC-GEM-Mw ISC-GEM|ISC-GEM-Mw
907279 ISC-GEM-Mw ISC-GEM|ISC-GEM-Mw
907303 ISC-GEM-Mw ISC-GEM|ISC-GEM-Mw
907304 ISC-GEM-Mw ISC-GEM|ISC-GEM-Mw
907410 ISC-GEM-Mw ISC-GEM|ISC-GEM-Mw
907427 PAS-Ms GUTE|PAS-Ms
907443 ISC-GEM-Mw ISC-GEM|ISC-GEM-Mw
907510 PAS-Ms GUTE|PAS-Ms
907538 ISC-GEM-Mw ISC-GEM|ISC-GEM-Mw
907610 ISC-GEM-Mw ISC-GEM|ISC-GEM-Mw
907611 ISC-GEM-Mw ISC-GEM|ISC-GEM-Mw
907626 ISC-GEM-Mw ISC-GEM|ISC-GEM-Mw
907727 PAS-Ms ISC|PAS-Ms
907790 ISC-GEM-Mw ISC-GEM|ISC-GEM-Mw
907791 ISC-GEM-Mw ISC-GEM|ISC-GEM-Mw
907805 ISC-GEM-Mw ISC-GEM|ISC-GEM-Mw
907817 PAS-Ms GUTE|PAS-Ms
907822 ISC-GEM-Mw ISC-GEM|ISC-GEM-Mw
907824 ISC-GEM-Mw ISC-GEM|ISC-GEM-Mw
906543 ISC-GEM-Mw ISC-GEM|ISC-GEM-Mw
906588 ISC-GEM-Mw ISC-GEM|ISC-GEM-Mw
906642 ISC-GEM-Mw ISC-GEM|ISC-GEM-Mw
906681 ISC-GEM-Mw ISC-GEM|ISC-GEM-Mw
906682 ISC-GEM-Mw ISC-GEM|ISC-GEM-Mw
906684 ISC-GEM-Mw ISC-GEM|ISC-GEM-Mw
906692 ISC-GEM-Mw ISC-GEM|ISC-GEM-Mw
906783 ISC-GEM-Mw ISC-GEM|ISC-GEM-Mw
906789 IS

In [17]:
# Write the homogenisor log to a CSV file inside temp_dir.
# The file is deleted together with temp_dir by the last cell of the notebook.
log_file = os.path.join(temp_dir, "1930-1959-merged-homogenisor_log.csv")
# Remove any previous log so the dump starts from a clean file.
if os.path.exists(log_file):
    os.remove(log_file)

harmonisor_1930_1959.dump_log(log_file)
print("log file saved in Temporary directory:", log_file)


log file saved in Temporary directory: C:\Users\ENRICO~1.ABC\AppData\Local\Temp\tmpz68255hq\1930-1959-merged-homogenisor_log.csv


In [18]:
# Export the homogenised catalogue to a CSV file in the "outputs" folder.
# A plain string is used: the path has no placeholders, so no f-string is needed.
output_catalogue_file = "outputs/1930-1959-merged-homogeneous_catalogue.csv"
# Make sure the destination folder exists before exporting.
os.makedirs(os.path.dirname(output_catalogue_file), exist_ok=True)
# Remove any previous export so the new one starts from a clean file.
if os.path.exists(output_catalogue_file):
    os.remove(output_catalogue_file)
harmonisor_1930_1959.export_homogenised_to_csv(output_catalogue_file)

print("catalogue saved in cwd:", output_catalogue_file)

catalogue saved in cwd: outputs/1930-1959-merged-homogeneous_catalogue.csv


In [19]:
# Keep only the first 20 comma-separated fields of every line of the exported
# catalogue and write the result next to it in the "outputs" folder.
input_file_path = "outputs/1930-1959-merged-homogeneous_catalogue.csv"
output_file_path = "outputs/1930-1959-merged-homogeneous_catalogue-cleaned.csv"

with open(input_file_path, "r") as input_file, open(output_file_path, "w") as output_file:
    for line in input_file:
        # Drop the line terminator before splitting. Otherwise a line with 20
        # fields or fewer keeps its own "\n" in the last field and, with the
        # "\n" added below, produces an empty row in the output.
        parts = line.rstrip("\r\n").split(",")

        # Keep only the first 20 parts (up to the 20th comma)
        truncated_line = ",".join(parts[:20]) + "\n"

        # Write the modified line to the output file
        output_file.write(truncated_line)

print("CSV file manipulation completed.")


CSV file manipulation completed.


In [20]:
# Rewrite the exported catalogue so that every line has exactly 21 fields:
# the first 20 are kept as they are and whatever follows the 20th comma is
# folded into one last field, its pieces joined with " || ".
input_file_path = "outputs/1930-1959-merged-homogeneous_catalogue.csv"
output_file_path = os.path.join(temp_dir, "1930-1959-merged-homogeneous_catalogue-cleaned.csv")

delimiter = ","  # Replace with your CSV delimiter

with open(input_file_path, "r") as input_file, open(output_file_path, "w") as output_file:
    for line in input_file:
        parts = line.strip().split(delimiter)

        # Leading fields stay separate; the remainder (possibly empty)
        # becomes the single trailing field.
        kept_fields = parts[:20]
        kept_fields.append(" || ".join(parts[20:]))

        output_file.write(delimiter.join(kept_fields) + "\n")

print("CSV file manipulation completed.")

# The filtering cell reads the cleaned file through this name.
cleaned_file_path = output_file_path


CSV file manipulation completed.


In [21]:
df = pd.read_csv(cleaned_file_path)

# Bounding box of the study area, in decimal degrees (both ends inclusive).
lat_min, lat_max = 1.89, 24
lon_min, lon_max = 114, 129.5

# Keep only the events located inside the box.
inside_box = (
    df['latitude'].between(lat_min, lat_max)
    & df['longitude'].between(lon_min, lon_max)
)
filtered_df = df[inside_box]

# Save the filtered catalogue.
# NOTE(review): this path is the same file the cleaning cells read as their
# input, so the unfiltered export is overwritten here.
output_csv_path = "outputs/1930-1959-merged-homogeneous_catalogue.csv"
filtered_df.to_csv(output_csv_path, index=False)

print("Filtered DataFrame saved to CSV:", output_csv_path)

Filtered DataFrame saved to CSV: outputs/1930-1959-merged-homogeneous_catalogue.csv


In [22]:
# Remove the temporary directory and everything written to it (merged ISF
# file, HDF5 databases, homogenisor log, cleaned CSV) when you are done.
# shutil is already imported in the first cell, so it is not imported again here.
shutil.rmtree(temp_dir)
print("Temporary directory and its contents removed.")

Temporary directory and its contents removed.
