<a href="https://colab.research.google.com/github/christinaxliu/research/blob/main/JPL-Caltech/DataFetcher/NASAExoplanetCatalogDataFetcher.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
import requests
import csv
from google.colab import drive

In [25]:
# Mount Google Drive at /content/drive; the CSV produced by this notebook is
# written to a path underneath this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
# One row per exoplanet type handled by this notebook:
#   (value placed in the URL's `planet_type` query parameter,
#    label written to the "pl_type" column of the CSV,
#    number of catalog pages requested for that type)
# NOTE(review): the page counts are hard-coded; presumably they mirror the
# catalog at the time of the run and need updating when it grows - confirm.
_EXOPLANET_TYPE_TABLE = (
    ("Terrestrial", "Terrestrial", 14),
    ("Super+Earth", "Super-Earth", 116),
    ("Neptune-like", "Neptune-Like", 133),
    ("Gas+Giant", "Gas-Giant", 125),
)

# URL planet-type values, in the order in which they are fetched
exoplanet_types = [url_value for url_value, _label, _pages in _EXOPLANET_TYPE_TABLE]

# URL planet-type value -> label stored in the CSV
exoplanet_type_keys_dict = {
    url_value: label for url_value, label, _pages in _EXOPLANET_TYPE_TABLE
}

# URL planet-type value -> number of pages to request
exoplanet_type_num_pages_dict = {
    url_value: pages for url_value, _label, pages in _EXOPLANET_TYPE_TABLE
}

def fetch_webpage(url, timeout=30):
    """Download a web page and return its body as text.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``. Without a timeout a stalled connection would
            block the notebook indefinitely.

    Returns:
        The response text when the server answers with HTTP 200, otherwise an
        empty string. Failures (non-200 status, connection errors, timeouts)
        are reported with ``print`` instead of being raised, so a single bad
        page does not abort a long crawl.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Connection problems and timeouts are handled like a non-200 reply:
        # report the failure and hand back empty content.
        print(f"Failed to fetch the webpage with url: {url}, error: {error}")
        return ""

    if response.status_code != 200:
        print(f"Failed to fetch the webpage with url: {url}, error code: {response.status_code}")
        return ""
    return response.text

def replace_matched_str(input_str, pattern_to_match, target_str):
    """Replace the first occurrence of ``pattern_to_match`` with ``target_str``.

    Note that the single character immediately following the match is dropped
    as well (the tail resumes one position past the end of the match), so
    ``target_str`` has to supply that character again if it should be kept.

    Args:
        input_str: Text to search.
        pattern_to_match: Literal substring to look for (not a regex).
        target_str: Text inserted in place of the match and the character
            after it.

    Returns:
        The rewritten text, or ``input_str`` unchanged when the pattern does
        not occur.
    """
    match_start = input_str.find(pattern_to_match)
    if match_start == -1:
        return input_str

    # Resume one character past the end of the match (see docstring).
    resume_at = match_start + len(pattern_to_match) + 1
    head = input_str[:match_start]
    tail = input_str[resume_at:]
    return head + target_str + tail

def parse_nasa_exoplanet_catalog_webpage(webpage_content):
    """Extract exoplanet names from the HTML of one catalog listing page.

    Every catalog entry is located by its opening
    ``<div class="hds-content-item"><a href="/exoplanet-catalog/`` markup; the
    name is the value of the first ``title="..."`` attribute that follows,
    read up to the next ``">``. Each ``&amp;#8217;`` sequence in a name is
    turned into a plain ASCII apostrophe.

    Args:
        webpage_content: HTML text of the page; may be empty.

    Returns:
        List of exoplanet names in page order. Parsing stops at the first
        entry whose title attribute is missing or cut off, so truncated HTML
        yields the names found up to that point instead of garbage.
    """
    item_marker = "<div class=\"hds-content-item\"><a href=\"/exoplanet-catalog/"
    title_marker = "title=\""
    title_end_marker = "\">"
    apostrophe_entity = "&amp;#8217;"

    exoplanet_names = []
    # Scan with offsets into the original string rather than re-slicing it
    # for every entry, which would copy the rest of the page each time.
    item_start = webpage_content.find(item_marker)
    while item_start != -1:
        title_start = webpage_content.find(title_marker, item_start)
        if title_start == -1:
            break
        title_start += len(title_marker)

        title_end = webpage_content.find(title_end_marker, title_start)
        if title_end == -1:
            break

        raw_name = webpage_content[title_start:title_end]
        exoplanet_names.append(raw_name.replace(apostrophe_entity, "'"))

        item_start = webpage_content.find(item_marker, title_end)
    return exoplanet_names

def save_as_csv(data_dict, csv_file_name):
    """Write a column-oriented dictionary to a CSV file.

    The dictionary keys form the header row and each value is treated as one
    column; row ``i`` of the file holds element ``i`` of every column. Columns
    are combined with ``zip``, so rows stop at the shortest column.

    Args:
        data_dict: Mapping of column name to an iterable of cell values.
        csv_file_name: Path of the file to create (overwritten if present).
    """
    # newline="" leaves line-ending handling to the csv module
    with open(csv_file_name, mode="w", newline="") as csv_file:
        csv_writer = csv.writer(csv_file)

        # Header row: the column names
        csv_writer.writerow(data_dict.keys())

        # Data rows: transpose the columns into per-row tuples
        for record in zip(*data_dict.values()):
            csv_writer.writerow(record)

In [27]:
# Column accumulators for the CSV. exoplanet_type_dict refers to these same
# list objects, so growing the lists also grows the dictionary's columns.
found_expolanet_names = []
found_expolanet_types = []
exoplanet_type_dict = {"pl_name": found_expolanet_names, "pl_type": found_expolanet_types}

# Number of exoplanets fetched so far, keyed by the URL planet-type value
num_exoplanet_per_type_dict = dict.fromkeys(
    ("Terrestrial", "Super+Earth", "Neptune-like", "Gas+Giant"), 0
)

total_num_exoplanets = 0

# Visit every page of every exoplanet type in the NASA Exoplanet Catalog and collect the names
for exoplanet_type in exoplanet_types:
    exoplanet_type_num_pages = exoplanet_type_num_pages_dict[exoplanet_type]
    print(f"Fetch {exoplanet_type} exoplanet data from NASA Exoplanet Catalog with {exoplanet_type_num_pages} pages...\n")
    # Requested `pageno` values run from 1 up to the configured page count
    for page_number in range(1, exoplanet_type_num_pages + 1):
        url = f"https://science.nasa.gov/exoplanets/exoplanet-catalog/?pageno={page_number}&planet_type={exoplanet_type}&content_list=true"
        print(f"Fetch {exoplanet_type} exoplanet data with URL: {url}")

        # Download the page, then pull the exoplanet names out of its HTML
        webpage_content = fetch_webpage(url)
        exoplanet_names = parse_nasa_exoplanet_catalog_webpage(webpage_content)
        num_fetched = len(exoplanet_names)

        num_exoplanet_per_type_dict[exoplanet_type] += num_fetched
        total_num_exoplanets += num_fetched
        print(f"Fetched {num_fetched} exoplanets with type: {exoplanet_type}")

        # Record each name together with the CSV label of its type
        found_expolanet_names.extend(exoplanet_names)
        found_expolanet_types.extend(exoplanet_type_keys_dict[exoplanet_type] for _ in exoplanet_names)
    print(f"Done with fetching of {exoplanet_type} exoplanet data. {num_exoplanet_per_type_dict[exoplanet_type]} {exoplanet_type} exoplanets were fetched.\n")

# Summary of what was collected
print(f"Fetched {total_num_exoplanets} exoplanets from NASA Exoplanet Catalog completed.")
print("Breakdown of number of exoplanets per type:")
for exoplanet_type in exoplanet_types:
    print(f"Number of {exoplanet_type} exoplanets: {num_exoplanet_per_type_dict[exoplanet_type]}")

# Persist the name/type columns as a CSV file on the mounted Google Drive
csv_file_name = '/content/drive/My Drive/Colab Notebooks/research/JPL-Caltech/DataFetcher/NasaCatalogExoplanetType_2025.01.04.csv'
save_as_csv(exoplanet_type_dict, csv_file_name)
print(f"Exoplanets with types are saved as CSV file: {csv_file_name}.")

Fetch Terrestrial exoplanet data from NASA Exoplanet Catalog with 14 pages...

Fetch Terrestrial exoplanet data with URL: https://science.nasa.gov/exoplanets/exoplanet-catalog/?pageno=1&planet_type=Terrestrial&content_list=true
Fetched 15 exoplanets with type: Terrestrial
Fetch Terrestrial exoplanet data with URL: https://science.nasa.gov/exoplanets/exoplanet-catalog/?pageno=2&planet_type=Terrestrial&content_list=true
Fetched 15 exoplanets with type: Terrestrial
Fetch Terrestrial exoplanet data with URL: https://science.nasa.gov/exoplanets/exoplanet-catalog/?pageno=3&planet_type=Terrestrial&content_list=true
Fetched 15 exoplanets with type: Terrestrial
Fetch Terrestrial exoplanet data with URL: https://science.nasa.gov/exoplanets/exoplanet-catalog/?pageno=4&planet_type=Terrestrial&content_list=true
Fetched 15 exoplanets with type: Terrestrial
Fetch Terrestrial exoplanet data with URL: https://science.nasa.gov/exoplanets/exoplanet-catalog/?pageno=5&planet_type=Terrestrial&content_list=t