# Retrieve NASA DONKI Data for Coronal Mass Ejections (CME) and Geomagnetic Storms (GST)

In [None]:

# Required imports
import json
import os

import pandas as pd
import requests

# NOTE(review): an API key is hard-coded in this file. Supply it through the
# NASA_API_KEY environment variable instead; the literal is kept only as a
# fallback so runs without that variable behave as before, and it should be
# rotated and removed from source control.
api_key = os.environ.get("NASA_API_KEY", "40877474-1998-4af6-8e32-c65022146ead")

# Constants for the API call (base_url, start_date, end_date and api_key are
# reused when the GST data is requested)
base_url = "https://api.nasa.gov/DONKI/"
specifier_cme = "CME"
start_date = "2013-05-01"
end_date = "2024-05-01"

# Build the query URL for CME data
query_url_cme = f"{base_url}{specifier_cme}?startDate={start_date}&endDate={end_date}&api_key={api_key}"

# Request CME data from the NASA API. requests applies no timeout by default,
# so an explicit one keeps a stalled connection from blocking the cell forever.
response_cme = requests.get(query_url_cme, timeout=60)
if response_cme.status_code != 200:
    raise Exception(f"Failed to retrieve CME data. Status code: {response_cme.status_code}")

# Convert response to JSON
cme_json = response_cme.json()

# Preview the first few results in JSON format
print(json.dumps(cme_json[:5], indent=4))


## Next Steps


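1. Convert `cme_json` into a Pandas DataFrame.
2. Keep only the `activityID`, `startTime`, and `linkedEvents` columns, then reduce the data to CME rows that link to a GST event.
3. Request GST data for the same date range and process it the same way.
4. Merge the CME and GST data on their linked activity IDs and analyze the time between each CME and its GST.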


## Process CME Data

In [None]:

# Convert JSON data to a Pandas DataFrame
cme_df = pd.json_normalize(cme_json)

# Keep only relevant columns: activityID, startTime, and linkedEvents
cme_df = cme_df[["activityID", "startTime", "linkedEvents"]]

# Remove rows with missing linkedEvents
cme_df = cme_df[cme_df["linkedEvents"].notnull()]

# Expand linkedEvents to individual rows: one record per (CME, linked event)
# pair. Iterating the three columns in lockstep avoids a .loc lookup per cell.
expanded_rows = [
    {
        "activityID": activity_id,
        "startTime_CME": start_time,
        "linkedEvent": event,
    }
    for activity_id, start_time, linked_events in zip(
        cme_df["activityID"], cme_df["startTime"], cme_df["linkedEvents"]
    )
    for event in linked_events
]

# Name the columns explicitly so the frame keeps its schema even when no CME
# has a linked event; a column-less frame would make the later "linkedEvent"
# lookup raise KeyError.
cme_expanded_df = pd.DataFrame(
    expanded_rows,
    columns=["activityID", "startTime_CME", "linkedEvent"],
)

# Helper that pulls the "activityID" entry out of one linked-event record
def extract_activityID_from_dict(input_dict):
    """Return ``input_dict["activityID"]``, or None when it cannot be read.

    None is returned when the key is absent or when the argument does not
    support string-key lookup (for example None or NaN), so the function can
    be applied to a column that mixes dicts with missing values.
    """
    try:
        activity_id = input_dict["activityID"]
    except (KeyError, TypeError, ValueError):
        activity_id = None
    return activity_id

# Look up the activityID carried by each linked event (None when unreadable)
cme_expanded_df["GST_ActivityID"] = cme_expanded_df["linkedEvent"].apply(extract_activityID_from_dict)

# Discard linked events that produced no activityID
cme_has_linked_id = cme_expanded_df["GST_ActivityID"].notnull()
cme_expanded_df = cme_expanded_df[cme_has_linked_id]

# Of the remaining rows, retain only those whose linked ID contains "GST"
cme_links_to_gst = cme_expanded_df["GST_ActivityID"].str.contains("GST")
cme_expanded_df = cme_expanded_df[cme_links_to_gst]

# Final layout: cmeID, startTime_CME (as datetime), GST_ActivityID
cme_expanded_df = cme_expanded_df.rename(columns={"activityID": "cmeID"})
cme_expanded_df = cme_expanded_df.drop(columns=["linkedEvent"])
cme_expanded_df["startTime_CME"] = pd.to_datetime(cme_expanded_df["startTime_CME"])

# Show the first rows of the CME table
cme_expanded_df.head()


## Process GST Data

In [None]:

# Build the query URL for GST data (same base URL, date range and API key as
# the CME request)
specifier_gst = "GST"
query_url_gst = f"{base_url}{specifier_gst}?startDate={start_date}&endDate={end_date}&api_key={api_key}"

# Request GST data from the NASA API. requests applies no timeout by default,
# so an explicit one keeps a stalled connection from blocking the cell forever.
response_gst = requests.get(query_url_gst, timeout=60)
if response_gst.status_code != 200:
    raise Exception(f"Failed to retrieve GST data. Status code: {response_gst.status_code}")

# Convert response to JSON
gst_json = response_gst.json()

# Flatten the GST JSON records into a table
gst_df = pd.json_normalize(gst_json)

# Restrict to the three fields used below, then discard storms that have no
# linkedEvents value
gst_columns = ["activityID", "startTime", "linkedEvents"]
gst_df = gst_df[gst_columns]
gst_df = gst_df[gst_df["linkedEvents"].notnull()]

# One row per linked event, renumbered from zero
gst_df = gst_df.explode("linkedEvents")
gst_df = gst_df.reset_index(drop=True)

# Look up the activityID carried by each linked event (None when unreadable)
gst_df["CME_ActivityID"] = gst_df["linkedEvents"].apply(extract_activityID_from_dict)

# Discard linked events that produced no activityID
gst_has_linked_id = gst_df["CME_ActivityID"].notnull()
gst_df = gst_df[gst_has_linked_id]

# Of the remaining rows, retain only those whose linked ID contains "CME"
gst_links_to_cme = gst_df["CME_ActivityID"].str.contains("CME")
gst_df = gst_df[gst_links_to_cme]

# Final layout: gstID, startTime_GST (as datetime), CME_ActivityID
gst_df = gst_df.rename(columns={"activityID": "gstID", "startTime": "startTime_GST"})
gst_df = gst_df.drop(columns=["linkedEvents"])
gst_df["startTime_GST"] = pd.to_datetime(gst_df["startTime_GST"])

# Show the first rows of the GST table
gst_df.head()


## Merge and Analyze Data

In [None]:

# Pair each CME row with the GST row it links to. Both sides must agree: the
# CME's linked GST ID equals the GST's own ID, and the GST's linked CME ID
# equals the CME's own ID.
merged_df = cme_expanded_df.merge(
    gst_df,
    left_on=["GST_ActivityID", "cmeID"],
    right_on=["gstID", "CME_ActivityID"],
)

# Hours from CME start to GST start
cme_to_gst_delay = merged_df["startTime_GST"] - merged_df["startTime_CME"]
merged_df["timeDiff"] = cme_to_gst_delay.dt.total_seconds() / 3600

# Summary statistics of the delay
time_diff_stats = merged_df["timeDiff"].describe()
print(time_diff_stats)

# Write the merged table to CSV without the index column
output_file = "merged_cme_gst_data.csv"
merged_df.to_csv(output_file, index=False)
print(f"Data exported to {output_file}")


In [None]:

# Mock CME records, given as (CME id, start time, linked GST id)
cme_mock_data = [
    {"activityID": cme_id, "startTime": start, "GST_ActivityID": gst_id}
    for cme_id, start, gst_id in [
        ("CME1", "2023-01-01T00:00Z", "GST1"),
        ("CME2", "2023-01-02T12:00Z", "GST2"),
        ("CME3", "2023-01-03T18:30Z", "GST3"),
    ]
]
# Parse the start time into startTime_CME, drop the raw string, rename the ID
cme_mock_df = (
    pd.DataFrame(cme_mock_data)
    .assign(startTime_CME=lambda frame: pd.to_datetime(frame["startTime"]))
    .drop(columns=["startTime"])
    .rename(columns={"activityID": "cmeID"})
)

# Mock GST records, given as (GST id, start time, linked CME id)
gst_mock_data = [
    {"activityID": gst_id, "startTime": start, "CME_ActivityID": cme_id}
    for gst_id, start, cme_id in [
        ("GST1", "2023-01-01T06:00Z", "CME1"),
        ("GST2", "2023-01-02T18:00Z", "CME2"),
        ("GST3", "2023-01-03T20:00Z", "CME3"),
    ]
]
# Same shaping as the CME mock table, with GST column names
gst_mock_df = (
    pd.DataFrame(gst_mock_data)
    .assign(startTime_GST=lambda frame: pd.to_datetime(frame["startTime"]))
    .drop(columns=["startTime"])
    .rename(columns={"activityID": "gstID"})
)

# Pair mock CME and GST rows whose IDs reference each other
merged_mock_df = cme_mock_df.merge(
    gst_mock_df,
    left_on=["GST_ActivityID", "cmeID"],
    right_on=["gstID", "CME_ActivityID"],
)

# Hours from CME start to GST start
mock_delay = merged_mock_df["startTime_GST"] - merged_mock_df["startTime_CME"]
merged_mock_df["timeDiff"] = mock_delay.dt.total_seconds() / 3600

# Summary statistics of the delay
mock_time_diff_stats = merged_mock_df["timeDiff"].describe()

# Report the mock results
print("Mock Data Statistics:")
print(mock_time_diff_stats)

# Write the merged mock table to CSV without the index column
mock_output_file = "mock_merged_cme_gst_data.csv"
merged_mock_df.to_csv(mock_output_file, index=False)
print(f"Mock data exported to {mock_output_file}.")
