In [2]:
# =============================================
# Preprocessing OMNI Solar Wind Dataset
# =============================================

import pandas as pd

# 1. Load the raw TXT file (whitespace-delimited, no header row)
file_path = "data/OMNI_dataset_20252026.txt"  # update path if different
# sep=r"\s+" is the supported replacement for the deprecated
# delim_whitespace=True, which emits a FutureWarning on recent pandas.
df = pd.read_csv(file_path, sep=r"\s+", header=None)

# 2. Add column names
df.columns = [
    "Year", "DOY", "Hour",
    "Bz_GSM_nT",
    "Proton_Temp_K",
    "Proton_Density_cm3",
    "Solar_Wind_Speed_kms",
    "Flow_Pressure_nPa",
    "Kp_x10"
]

# 3. Mark OMNI fill (sentinel) values as missing.
# The raw file has no blanks/NaNs: data gaps are written as out-of-range
# placeholder numbers. Without this step the median fill below does nothing
# and the placeholders leak into the cleaned data.
# NOTE(review): values taken from the OMNI2 hourly format description;
# verify them against the format/header file that came with this download.
fill_values = {
    "Bz_GSM_nT": 999.9,
    "Proton_Temp_K": 9999999.0,
    "Proton_Density_cm3": 999.9,
    "Solar_Wind_Speed_kms": 9999.0,
    "Flow_Pressure_nPa": 99.99,
    "Kp_x10": 99,
}
df = df.replace(fill_values, float("nan"))

# 4. Handle missing values (fill with the per-column median)
# numeric_only=True keeps this working even if a non-numeric column sneaks in.
df = df.fillna(df.median(numeric_only=True))

# 5. Convert Kp × 10 to actual Kp
# Done after imputation so Kp is always exactly Kp_x10 / 10.
df["Kp"] = df["Kp_x10"] / 10

# 6. Ensure numeric data types
numeric_cols = [
    "Bz_GSM_nT",
    "Proton_Temp_K",
    "Proton_Density_cm3",
    "Solar_Wind_Speed_kms",
    "Flow_Pressure_nPa",
    "Kp"
]
df[numeric_cols] = df[numeric_cols].astype(float)

# 7. Save cleaned dataset
clean_file_path = "data/OMNI_dataset_20252026.csv"
df.to_csv(clean_file_path, index=False)

print(f"Preprocessing complete! Clean dataset saved to: {clean_file_path}")
df.head()


Preprocessing complete! Clean dataset saved to: data/OMNI_dataset_20252026.csv


  df = pd.read_csv(file_path, delim_whitespace=True, header=None)


Unnamed: 0,Year,DOY,Hour,Bz_GSM_nT,Proton_Temp_K,Proton_Density_cm3,Solar_Wind_Speed_kms,Flow_Pressure_nPa,Kp_x10,Kp
0,2025,1,0,-1.0,178561.0,19.6,427.0,6.66,40,4.0
1,2025,1,1,-3.2,98035.0,16.8,446.0,6.07,40,4.0
2,2025,1,2,-2.9,130848.0,22.3,434.0,7.77,40,4.0
3,2025,1,3,-1.1,140511.0,20.0,454.0,7.46,53,5.3
4,2025,1,4,-8.3,131587.0,31.9,482.0,13.76,53,5.3


In [3]:
# Export a small preview file: only the first 100 rows of the cleaned frame.
df.iloc[:100].to_csv("data/sample_OMNI.csv", index=False)
