In [None]:
# Exploratory Data Analysis and Feature Engineering

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

# Load raw data
df = pd.read_csv('data/sample_data.csv')
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell; anywhere else the result is computed and thrown away, so the
# preview is printed explicitly.
print(df.head())

# Basic Info
df.info()  # info() writes its report to stdout by itself
print(df.describe())  # summary statistics were previously discarded as well

# Missing Values
# Per-column count of null entries.
missing = df.isnull().sum()
print("Missing Values:\n", missing)

def _title_and_show(title):
    """Put *title* on the current axes and render the current figure."""
    plt.title(title)
    plt.show()


# Histograms: one panel per numeric feature on a shared figure
numeric_cols = ['magnetic_anomaly', 'REE_concentration', 'rock_density']
df[numeric_cols].hist(figsize=(10, 5), bins=30)
plt.suptitle("Distribution of Numeric Features")
plt.tight_layout()
plt.show()

# Boxplots: a separate figure for each numeric feature
for col in numeric_cols:
    column_values = df[col]
    sns.boxplot(x=column_values)
    _title_and_show(f'Boxplot - {col}')

# Correlation Matrix: pairwise correlations of the numeric features, annotated
corr = df[numeric_cols].corr()
sns.heatmap(corr, cmap='coolwarm', annot=True)
_title_and_show("Correlation Matrix")

# Categorical Counts
# Columns with object or category dtype are treated as categorical.
categorical_frame = df.select_dtypes(include=['object', 'category'])
cat_cols = categorical_frame.columns.tolist()
for col in cat_cols:
    # Text summary first, then the matching bar chart.
    level_counts = df[col].value_counts()
    print(level_counts)
    sns.countplot(data=df, x=col)
    plt.title(f"{col} Counts")
    plt.xticks(rotation=45)  # tilt tick labels by 45 degrees
    plt.show()

# One-Hot Encoding
# scikit-learn renamed `sparse` to `sparse_output` in 1.2 and removed the old
# keyword in 1.4. Try the current spelling first and fall back for older
# installations so the script runs on either side of the rename.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    encoder = OneHotEncoder(sparse=False, drop='first')

if cat_cols:
    encoded = encoder.fit_transform(df[cat_cols])
    # Reuse df's index so the later column-wise concat lines rows up by label
    # even if df does not carry a default 0..n-1 index.
    encoded_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(cat_cols),
        index=df.index,
    )
else:
    # No categorical columns: fitting on a zero-column frame raises, so
    # produce an empty (n_rows x 0) result instead.
    encoded = np.empty((len(df), 0))
    encoded_df = pd.DataFrame(index=df.index)

# Combine Encoded + Cleaned Data
# Work on a copy so the raw frame stays untouched.
df_clean = df.copy()

# Mean-impute the numeric features, each column with its own mean.
numeric_means = df_clean[numeric_cols].mean()
df_clean[numeric_cols] = df_clean[numeric_cols].fillna(numeric_means)

# Swap the raw categorical columns for their one-hot encoded counterparts.
without_categoricals = df_clean.drop(columns=cat_cols)
df_clean = pd.concat([without_categoricals, encoded_df], axis=1)

# Save Preprocessed Output
# Write the cleaned frame without the index column, then confirm on stdout.
output_path = 'data/preprocessed_mineral_data.csv'
df_clean.to_csv(output_path, index=False)
print("Preprocessed data saved to data/preprocessed_mineral_data.csv")
