# Load Dataset

### Import Libraries

In [None]:
import os

import pandas as pd
from dotenv import load_dotenv
from pymongo import MongoClient

### Load Historical Prices

In [None]:
# Load environment variables
# (presumably from the local .env file referred to in the error message below;
# this has to run before the os.getenv lookup)
load_dotenv()

# MongoDB Connection Setup
# The connection string comes from the MONGO_URI environment variable. Fail fast
# with an explicit error when it is unset or empty instead of failing later
# inside the database client.
MONGO_URI = os.getenv("MONGO_URI")
if not MONGO_URI:
    raise ValueError("MONGO_URI not found in environment variables. Please check your .env file.")

# Name of the database that the loader functions read from, and of the
# collection holding the historical prices.
DATABASE_NAME = "robo_advisor"
HISTORICAL_PRICES_COLLECTION = "historical_prices"

# Load Historical Prices
def load_historical_prices():
    """
    Load every document of the historical prices collection into a DataFrame.

    Connects to MongoDB with the module-level ``MONGO_URI``, reads all
    documents from ``DATABASE_NAME`` / ``HISTORICAL_PRICES_COLLECTION``
    (without the ``_id`` field) and prints how many records were loaded.

    Returns:
        pd.DataFrame: One row per document, or an empty DataFrame when the
        collection contains no documents.
    """
    # Use the client as a context manager so its connections are released as
    # soon as the documents have been materialised; previously the client was
    # created on every call and never closed.
    with MongoClient(MONGO_URI) as client:
        collection = client[DATABASE_NAME][HISTORICAL_PRICES_COLLECTION]
        # Exclude MongoDB `_id` field for simplicity. The cursor must be fully
        # consumed (list) before the client is closed.
        data = list(collection.find({}, {"_id": 0}))

    if not data:
        print("No data found in historical_prices collection.")
        return pd.DataFrame()

    # Convert to DataFrame
    df = pd.DataFrame(data)
    print(f"Loaded {len(df)} records from historical_prices collection.")
    return df

# Load the collection into a DataFrame. The bare `.head()` expression on the
# last line is what the notebook cell renders as its output (a preview of the
# first rows).
historical_prices_df = load_historical_prices()
historical_prices_df.head()

### Load Asset Metadata

In [None]:
# Asset Metadata Collection
# Name of the MongoDB collection read by load_asset_metadata(); it lives in the
# same database (DATABASE_NAME) as the historical prices.
ASSET_METADATA_COLLECTION = "asset_metadata"

# Load Asset Metadata
def load_asset_metadata():
    """
    Load every document of the asset metadata collection into a DataFrame.

    Connects to MongoDB with the module-level ``MONGO_URI``, reads all
    documents from ``DATABASE_NAME`` / ``ASSET_METADATA_COLLECTION``
    (without the ``_id`` field) and prints how many records were loaded.

    Returns:
        pd.DataFrame: One row per document, or an empty DataFrame when the
        collection contains no documents.
    """
    # Use the client as a context manager so its connections are released as
    # soon as the documents have been materialised; previously the client was
    # created on every call and never closed.
    with MongoClient(MONGO_URI) as client:
        collection = client[DATABASE_NAME][ASSET_METADATA_COLLECTION]
        # Exclude MongoDB `_id` field for simplicity. The cursor must be fully
        # consumed (list) before the client is closed.
        data = list(collection.find({}, {"_id": 0}))

    if not data:
        print("No data found in asset_metadata collection.")
        return pd.DataFrame()

    # Convert to DataFrame
    df = pd.DataFrame(data)
    print(f"Loaded {len(df)} records from asset_metadata collection.")
    return df

# Load the collection into a DataFrame. The bare `.head()` expression on the
# last line is what the notebook cell renders as its output (a preview of the
# first rows).
asset_metadata_df = load_asset_metadata()
asset_metadata_df.head()

# Data Cleaning

Ensure all data is clean and consistent:
- Handle missing values (e.g., impute or drop).
- Normalize date formats (e.g., parse every `Date` field into one consistent datetime type).
- Standardize numerical fields (e.g., ensure returns and volatility are stored as floats).
- Drop irrelevant or redundant fields.

### Check if there are any missing values

In [None]:
import missingno as msno
import matplotlib.pyplot as plt

# Function to visualize missing values
def visualize_missing_values(df, title="Missing Values"):
    """
    Report how many cells of a DataFrame are null and plot where they occur.

    Args:
        df (pd.DataFrame): DataFrame to inspect.
        title (str): Title drawn on the missingno matrix plot.

    Returns:
        None. A summary line is always printed; the matrix plot is only shown
        when at least one value is missing.
    """
    # Column-wise null counts summed into one grand total, computed once and
    # reused for both the check and the message.
    missing_total = df.isnull().sum().sum()

    if missing_total != 0:
        print(f"Total missing values: {missing_total}")
        msno.matrix(df, figsize=(10, 6), fontsize=12)
        plt.title(title, fontsize=16)
        plt.show()
    else:
        print("No missing values detected.")

# Visualize missing values for historical_prices
# (a label is printed first so the two reports can be told apart in the output)
print("Visualizing missing values for 'historical_prices':")
visualize_missing_values(historical_prices_df, title="Missing Values in Historical Prices")

# Visualize missing values for asset_metadata
# (the leading "\n" separates this report from the previous one)
print("\nVisualizing missing values for 'asset_metadata':")
visualize_missing_values(asset_metadata_df, title="Missing Values in Asset Metadata")

**Observation**

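_TODO: record the findings from the output above — which columns (if any) contain missing values, how many, and whether they will be imputed or dropped._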

### Normalize date formats

In [None]:
# Normalize 'Date' column in the historical_prices DataFrame
def normalize_date_format(df, date_column="Date"):
    """
    Convert a date column of a DataFrame to pandas datetime values.

    The column is parsed with ``pd.to_datetime(..., errors="coerce")``, so
    values that cannot be parsed become ``NaT`` instead of raising. The values
    stay datetimes; they are not re-formatted into strings.

    Note:
        The conversion is written back into ``df`` itself (the caller's
        DataFrame is modified in place) and the same object is returned.

    Args:
        df (pd.DataFrame): DataFrame containing the date column.
        date_column (str): Name of the date column to normalize.

    Returns:
        pd.DataFrame: The same DataFrame with the date column converted, or
        the DataFrame unchanged when the column does not exist.
    """
    if date_column not in df.columns:
        print(f"Column '{date_column}' not found in DataFrame.")
        return df

    original_values = df[date_column]

    # Convert to datetime; unparseable entries are coerced to NaT
    parsed_values = pd.to_datetime(original_values, errors="coerce")

    # Count only entries that held a value but failed to parse. Entries that
    # were already missing are not "invalid dates" and previously inflated
    # this number.
    invalid_dates = int((parsed_values.isna() & original_values.notna()).sum())

    df[date_column] = parsed_values

    if invalid_dates > 0:
        print(f"Warning: {invalid_dates} invalid dates found. They have been set to NaT.")

    return df

# Normalize the 'Date' column in the historical_prices DataFrame
# (the result is re-assigned to the same name, replacing the previous reference)
historical_prices_df = normalize_date_format(historical_prices_df, date_column="Date")

# Display sample data
# (print() gives a plain-text preview of the first rows)
print(historical_prices_df.head())