**Load Stocks Data**

Source: datasets are extracted from https://www.nasdaq.com/ and
https://www.macrotrends.net/

In [2]:
import pandas as pd

# (CSV path, asset name) pairs; the asset name becomes the value of the
# 'Asset' column for every row loaded from that file.
file_paths_and_assets = [
    ("../datasets/HistoricalData_GS.csv", "GS"),
    ("../datasets/HistoricalData_MS.csv", "MS"),
    ("../datasets/HistoricalData_UNG.csv", "UNG"),
    ("../datasets/HistoricalData_PDBC.csv", "PDBC"),
    ("../datasets/HistoricalData_INOD.csv", "INOD"),
    ("../datasets/HistoricalData_LBTYB.csv", "LBTYB"),
    ("../datasets/HistoricalData_WLFC.csv", "WLFC"),
    ("../datasets/HistoricalData_AMSC.csv", "AMSC"),
    ("../datasets/HistoricalData_BMA.csv", "BMA"),
    ("../datasets/HistoricalData_USLM.csv", "USLM"),
    ("../datasets/HistoricalData_USD.csv", "USD"),
    ("../datasets/HistoricalData_FNGU.csv", "FNGU"),
    ("../datasets/HistoricalData_NQX.csv", "NQX"),
    ("../datasets/HistoricalData_NDX.csv", "NDX"),
    ("../datasets/HistoricalData_INDU.csv", "INDU")
]

# Frames that loaded successfully, and the files that could not be processed
dataframes = []
failed_files = []

# Process each file
for file_path, asset_name in file_paths_and_assets:
    try:
        # Load the CSV into a DataFrame
        df = pd.read_csv(file_path)

        # Tag every row with the asset it belongs to
        df['Asset'] = asset_name

        # Keep only the columns used downstream; a missing 'Date' or 'Close'
        # column raises KeyError, which is reported by the handler below
        df = df[['Date', 'Asset', 'Close']]

        # Normalise dates to yyyy-mm-dd strings. errors='coerce' turns
        # unparseable values into NaT instead of raising, so bad dates show up
        # as missing values rather than aborting the load.
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
        df['Date'] = df['Date'].dt.strftime('%Y-%m-%d')

        # Append the processed DataFrame to the list
        dataframes.append(df)
    except Exception as e:
        # Best-effort load: report the failure, remember the file, keep going
        print(f"Error processing file {file_path}: {e}")
        failed_files.append(file_path)

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when nothing could be loaded
if not dataframes:
    raise ValueError(
        "None of the input files could be loaded; check the dataset paths: "
        f"{[path for path, _ in file_paths_and_assets]}"
    )

# Make a partial load impossible to miss before the data is used any further
if failed_files:
    print(
        f"Warning: {len(failed_files)} of {len(file_paths_and_assets)} "
        f"files were skipped: {failed_files}"
    )

# Combine all processed DataFrames into one
final_df = pd.concat(dataframes, ignore_index=True)

# Display the combined DataFrame
print(final_df)

             Date Asset       Close
0      1999-05-04    GS     50.5210
1      1999-05-05    GS     49.6237
2      1999-05-06    GS     48.7695
3      1999-05-07    GS     53.2129
4      1999-05-10    GS     50.7435
...           ...   ...         ...
72198  2014-12-02  INDU  17879.5500
72199  2014-12-01  INDU  17776.8000
72200  2014-11-28  INDU  17828.2400
72201  2014-11-27  INDU  17827.7500
72202  2014-11-26  INDU  17827.7500

[72203 rows x 3 columns]


In [3]:
# Show every distinct asset name in alphabetical order
print(sorted(final_df['Asset'].drop_duplicates()))

['AMSC', 'BMA', 'FNGU', 'GS', 'INDU', 'INOD', 'LBTYB', 'MS', 'NDX', 'NQX', 'PDBC', 'UNG', 'USD', 'USLM', 'WLFC']


In [4]:
# Count the missing values per column (all zeros means the data is complete)
print(final_df.isnull().sum(axis=0))

Date     0
Asset    0
Close    0
dtype: int64


**Insert data into MongoDB**

In [5]:
import os
from dotenv import load_dotenv

# Load environment variables (python-dotenv reads them from a .env file)
load_dotenv()
MONGO_URI = os.getenv("MONGO_URI")

# Fail fast when the URI is missing: os.getenv returns None in that case, and
# MongoClient(None) would fall back to its default host instead of the
# intended deployment.
if not MONGO_URI:
    raise RuntimeError(
        "MONGO_URI is not set; define it in the environment or in a .env file."
    )

In [6]:
from pymongo import MongoClient

# Open a client for the MongoDB deployment identified by MONGO_URI
client = MongoClient(MONGO_URI)

# Select the "robo_advisor" database and, inside it, the
# "historical_prices" collection that receives the price records
db = client.get_database("robo_advisor")
collection = db.get_collection("historical_prices")

In [7]:
# Write the price records into the "historical_prices" collection.
#
# A plain insert_many() appends a fresh copy of every row each time this cell
# is executed, so re-running the notebook would duplicate the whole dataset.
# Upserting on the (Asset, Date) pair keeps the cell idempotent.
from pymongo import UpdateOne

data_dict = final_df.to_dict("records")  # Convert DataFrame to list of dictionaries

if not data_dict:
    raise ValueError("final_df is empty; there is nothing to write to MongoDB.")

# Index the upsert key so each lookup is an index hit rather than a full scan.
# The index is deliberately non-unique so it can be built even if the
# collection already contains duplicates from earlier runs.
collection.create_index([("Asset", 1), ("Date", 1)])

upserts = [
    UpdateOne(
        {"Asset": record["Asset"], "Date": record["Date"]},
        {"$set": {"Close": record["Close"]}},
        upsert=True,
    )
    for record in data_dict
]
# Each operation targets a different (Asset, Date) key, so order is irrelevant
collection.bulk_write(upserts, ordered=False)

print("data successfully inserted into the 'historical_prices' collection.")

data successfully inserted into the 'historical_prices' collection.
