<a href="https://colab.research.google.com/github/elvissounna/Colab_Training-/blob/main/QuantStatsAnalyzing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
from datetime import datetime, timedelta

# Set random seed for reproducibility
np.random.seed(42)

# Define the time period
end_date = datetime(2023, 12, 31)
start_date = end_date - timedelta(days=5*365)  # Approximately 5 years of data

In [2]:


# Create a function to generate a dataset
def generate_portfolio_dataset():
    # Download actual market data for realism
    tickers = ['AAPL', 'MSFT', 'AMZN', 'GOOGL', 'JPM', 'JNJ', 'PG', 'XOM', 'SPY']

    print("Downloading market data...")
    # Download each ticker separately to ensure we get the data correctly
    market_data = pd.DataFrame()

    for ticker in tickers:
        try:
            # Get data for this ticker
            data = yf.download(ticker, start=start_date, end=end_date)
            # Extract just the Adjusted Close and add it to our dataframe
            if 'Adj Close' in data.columns:
                market_data[ticker] = data['Adj Close']
            else:
                # Fallback to Close if Adj Close is not available
                print(f"Using Close price for {ticker} as Adj Close is not available")
                market_data[ticker] = data['Close']
        except Exception as e:
            print(f"Error downloading {ticker}: {e}")

    # Check if we have data
    if market_data.empty:
        raise ValueError("Failed to download any market data. Please check your internet connection.")

    print(f"Successfully downloaded data for {len(market_data.columns)} tickers")

    # Calculate daily returns
    returns = market_data.pct_change().dropna()

    # Create a hypothetical portfolio with custom weights
    weights = {
        'AAPL': 0.15,
        'MSFT': 0.15,
        'AMZN': 0.10,
        'GOOGL': 0.10,
        'JPM': 0.12,
        'JNJ': 0.12,
        'PG': 0.10,
        'XOM': 0.08,
        'SPY': 0.08  # Some exposure to the S&P 500 ETF
    }

    # Ensure we only use weights for tickers that we have data for
    available_tickers = set(market_data.columns).intersection(set(weights.keys()))
    if len(available_tickers) < len(weights):
        print(f"Warning: Only {len(available_tickers)} out of {len(weights)} tickers have data")

    # Recalculate weights to use only available tickers
    filtered_weights = {ticker: weights[ticker] for ticker in available_tickers}
    # Normalize weights to sum to 1
    weight_sum = sum(filtered_weights.values())
    filtered_weights = {ticker: weight/weight_sum for ticker, weight in filtered_weights.items()}

    # Calculate portfolio returns
    portfolio_returns = pd.Series(0, index=returns.index)
    for ticker, weight in filtered_weights.items():
        portfolio_returns += returns[ticker] * weight

    # Create a synthetic benchmark based on SPY if available, otherwise use portfolio average
    if 'SPY' in returns.columns:
        benchmark_returns = returns['SPY'] * (1 + np.random.normal(0, 0.0005, len(returns)))
    else:
        print("SPY data not available, using portfolio average as benchmark")
        benchmark_returns = portfolio_returns * (1 + np.random.normal(0, 0.001, len(returns)))

    # Create a dataset with asset prices, portfolio value, and benchmark
    # Start with initial investments of $10,000 each
    portfolio_value = (1 + portfolio_returns).cumprod() * 10000
    benchmark_value = (1 + benchmark_returns).cumprod() * 10000

    # Combine into a final dataframe
    dataset = pd.DataFrame({
        'portfolio_returns': portfolio_returns,
        'portfolio_value': portfolio_value,
        'benchmark_returns': benchmark_returns,
        'benchmark_value': benchmark_value
    })

    # Add individual assets to the dataset
    for ticker in filtered_weights.keys():
        if ticker != 'SPY' or 'SPY' not in benchmark_returns.name:  # Skip SPY if it's already the benchmark
            asset_value = (1 + returns[ticker]).cumprod() * 10000
            dataset[f'{ticker}_value'] = asset_value
            dataset[f'{ticker}_returns'] = returns[ticker]

    # Save the dataset
    dataset.to_csv('portfolio_performance_dataset.csv')

    # Create a sample of the data for quick viewing
    asset_allocation = pd.Series(filtered_weights)

    return dataset, asset_allocation, market_data

# Generate the dataset
try:
    dataset, asset_allocation, market_data = generate_portfolio_dataset()

    # Display sample information
    print("\nDataset generation complete.")
    print(f"Time period: {start_date.date()} to {end_date.date()}")
    print(f"Number of trading days: {len(dataset)}")
    print("\nAsset allocation:")
    print(asset_allocation)
    print("\nSample of the generated dataset:")
    print(dataset.head())

    # Also save the asset prices
    market_data.to_csv('asset_prices_dataset.csv')

    print("\nDatasets saved as 'portfolio_performance_dataset.csv' and 'asset_prices_dataset.csv'")
except Exception as e:
    print(f"An error occurred: {e}")
    print("Please try again with a different approach or check if yfinance is working properly.")

Downloading market data...
YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Using Close price for AAPL as Adj Close is not available
Using Close price for MSFT as Adj Close is not available


[*********************100%***********************]  1 of 1 completed


Using Close price for AMZN as Adj Close is not available


[*********************100%***********************]  1 of 1 completed


Using Close price for GOOGL as Adj Close is not available


[*********************100%***********************]  1 of 1 completed


Using Close price for JPM as Adj Close is not available


[*********************100%***********************]  1 of 1 completed


Using Close price for JNJ as Adj Close is not available


[*********************100%***********************]  1 of 1 completed


Using Close price for PG as Adj Close is not available


[*********************100%***********************]  1 of 1 completed


Using Close price for XOM as Adj Close is not available


[*********************100%***********************]  1 of 1 completed

Using Close price for SPY as Adj Close is not available
Successfully downloaded data for 9 tickers

Dataset generation complete.
Time period: 2019-01-01 to 2023-12-31
Number of trading days: 1257

Asset allocation:
SPY      0.08
GOOGL    0.10
PG       0.10
AMZN     0.10
XOM      0.08
MSFT     0.15
JPM      0.12
JNJ      0.12
AAPL     0.15
dtype: float64

Sample of the generated dataset:
            portfolio_returns  portfolio_value  benchmark_returns  \
Date                                                                
2019-01-03          -0.033204      9667.962882          -0.023869   
2019-01-04           0.037624     10031.707215           0.033493   
2019-01-07           0.003054     10062.340645           0.007887   
2019-01-08           0.010750     10170.510015           0.009402   
2019-01-09           0.002530     10196.237786           0.004673   

            benchmark_value   GOOGL_value  GOOGL_returns      PG_value  \
Date                                                


