<a href="https://colab.research.google.com/github/chielgroen1998/RAAM/blob/main/RAAM_(momentum).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
!pip install yfinance
!pip install plotly




In [26]:
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import datetime
import time

# Define the ticker symbols for the stocks
# Universe of symbols attempted by main(); symbols without usable data or
# with too little history are dropped there.
# NOTE(review): the recorded run below logs "$FISV: possibly delisted" and
# returns no data for FISV - confirm whether that symbol is still valid.
ticker_symbols = [
    "AAPL", "MSFT", "GOOG", "AMZN", "NVDA", "META", "TSLA", "PEP", "AVGO",
    "COST", "CSCO", "ADBE", "NFLX", "TMUS", "TXN", "CMCSA", "QCOM", "INTC", "HON",
    "AMD", "AMGN", "INTU", "ISRG", "BKNG", "MDLZ", "ADI", "LRCX", "VRTX", "MU",
    "AMAT", "SBUX", "GILD", "MRNA", "ADP", "PANW", "FISV", "CSX", "REGN", "MNST",
    "KLAC", "MAR", "NXPI", "ORLY", "ADSK", "MCHP", "AEP", "KDP", "SNPS",
    "FTNT", "IDXX", "LULU", "EXC", "CTAS", "PAYX", "XEL", "PCAR", "ODFL", "VRSK",
    "WBA", "CDNS", "AZN", "DLTR", "EBAY", "BIIB", "ROST", "CRWD", "CHTR",
    "FAST", "PDD", "ANSS", "MRVL", "TEAM", "WDAY", "BKR", "DDOG", "ZS", "CEG",
    "KHC", "VRSN", "CTSH", "SWKS", "OKTA", "EA", "LCID", "BIDU", "ALGN",
    "MELI", "JD", "LI", "NTES", "ASML", "DXCM", "CPRT"
]

# Parameters - all UTC timestamps
# startdate / enddate bound the history request in download_stock_data().
# cutoff_date: a ticker is kept only if its first data point is on or before it.
startdate = '2000-01-01'
enddate = '2025-12-31'
cutoff_date = '2014-07-01'

# Look-back lengths. Prices are downloaded at a weekly interval, so windows
# applied directly to the price table count weekly rows.
# mom_p:      shift distance and rolling window of the momentum calculation
# vol_p:      rolling window of the return standard deviation
# RSI_p:      passed as `com` to the exponential averages of gains and losses
# ass_amount: number of lowest-score tickers selected per date
# MA_p:       rolling window of the moving-average trend filter
# cor_p:      rolling window applied to the monthly-resampled returns
mom_p = 20 #26
vol_p = 35
RSI_p = 40
ass_amount = 6
MA_p = 40
cor_p = 40 # monthly

# Create reference timestamps with timezone
# CUTOFF_TS is compared against each series' first index value in main();
# START_TS and END_TS are not referenced by the code visible here.
START_TS = pd.Timestamp(startdate).tz_localize('UTC')
END_TS = pd.Timestamp(enddate).tz_localize('UTC')
CUTOFF_TS = pd.Timestamp(cutoff_date).tz_localize('UTC')

def download_stock_data(ticker):
    """
    Download weekly adjusted closing prices for a single ticker.

    The request covers the module-level ``startdate``..``enddate`` range at a
    one-week interval with ``auto_adjust=True``.

    Parameters:
    ticker (str): Symbol to download.

    Returns:
    pd.Series or None: The 'Close' column with missing values removed, or
    None when no usable prices came back or the request raised.
    """
    try:
        print(f"\nDownloading data for {ticker}...")

        # Create a Ticker object and download the historical data
        stock = yf.Ticker(ticker)
        data = stock.history(
            start=startdate,
            end=enddate,
            interval='1wk',
            auto_adjust=True  # This ensures we get adjusted prices
        )

        if data.empty:
            print(f"No data available for {ticker}")
            return None

        # Use 'Close' instead of 'Adj Close' since auto_adjust=True.
        # Rows without a close are dropped so that the emptiness check below
        # and the caller's "first available date" test see real prices only.
        prices = data['Close'].dropna()

        if prices.empty:
            print(f"No price data for {ticker}")
            return None

        print(f"{ticker}: Got {len(prices)} prices from {prices.index[0]} to {prices.index[-1]}")

        return prices

    except Exception as e:
        # Best-effort download: report the problem and let the caller skip
        # this ticker instead of aborting the whole run.
        print(f"Error downloading {ticker}: {str(e)}")
        return None

    finally:
        # Pause after every request - including empty and failed ones, which
        # is when throttling matters most - to avoid rate limiting.
        time.sleep(1)

def main():
    """
    Download every ticker in ``ticker_symbols`` and save the combined prices.

    A ticker is kept only when data was returned and its first data point is
    on or before ``CUTOFF_TS``. The kept series are combined into one
    DataFrame (one column per ticker) and written to
    'combined_stock_data.csv'. Progress and a summary are printed.
    """
    print("Starting download of stock data...")

    # Download and store the data
    all_data = {}
    successful_downloads = 0
    failed_downloads = 0          # nothing usable was returned
    skipped_short_history = 0     # data returned, but it starts after the cutoff
    long_history_tickers = []

    # Download all data and keep the stocks with sufficient history
    for ticker in ticker_symbols:
        series = download_stock_data(ticker)
        if series is None or series.empty:
            failed_downloads += 1
            continue

        # Ensure index is timezone aware so it can be compared with CUTOFF_TS
        if series.index.tz is None:
            series.index = series.index.tz_localize('UTC')

        # Check if the stock has data from before our cutoff date
        if not series.index[0] <= CUTOFF_TS:
            print(f"Skipping {ticker} - insufficient history (starts from {series.index[0]})")
            skipped_short_history += 1
            continue

        long_history_tickers.append(ticker)
        all_data[ticker] = series
        successful_downloads += 1
        print(f"Successfully processed {ticker} (full history)")

    # Create DataFrame and save results
    if all_data:
        # Convert to DataFrame
        combined_data = pd.DataFrame(all_data)

        # Save to CSV
        combined_data.to_csv('combined_stock_data.csv')

        print("\nDownload Summary:")
        print(f"Successfully downloaded: {successful_downloads} stocks")
        print(f"Failed downloads: {failed_downloads} stocks")
        print(f"Skipped (insufficient history): {skipped_short_history} stocks")
        print(f"Stocks with complete history from {cutoff_date}: {len(long_history_tickers)}")
        print(f"\nShape of combined data: {combined_data.shape}")
        print("\nDate range in data:")
        print(f"Start: {combined_data.index[0]}")
        print(f"End: {combined_data.index[-1]}")
        print(f"\nStocks in dataset: {len(combined_data.columns)}")
        print("\nFirst few rows of the data:")
        print(combined_data.head())

        # Print list of included stocks
        print("\nIncluded stocks with complete history:")
        print(', '.join(sorted(long_history_tickers)))

    else:
        print("\nNo data was successfully downloaded!")
        print(f"Attempted downloads: {len(ticker_symbols)}")
        print(f"Failed downloads: {failed_downloads}")
        print(f"Skipped (insufficient history): {skipped_short_history}")

if __name__ == "__main__":
    main()

Starting download of stock data...

Downloading data for AAPL...
AAPL: Got 1336 prices from 2000-01-01 00:00:00-05:00 to 2025-08-02 00:00:00-04:00
Successfully processed AAPL (full history)

Downloading data for MSFT...
MSFT: Got 1336 prices from 2000-01-01 00:00:00-05:00 to 2025-08-02 00:00:00-04:00
Successfully processed MSFT (full history)

Downloading data for GOOG...
GOOG: Got 1095 prices from 2004-08-16 00:00:00-04:00 to 2025-08-04 00:00:00-04:00
Successfully processed GOOG (full history)

Downloading data for AMZN...
AMZN: Got 1336 prices from 2000-01-01 00:00:00-05:00 to 2025-08-02 00:00:00-04:00
Successfully processed AMZN (full history)

Downloading data for NVDA...
NVDA: Got 1336 prices from 2000-01-01 00:00:00-05:00 to 2025-08-02 00:00:00-04:00
Successfully processed NVDA (full history)

Downloading data for META...
META: Got 691 prices from 2012-05-14 00:00:00-04:00 to 2025-08-04 00:00:00-04:00
Successfully processed META (full history)

Downloading data for TSLA...
TSLA: 

ERROR:yfinance:$FISV: possibly delisted; no timezone found


Successfully processed PANW (full history)

Downloading data for FISV...
No data available for FISV

Downloading data for CSX...
CSX: Got 1336 prices from 2000-01-01 00:00:00-05:00 to 2025-08-02 00:00:00-04:00
Successfully processed CSX (full history)

Downloading data for REGN...
REGN: Got 1336 prices from 2000-01-01 00:00:00-05:00 to 2025-08-02 00:00:00-04:00
Successfully processed REGN (full history)

Downloading data for MNST...
MNST: Got 1336 prices from 2000-01-01 00:00:00-05:00 to 2025-08-02 00:00:00-04:00
Successfully processed MNST (full history)

Downloading data for KLAC...
KLAC: Got 1336 prices from 2000-01-01 00:00:00-05:00 to 2025-08-02 00:00:00-04:00
Successfully processed KLAC (full history)

Downloading data for MAR...
MAR: Got 1336 prices from 2000-01-01 00:00:00-05:00 to 2025-08-02 00:00:00-04:00
Successfully processed MAR (full history)

Downloading data for NXPI...
NXPI: Got 784 prices from 2010-08-02 00:00:00-04:00 to 2025-08-04 00:00:00-04:00
Successfully process

In [27]:
def load_stock_data(filepath='combined_stock_data.csv'):
    """
    Read the combined price table and return it with a UTC datetime index.

    Parameters:
    filepath (str): CSV whose first column holds the dates.

    Returns:
    pd.DataFrame: The table from the file, indexed by UTC timestamps.
    """
    frame = pd.read_csv(filepath, index_col=0, parse_dates=True)
    # CRITICAL: normalise every timestamp to UTC, whatever offset was stored
    utc_index = pd.to_datetime(frame.index, utc=True)
    return frame.set_axis(utc_index, axis=0)

# Load the saved price table from the default path; the following cells read
# this module-level frame.
combined_data = load_stock_data()

In [28]:
# Row-to-row simple returns of every ticker.
price_changes = combined_data.pct_change()
# Rolling standard deviation of those returns over vol_p rows.
volatility = price_changes.rolling(window= vol_p ).std()
# Keep the last value available in each calendar month.
volatility_monthly = volatility.resample('M').last()
# Rank across tickers per month: rank 1 is the lowest volatility; ties are
# broken by column order (method='first').
ranked_volatility = volatility_monthly.rank(axis=1, method='first')

# Bare expression so the notebook displays the table.
ranked_volatility

Unnamed: 0_level_0,AAPL,MSFT,GOOG,AMZN,NVDA,META,TSLA,PEP,AVGO,COST,...,SWKS,EA,BIDU,ALGN,MELI,JD,NTES,ASML,DXCM,CPRT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-31 00:00:00+00:00,,,,,,,,,,,...,,,,,,,,,,
2000-02-29 00:00:00+00:00,,,,,,,,,,,...,,,,,,,,,,
2000-03-31 00:00:00+00:00,,,,,,,,,,,...,,,,,,,,,,
2000-04-30 00:00:00+00:00,,,,,,,,,,,...,,,,,,,,,,
2000-05-31 00:00:00+00:00,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-04-30 00:00:00+00:00,52.0,39.0,37.0,46.0,75.0,63.0,74.0,14.0,77.0,25.0,...,76.0,56.0,62.0,42.0,51.0,61.0,26.0,40.0,71.0,13.0
2025-05-31 00:00:00+00:00,58.0,30.0,48.0,51.0,75.0,61.0,74.0,11.0,77.0,24.0,...,78.0,21.0,55.0,47.0,52.0,23.0,38.0,40.0,67.0,46.0
2025-06-30 00:00:00+00:00,57.0,34.0,45.0,51.0,76.0,66.0,77.0,8.0,78.0,10.0,...,67.0,13.0,50.0,46.0,52.0,30.0,44.0,41.0,72.0,47.0
2025-07-31 00:00:00+00:00,35.0,31.0,42.0,41.0,71.0,66.0,80.0,12.0,79.0,8.0,...,47.0,16.0,44.0,81.0,55.0,48.0,56.0,40.0,69.0,54.0


In [29]:
# Row-to-row simple returns of every ticker.
price_changes = combined_data.pct_change()

# Trend filter 1: price is below its own level mom_p rows earlier.
mask1 = combined_data < combined_data.shift(mom_p)

# Trend filter 2: price is below its MA_p-row moving average.
moving_average = combined_data.rolling(window=MA_p).mean()
mask2 = combined_data < moving_average

# Blank out the return wherever either filter fires.
price_changes = price_changes.where(~(mask1 | mask2))

# Compounded return over the last mom_p rows. The rolling window uses the
# default min_periods (the full window), so a single blanked return inside
# the window leaves the momentum undefined (NaN) for that row.
# raw=True hands each window to the lambda as a NumPy array instead of
# building a Series per window; the values are the same, only faster.
momentum = price_changes.rolling(window=mom_p).apply(
    lambda window: (window + 1).prod() - 1,
    raw=True,
)

# Keep the last value available in each calendar month.
momentum_monthly = momentum.resample('M').last()

# Rank across tickers per month: rank 1 is the highest momentum.
ranked_momentum = momentum_monthly.rank(axis=1, method='first', ascending=False)

ranked_momentum.to_csv('ranked_momentum.csv')

# Bare expression so the notebook displays the table.
ranked_momentum

Unnamed: 0_level_0,AAPL,MSFT,GOOG,AMZN,NVDA,META,TSLA,PEP,AVGO,COST,...,SWKS,EA,BIDU,ALGN,MELI,JD,NTES,ASML,DXCM,CPRT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-31 00:00:00+00:00,,,,,,,,,,,...,,,,,,,,,,
2000-02-29 00:00:00+00:00,,,,,,,,,,,...,,,,,,,,,,
2000-03-31 00:00:00+00:00,,,,,,,,,,,...,,,,,,,,,,
2000-04-30 00:00:00+00:00,,,,,,,,,,,...,,,,,,,,,,
2000-05-31 00:00:00+00:00,,,,,1.0,,,26.0,,,...,8.0,,,,,,,32.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-04-30 00:00:00+00:00,,,,,,,,,,,...,,,,,,,,,,
2025-05-31 00:00:00+00:00,,,,,,,,,,,...,,3.0,,,,,,,,
2025-06-30 00:00:00+00:00,,1.0,,,,,,,,,...,,,,,9.0,,3.0,,,
2025-07-31 00:00:00+00:00,,11.0,15.0,23.0,2.0,7.0,17.0,,5.0,,...,,,,19.0,28.0,,22.0,30.0,,


In [30]:


import pandas as pd

# Calculate percentage change for each ticker
pct_change_df = combined_data.pct_change()

# NOTE: despite its name, this score contains no correlation. A former
# per-row loop correlated a ticker-indexed row with a date-indexed series
# (no labels in common, so the result was always NaN) and the column built
# from it was dropped immediately after being added. It has been removed
# because it never influenced ranked_correlation.

# Average the returns inside each calendar month
resampled_df = pct_change_df.resample('M').mean()

# Rolling mean over the last cor_p monthly values
rolling_mean_df = resampled_df.rolling(window=cor_p).mean()

# Move every value one row earlier.
# NOTE(review): after this shift the row labelled month t holds the window
# that ends in month t+1, i.e. it includes returns from the following
# month. This looks like look-ahead - confirm it is intended.
rolling_mean_df = rolling_mean_df.shift(-1)

# Drop the last row since it will be NaN after the shift
rolling_mean_df = rolling_mean_df.iloc[:-1]

# Alias kept for compatibility; it refers to the same object
rankings_df = rolling_mean_df

# Rank the tickers per month: rank 1 is the smallest rolling mean
ranked_correlation = rolling_mean_df.rank(axis=1, method='first')

# Bare expression so the notebook displays the table.
ranked_correlation


Unnamed: 0_level_0,AAPL,MSFT,GOOG,AMZN,NVDA,META,TSLA,PEP,AVGO,COST,...,SWKS,EA,BIDU,ALGN,MELI,JD,NTES,ASML,DXCM,CPRT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-31 00:00:00+00:00,,,,,,,,,,,...,,,,,,,,,,
2000-02-29 00:00:00+00:00,,,,,,,,,,,...,,,,,,,,,,
2000-03-31 00:00:00+00:00,,,,,,,,,,,...,,,,,,,,,,
2000-04-30 00:00:00+00:00,,,,,,,,,,,...,,,,,,,,,,
2000-05-31 00:00:00+00:00,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-03-31 00:00:00+00:00,45.0,56.0,42.0,46.0,81.0,76.0,58.0,10.0,80.0,67.0,...,3.0,33.0,14.0,2.0,77.0,13.0,48.0,34.0,23.0,63.0
2025-04-30 00:00:00+00:00,32.0,53.0,35.0,48.0,81.0,78.0,66.0,10.0,80.0,68.0,...,5.0,20.0,6.0,2.0,77.0,8.0,47.0,44.0,33.0,51.0
2025-05-31 00:00:00+00:00,37.0,54.0,38.0,51.0,81.0,78.0,67.0,9.0,80.0,64.0,...,5.0,25.0,7.0,2.0,76.0,11.0,60.0,48.0,23.0,50.0
2025-06-30 00:00:00+00:00,34.0,62.0,48.0,45.0,81.0,79.0,47.0,12.0,80.0,57.0,...,5.0,33.0,9.0,2.0,75.0,10.0,51.0,35.0,17.0,46.0


In [31]:
import pandas as pd


# Ensure the index is a datetime object
combined_data.index = pd.to_datetime(combined_data.index)

# All steps below act on the whole price table at once; pandas applies them
# column by column, so every ticker gets the same values as a per-ticker
# loop without inserting columns one at a time.

# Row-to-row simple returns of every ticker
stock_changes = combined_data.pct_change()

# Split the returns into gains (negatives become 0) and losses (positives
# become 0, sign flipped so losses are positive)
gain = stock_changes.clip(lower=0)
loss = -stock_changes.clip(upper=0)

# Exponentially weighted averages of gains and losses (com=RSI_p)
avg_gain = gain.ewm(com=RSI_p, adjust=False).mean()
avg_loss = loss.ewm(com=RSI_p, adjust=False).mean()

# Relative strength and the RSI derived from it
rs = avg_gain / avg_loss
rsi_values = 100 - (100 / (1 + rs))

# Resample the RSI values to monthly data (last value of each month)
rsi_monthly = rsi_values.resample('M').last()

# Rank the RSI per month: rank 1 is the highest RSI
ranked_rsi = rsi_monthly.rank(axis=1, method='first', ascending=False)

# Bare expression so the notebook displays the table.
ranked_rsi

Unnamed: 0_level_0,AAPL,MSFT,GOOG,AMZN,NVDA,META,TSLA,PEP,AVGO,COST,...,SWKS,EA,BIDU,ALGN,MELI,JD,NTES,ASML,DXCM,CPRT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-31 00:00:00+00:00,35.0,38.0,,48.0,27.0,,,56.0,,20.0,...,3.0,34.0,,,,,,14.0,,42.0
2000-02-29 00:00:00+00:00,26.0,39.0,,55.0,24.0,,,57.0,,23.0,...,2.0,32.0,,,,,,14.0,,43.0
2000-03-31 00:00:00+00:00,25.0,38.0,,55.0,27.0,,,58.0,,21.0,...,12.0,36.0,,,,,,13.0,,44.0
2000-04-30 00:00:00+00:00,36.0,44.0,,55.0,23.0,,,56.0,,22.0,...,9.0,37.0,,,,,,10.0,,40.0
2000-05-31 00:00:00+00:00,39.0,47.0,,53.0,19.0,,,55.0,,31.0,...,11.0,34.0,,,,,,6.0,,43.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-04-30 00:00:00+00:00,63.0,27.0,55.0,56.0,45.0,35.0,40.0,79.0,25.0,15.0,...,70.0,26.0,49.0,75.0,11.0,48.0,10.0,60.0,39.0,13.0
2025-05-31 00:00:00+00:00,69.0,18.0,54.0,34.0,24.0,27.0,23.0,80.0,15.0,25.0,...,65.0,42.0,73.0,75.0,9.0,68.0,6.0,51.0,39.0,78.0
2025-06-30 00:00:00+00:00,66.0,8.0,49.0,28.0,14.0,20.0,51.0,77.0,5.0,48.0,...,53.0,40.0,72.0,68.0,22.0,74.0,3.0,43.0,63.0,79.0
2025-07-31 00:00:00+00:00,64.0,3.0,29.0,34.0,8.0,11.0,44.0,62.0,5.0,51.0,...,66.0,30.0,57.0,79.0,31.0,67.0,15.0,68.0,54.0,78.0


In [32]:
# Define a separate set of weights for ranking
# Each of the four rank tables is multiplied by its weight below.
ranking_weights = pd.Series({
    'Momentum Score': 0.25,
    'RSI Score': 0.25,
    'Volatility Score': 0.25,
    'Correlation Score': 0.25
})


# Calculate weighted scores
weighted_momentum = ranked_momentum * ranking_weights['Momentum Score']
weighted_rsi = ranked_rsi * ranking_weights['RSI Score']
weighted_volatility = ranked_volatility * ranking_weights['Volatility Score']
weighted_correlation = ranked_correlation * ranking_weights['Correlation Score']

# Calculate the cumulative score with weights
# The tables are added label-wise, so a ticker/date that is NaN in any one
# of them is NaN in the total. Lower totals are better: the selection step
# picks the smallest scores per date.
cumulative_score = weighted_momentum + weighted_rsi + weighted_volatility + weighted_correlation

cumulative_score.to_csv('cumscore.csv', index=True)

# Bare expression so the notebook displays the table.
cumulative_score

Unnamed: 0_level_0,AAPL,MSFT,GOOG,AMZN,NVDA,META,TSLA,PEP,AVGO,COST,...,SWKS,EA,BIDU,ALGN,MELI,JD,NTES,ASML,DXCM,CPRT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-31 00:00:00+00:00,,,,,,,,,,,...,,,,,,,,,,
2000-02-29 00:00:00+00:00,,,,,,,,,,,...,,,,,,,,,,
2000-03-31 00:00:00+00:00,,,,,,,,,,,...,,,,,,,,,,
2000-04-30 00:00:00+00:00,,,,,,,,,,,...,,,,,,,,,,
2000-05-31 00:00:00+00:00,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-04-30 00:00:00+00:00,,,,,,,,,,,...,,,,,,,,,,
2025-05-31 00:00:00+00:00,,,,,,,,,,,...,,22.75,,,,,,,,
2025-06-30 00:00:00+00:00,,26.25,,,,,,,,,...,,,,,39.5,,25.25,,,
2025-07-31 00:00:00+00:00,,27.00,35.75,38.5,40.5,40.5,48.0,,42.25,,...,,,,45.75,47.5,,35.25,44.5,,


In [33]:
import pandas as pd

def process_portfolio_selections(cumulative_score, ass_amount):
    """
    Pick the lowest-scoring tickers per date and list them as (Date, Ticker) rows.

    Parameters:
    cumulative_score (pd.DataFrame): Scores with one row per date and one
        column per ticker; lower is better, NaN means "not eligible".
    ass_amount (int): Number of assets to select per date.

    Returns:
    pd.DataFrame: Columns 'Date' and 'Ticker', one row per selected asset.
        Each date is the score date moved forward by one calendar month.
    """
    # Per date keep only the ass_amount smallest scores; every other cell
    # (and every cell of a date without valid scores) ends up NaN.
    top_scores = cumulative_score.apply(lambda row: row.nsmallest(ass_amount), axis=1)
    picked = top_scores.notna()

    # Move the dates one month ahead. DateOffset(months=1) keeps the
    # day-of-month where it exists, so e.g. 04-30 becomes 05-30, not 05-31.
    effective_dates = pd.to_datetime(top_scores.index) + pd.DateOffset(months=1)
    ticker_names = top_scores.columns

    # Positions of the selected cells, walked date by date and, within a
    # date, in column order.
    row_positions, col_positions = picked.to_numpy().nonzero()

    selections = pd.DataFrame({
        'Date': [effective_dates[pos] for pos in row_positions],
        'Ticker': [ticker_names[pos] for pos in col_positions],
    })

    # Ensure Date column is datetime (also when nothing was selected)
    selections['Date'] = pd.to_datetime(selections['Date'])

    return selections

# Use the function
# Select the ass_amount lowest-score tickers for every date.
portfolio_selections = process_portfolio_selections(cumulative_score, ass_amount)

# Save to CSV
# Written without the index; the analysis cell further down reads this file.
portfolio_selections.to_csv('portfolio_selections.csv', index=False)

# Print some information about the selections
print("\nPortfolio Selections Summary:")
print(f"Total number of selections: {len(portfolio_selections)}")
print(f"Date range: {portfolio_selections['Date'].min()} to {portfolio_selections['Date'].max()}")
print(f"Number of unique tickers: {portfolio_selections['Ticker'].nunique()}")
print("\nFirst few selections:")
# First 50 rows, then the last 6 rows
print(portfolio_selections.head(50))
print(portfolio_selections.tail(6))


Portfolio Selections Summary:
Total number of selections: 1560
Date range: 2003-04-30 00:00:00+00:00 to 2025-08-31 00:00:00+00:00
Number of unique tickers: 79

First few selections:
                        Date Ticker
0  2003-04-30 00:00:00+00:00   AMGN
1  2003-04-30 00:00:00+00:00  CMCSA
2  2003-04-30 00:00:00+00:00   EBAY
3  2003-04-30 00:00:00+00:00   SBUX
4  2003-04-30 00:00:00+00:00    XEL
5  2003-05-30 00:00:00+00:00   AMGN
6  2003-05-30 00:00:00+00:00   AMZN
7  2003-05-30 00:00:00+00:00  CMCSA
8  2003-05-30 00:00:00+00:00   COST
9  2003-05-30 00:00:00+00:00   EBAY
10 2003-05-30 00:00:00+00:00   PCAR
11 2003-06-30 00:00:00+00:00   AMGN
12 2003-06-30 00:00:00+00:00   BKNG
13 2003-06-30 00:00:00+00:00     EA
14 2003-06-30 00:00:00+00:00   PCAR
15 2003-06-30 00:00:00+00:00   VRSN
16 2003-06-30 00:00:00+00:00    XEL
17 2003-07-30 00:00:00+00:00   BKNG
18 2003-07-30 00:00:00+00:00   CSCO
19 2003-07-30 00:00:00+00:00     EA
20 2003-07-30 00:00:00+00:00   EBAY
21 2003-07-30 00:00:00+00

In [None]:
import pandas as pd
import yfinance as yf
from datetime import datetime, timedelta
import numpy as np
from typing import Dict, List, Any

def calculate_stock_return(ticker: str, start_date: pd.Timestamp, end_date: pd.Timestamp) -> Dict[str, Any]:
    """Return first/last close and the percentage change of one ticker over a period.

    Downloads prices from ``start_date`` up to one day past ``end_date``.
    Returns a dict with 'Start_Price', 'End_Price' and 'Return_Pct', or None
    (after printing a message) when fewer than two rows came back or any
    step raised.
    """
    try:
        history = yf.download(
            ticker,
            start=start_date,
            end=end_date + timedelta(days=1),
            progress=False,
            ignore_tz=True
        )

        # At least two rows are needed to measure a change.
        has_enough_rows = not history.empty and len(history) >= 2
        if not has_enough_rows:
            print(f"Warning: Insufficient data for {ticker} between {start_date} and {end_date}")
            return None

        closes = history['Close']
        # .item() unwraps the first/last close to a plain scalar before float()
        first_price = float(closes.iloc[0].item())
        last_price = float(closes.iloc[-1].item())
        pct_change = ((last_price - first_price) / first_price) * 100

        outcome = {
            'Start_Price': first_price,
            'End_Price': last_price,
            'Return_Pct': pct_change
        }
        return outcome
    except Exception as e:
        print(f"Error processing {ticker} for period {start_date} to {end_date}: {str(e)}")
        return None

def analyze_portfolio(df: pd.DataFrame) -> pd.DataFrame:
    """Compute the calendar-month return of every selected ticker.

    ``df`` needs 'Date' and 'Ticker' columns. For each date the tickers are
    evaluated from the first to the last day of that date's month. Returns
    one row per ticker that produced a result, with 'Date', 'Ticker' and the
    fields returned by ``calculate_stock_return``.
    """
    results = []

    # One row per date, holding the list of tickers selected for it
    monthly_portfolios = df.groupby('Date')['Ticker'].apply(list).reset_index()

    total_tickers = sum(map(len, monthly_portfolios['Ticker']))
    processed = 0

    portfolio_pairs = zip(monthly_portfolios['Date'], monthly_portfolios['Ticker'])
    for raw_date, tickers in portfolio_pairs:
        date = pd.to_datetime(raw_date)

        # Evaluation window: first to last day of the date's month
        start_date = date.replace(day=1)
        end_date = start_date + pd.offsets.MonthEnd(0)

        for ticker in tickers:
            return_data = calculate_stock_return(ticker, start_date, end_date)
            processed += 1

            # Tickers without a usable result are left out of the output
            if return_data is not None:
                results.append({'Date': date, 'Ticker': ticker, **return_data})

            # Progress message every 100 processed tickers
            if processed % 100 == 0:
                print(f"Processed {processed}/{total_tickers} stocks")

    return pd.DataFrame(results)

def format_summary(summary_df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of the summary whose index is formatted as 'YYYY-MM-DD'.

    The input frame is left untouched. The index may hold datetimes or date
    strings; it is converted with ``pd.to_datetime`` before formatting.
    """
    formatted = summary_df.copy()
    formatted.index = pd.to_datetime(formatted.index).strftime('%Y-%m-%d')
    return formatted

def print_analysis(returns_df: pd.DataFrame, summary_df: pd.DataFrame):
    """Print headline figures for the per-stock returns and the monthly summary.

    ``returns_df`` needs 'Date' and 'Return_Pct' (percent) columns. The
    "Overall Statistics" are taken over all per-stock rows; the annualized
    figures use the mean return per date.
    """
    period_count = len(summary_df)
    stock_count = len(returns_df)
    print("\nPortfolio Analysis Summary:")
    print(f"Total periods analyzed: {period_count}")
    print(f"Total stocks analyzed: {stock_count}")

    print("\nFirst few rows of monthly summary:")
    preview = format_summary(summary_df.head())
    print(preview)

    print("\nOverall Statistics:")
    pct = returns_df['Return_Pct']
    print(f"Average monthly return: {pct.mean():.2f}%")
    print(f"Best monthly return: {pct.max():.2f}%")
    print(f"Worst monthly return: {pct.min():.2f}%")
    print(f"Return standard deviation: {pct.std():.2f}%")

    # Annualized statistics from the equal-weighted mean return per date
    monthly_returns = returns_df.groupby('Date')['Return_Pct'].mean()
    compounded_growth = (1 + monthly_returns/100).prod()
    annualized_return = (compounded_growth ** (12/len(monthly_returns)) - 1) * 100
    annualized_vol = monthly_returns.std() * np.sqrt(12)
    sharpe = annualized_return/annualized_vol

    print(f"\nAnnualized Statistics:")
    print(f"Annualized Return: {annualized_return:.2f}%")
    print(f"Annualized Volatility: {annualized_vol:.2f}%")
    print(f"Sharpe Ratio (Rf=0): {(sharpe):.2f}")

# Example usage:
if __name__ == "__main__":
    # Read your CSV data
    # File written by the selection cell (columns 'Date' and 'Ticker').
    portfolio_df = pd.read_csv("portfolio_selections.csv")

    # Process returns
    print("Starting portfolio analysis...")
    returns_df = analyze_portfolio(portfolio_df)

    # Generate summary
    # Per date: mean, standard deviation, minimum, maximum and number of the
    # per-stock returns. Each (label, function) pair names its output column.
    summary_df = returns_df.groupby('Date').agg({
        'Return_Pct': [
            ('Mean Return %', 'mean'),
            ('Std Dev %', 'std'),
            ('Min Return %', 'min'),
            ('Max Return %', 'max'),
            ('Count', 'count')
        ]
    }).round(2)

    # Flatten column names
    # Keep only the second level (the labels given above).
    summary_df.columns = summary_df.columns.get_level_values(1)

    # Save results
    # 'stock_returns_detailed.csv' is read again by the charting cell.
    returns_df.to_csv('stock_returns_detailed.csv', index=False)
    summary_df.to_csv('monthly_summary.csv')

    # Print analysis
    print_analysis(returns_df, summary_df)

Starting portfolio analysis...
Processed 100/1560 stocks
Processed 200/1560 stocks
Processed 300/1560 stocks
Processed 400/1560 stocks
Processed 500/1560 stocks
Processed 600/1560 stocks
Processed 700/1560 stocks
Processed 800/1560 stocks
Processed 900/1560 stocks
Processed 1000/1560 stocks
Processed 1100/1560 stocks
Processed 1200/1560 stocks
Processed 1300/1560 stocks
Processed 1400/1560 stocks
Processed 1500/1560 stocks


In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

def calculate_monthly_matrix(returns_series):
    """Arrange date-indexed returns as a table with years as rows and months as columns."""
    frame = pd.DataFrame({'returns': returns_series})
    stamps = frame.index
    frame = frame.assign(year=stamps.year, month=stamps.month)
    # pivot_table averages the values if several share one (year, month) cell
    return frame.pivot_table(values='returns', index='year', columns='month')

def create_performance_charts(returns_df, benchmark_tickers=('SPY', 'QQQ')):
    """Create performance visualization suite.

    Parameters:
    returns_df (pd.DataFrame): Per-stock results with a datetime 'Date'
        column and 'Return_Pct' expressed in percent. The frame is not
        modified, so the function can be called repeatedly on the same data.
    benchmark_tickers (iterable of str): Symbols downloaded for comparison.

    Returns:
    dict: 'performance_plot' (plotly figure), 'heatmap' (matplotlib figure),
        'yearly_comparison' and 'yearly_stats' (DataFrames), and
        'monthly_returns' (mean decimal return per date).
    """
    # Convert returns to decimal without touching the caller's frame
    decimal_returns = returns_df['Return_Pct'] / 100

    # Calculate portfolio performance: equal-weighted mean return per date
    monthly_returns = decimal_returns.groupby(returns_df['Date']).mean()
    portfolio_cum_returns = (1 + monthly_returns).cumprod()

    # Download benchmark data
    benchmark_returns = {}
    for ticker in benchmark_tickers:
        print(f"\nDownloading {ticker} data...")
        benchmark_data = yf.download(ticker,
                                   start=returns_df['Date'].min(),
                                   end=returns_df['Date'].max(),
                                   interval='1mo')
        close = benchmark_data['Close']
        # yfinance may return 'Close' as a one-column frame (ticker-level
        # columns) even for a single symbol; reduce it to a Series so the
        # plotting and yearly aggregation below receive one-dimensional data.
        if isinstance(close, pd.DataFrame) and close.shape[1] == 1:
            close = close.iloc[:, 0]
        benchmark_returns[ticker] = close.pct_change()

    # Create figures
    fig1 = make_subplots(rows=2, cols=1, shared_xaxes=True,
                        subplot_titles=('Cumulative Returns (Log Scale)', 'Drawdowns'))

    # Cumulative Returns Plot
    fig1.add_trace(
        go.Scatter(x=portfolio_cum_returns.index, y=portfolio_cum_returns,
                  name='Portfolio', line=dict(color='blue')),
        row=1, col=1
    )

    for ticker, returns in benchmark_returns.items():
        cum_returns = (1 + returns).cumprod()
        fig1.add_trace(
            go.Scatter(x=cum_returns.index, y=cum_returns,
                      name=ticker, line=dict(dash='dash')),
            row=1, col=1
        )

    # Drawdowns Plot: distance of the cumulative curve from its running peak
    drawdowns = (portfolio_cum_returns / portfolio_cum_returns.cummax() - 1)
    fig1.add_trace(
        go.Scatter(x=drawdowns.index, y=drawdowns,
                  name='Portfolio Drawdowns', line=dict(color='red')),
        row=2, col=1
    )

    fig1.update_layout(height=800, title='Portfolio Performance Analysis')
    fig1.update_yaxes(type="log", row=1, col=1)

    # Create monthly returns matrix for heatmap
    monthly_matrix = calculate_monthly_matrix(monthly_returns)

    # Create yearly comparison table (compounded return per calendar year)
    yearly_returns = monthly_returns.groupby(monthly_returns.index.year).apply(
        lambda x: (1 + x).prod() - 1
    )

    yearly_comparison = pd.DataFrame({
        'Portfolio': yearly_returns
    })

    for ticker, returns in benchmark_returns.items():
        yearly_comparison[ticker] = returns.groupby(returns.index.year).apply(
            lambda x: (1 + x).prod() - 1
        )

    # Create heatmap figure
    fig2 = plt.figure(figsize=(15, 8))
    sns.heatmap(monthly_matrix,
                cmap='RdYlGn',
                center=0,
                annot=True,
                fmt='.2%')
    plt.title('Monthly Returns Heatmap')

    # Calculate yearly statistics
    yearly_stats = pd.DataFrame(index=yearly_returns.index)

    for year in yearly_returns.index:
        year_returns = monthly_returns[monthly_returns.index.year == year]

        # Basic statistics (0.02 is subtracted as the yearly risk-free return)
        yearly_stats.loc[year, 'Return'] = yearly_returns[year]
        yearly_stats.loc[year, 'Volatility'] = year_returns.std() * np.sqrt(12)
        yearly_stats.loc[year, 'Sharpe'] = (yearly_returns[year] - 0.02) / (year_returns.std() * np.sqrt(12))

        # Sortino Ratio: same numerator, deviation of the negative months only
        downside_returns = year_returns[year_returns < 0]
        if len(downside_returns) > 0:
            yearly_stats.loc[year, 'Sortino'] = (yearly_returns[year] - 0.02) / (downside_returns.std() * np.sqrt(12))
        else:
            yearly_stats.loc[year, 'Sortino'] = np.nan

        # Maximum Drawdown within the year
        cum_returns = (1 + year_returns).cumprod()
        yearly_stats.loc[year, 'Max Drawdown'] = (cum_returns / cum_returns.cummax() - 1).min()

    return {
        'performance_plot': fig1,
        'heatmap': fig2,
        'yearly_comparison': yearly_comparison,
        'yearly_stats': yearly_stats,
        'monthly_returns': monthly_returns
    }

if __name__ == "__main__":
    # Read your data
    returns_df = pd.read_csv('stock_returns_detailed.csv')
    returns_df['Date'] = pd.to_datetime(returns_df['Date'])

    print("Creating performance charts...")
    analysis_results = create_performance_charts(returns_df)

    # Display results
    print("\nYearly Performance Comparison (%):")
    print(analysis_results['yearly_comparison'].round(4) * 100)

    print("\nYearly Statistics:")
    print(analysis_results['yearly_stats'].round(4))

    # Save results
    analysis_results['yearly_comparison'].to_csv('yearly_performance_comparison.csv')
    analysis_results['yearly_stats'].to_csv('yearly_statistics.csv')

    # Show plots
    analysis_results['performance_plot'].show()
    plt.show()  # Show the heatmap

    # Additional analytics
    # Use the decimal monthly returns computed by the charting function;
    # returns_df itself still holds percentages.
    monthly_returns = analysis_results['monthly_returns']

    print("\nPortfolio Statistics:")
    print(f"Total Return: {(((1 + monthly_returns).prod() - 1) * 100):.2f}%")
    print(f"Annual Return: {(((1 + monthly_returns).prod() ** (12/len(monthly_returns)) - 1) * 100):.2f}%")
    print(f"Monthly Volatility: {(monthly_returns.std() * 100):.2f}%")
    print(f"Annual Volatility: {(monthly_returns.std() * np.sqrt(12) * 100):.2f}%")
    print(f"Sharpe Ratio: {((monthly_returns.mean() - 0.02/12) / (monthly_returns.std()) * np.sqrt(12)):.2f}")
    print(f"Max Drawdown: {(((1 + monthly_returns).cumprod() / (1 + monthly_returns).cumprod().cummax() - 1).min() * 100):.2f}%")