In [None]:
import itertools
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import numpy_financial as npf
import pandas as pd
import yfinance as yf
from IPython.display import display, Math, Latex

## Group Assignment
### Team Number: 17
### Team Member Names: John, Gen, Timothy
### Team Strategy Chosen: Risky 

Requirements for this assignment:
- Code needs to be dynamic
- Must read in a csv file containing a finite number of stock tickers (with an example csv file in this repo)
- Portfolio must contain only US listed stocks
- Must have an average daily volume of at least 10 000 shares as calculated based on the time interval July 2, 2021 to October 22, 2021
- Pick a minimum of 10 and a maximum of 20 stocks for the portfolio.
- If we choose n stocks, each stock must make up minimum (100/(2n))% of the portfolio when weighted by value as of closing prices on November 26, 2021
- No individual stock may make up more than 35% of the portfolio when weighted by value
- We have $100000 USD to spend on portfolio and all must be spent
- When code is run with the .csv file, it must create a DataFrame "FinalPortfolio"
    - With this portfolio, index starts at 1 and ends at the number of stocks that our code chooses.
    - Headings must be: Ticker, Price, Shares, Value, Weight
        - Ticker is the ticker selected
        - Price is the price on November 26, 2021
        - Shares is the number of shares purchased (can be fractional)
        - Value is the total value of those shares
        - Weight is the weight that the value of shares represents relative to the value of the portfolio (which is $100,000)
    - Needs to show that the total adds up to $100,000
    - Also need to show the weights add to 100%
    - This DataFrame must be printed to the screen as the second to last output to the screen.
- After the DataFrame, one final DataFrame called "Stocks", which has the same index as "FinalPortfolio", must be made
    - Only has the Tickers and Shares from "FinalPortfolio"
    - Must output this DataFrame to a csv file titled "Stocks_Group_XX.csv"

### Step 1 (Gen): Filtering out US valid tickers

In [None]:
# Load the candidate tickers; the path is relative to the current working directory.
# NOTE(review): read_csv treats the first row as a header by default, so the first
# ticker becomes the column label (a later cell appends it back to the list) --
# confirm that Tickers.csv really has no header row.
stocks_from_csv = pd.read_csv('Tickers.csv')
stocks_from_csv.head()

In [None]:
# Collect every value of the first CSV column into a plain Python list.
stocks_lst = [stocks_from_csv.iloc[row, 0] for row in range(len(stocks_from_csv))]
# The column label is appended as well; presumably the CSV has no header row,
# so the label is itself the first ticker symbol.
stocks_lst.append(stocks_from_csv.columns[0])
stocks_lst

In [None]:
# Sanity check: number of data rows in the first column (the row used as the header is not counted).
len(stocks_from_csv.iloc[:,0])

In [None]:
# Sanity check: one more than the row count, because the column label was appended to the list.
len(stocks_lst)

In [None]:
# Label of the first column; presumably a ticker symbol if the CSV has no header row.
stocks_from_csv.columns[0]

In [None]:
# Accumulator for the yf.Ticker objects that pass the filter; filled in place by filter_us().
us_stock_lst = []
def filter_us():
    """Append a ``yf.Ticker`` to ``us_stock_lst`` for every valid US-market symbol.

    Reads the module-level ``stocks_lst`` and appends to the module-level
    ``us_stock_lst``. A symbol is kept when its ``info`` has a non-``None``
    ``'regularMarketPrice'`` and its ``'market'`` equals ``'us_market'``.

    One ``Ticker`` object is created per symbol and its ``info`` is read once.
    Symbols whose ``info`` lacks either key are skipped instead of raising
    ``KeyError``.

    Returns:
        None. Results accumulate in ``us_stock_lst``.
    """
    for symbol in stocks_lst:
        ticker = yf.Ticker(symbol)
        # Read the info mapping once and reuse it for both checks.
        info = ticker.info or {}
        has_price = info.get('regularMarketPrice') is not None
        if has_price and info.get('market') == 'us_market':
            us_stock_lst.append(ticker)

# Populate us_stock_lst; this reads .info for every entry of stocks_lst, so it may be slow.
filter_us()

In [None]:
# Show the Ticker objects that passed the filter together with their count.
us_stock_lst, len(us_stock_lst)

### Step 4 (John): Out of the 20 most risky stocks, choose the portfolio of 10 with highest correlation

In [20]:
# Module-level cache of closing prices; update_price_df adds one column per ticker
global_price_df = pd.DataFrame()

# Date window passed to both the price download and the correlation calculation
# (find_avg_corr compares both bounds inclusively)
start_date = '2020-01-01'
end_date = '2021-11-01'

In [21]:
### Fetch closing prices for tickers that are not cached yet and add them to the global price dataframe
def update_price_df(ticker_lst, start_date, end_date):
    """Extend ``global_price_df`` with close prices for tickers it lacks.

    Args:
        ticker_lst: iterable of ticker symbols.
        start_date: passed to ``Ticker.history`` as ``start``.
        end_date: passed to ``Ticker.history`` as ``end``.

    Returns:
        None. ``global_price_df`` is rebound to a frame with the new
        columns concatenated on the right; it is left untouched when
        nothing new was fetched.
    """
    global global_price_df

    # One 'Close' series per ticker that is not already a column of the cache
    fresh_closes = {
        symbol: yf.Ticker(symbol).history(start=start_date, end=end_date)['Close']
        for symbol in ticker_lst
        if symbol not in global_price_df
    }
    fetched_df = pd.DataFrame(fresh_closes)

    # Nothing to add: keep the cache as it is
    if fetched_df.empty:
        return

    global_price_df = pd.concat([global_price_df, fetched_df], axis=1)

In [22]:
### Find all possible combinations of n tickers within the ticker_lst
def combinations(arr, length, pre_arr=None):
    """Return every combination of ``arr`` that extends ``pre_arr`` to ``length`` items.

    Args:
        arr: iterable of candidate elements (list, NumPy array, ...).
        length: total number of elements each returned list must have,
            counting those already in ``pre_arr``.
        pre_arr: optional elements every returned combination starts with.
            Defaults to no prefix.

    Returns:
        A list of lists, in the order the elements appear in ``arr``.
        The result is empty when ``pre_arr`` is already longer than
        ``length`` or when ``arr`` has too few elements.
    """
    # Copy the prefix so that callers never share (or mutate) one list object.
    prefix = [] if pre_arr is None else list(pre_arr)
    remaining = length - len(prefix)

    # itertools rejects a negative size; there is simply nothing to build here.
    if remaining < 0:
        return []

    return [prefix + list(tail) for tail in itertools.combinations(arr, remaining)]

In [23]:
### Average pairwise correlation of the given tickers' prices
def find_avg_corr(ticker_lst, start_date, end_date):
    """Return the mean of the pairwise correlations between the tickers.

    Args:
        ticker_lst: sequence of column labels present in ``global_price_df``.
        start_date: inclusive lower bound on the price index.
        end_date: inclusive upper bound on the price index.

    Returns:
        The average of the correlation-matrix entries for every unordered
        pair of tickers.

    Raises:
        ZeroDivisionError: when fewer than two tickers are given, because
            there is no pair to average.
    """
    # Restrict the cached prices to the requested columns and date window
    selected = global_price_df[ticker_lst]
    in_window = (selected.index >= start_date) & (selected.index <= end_date)
    corr_matrix = selected[in_window].corr()

    # Gather the correlation of each unordered pair (first ticker before second)
    pair_corrs = []
    for pos, first in enumerate(ticker_lst):
        for second in ticker_lst[pos + 1:]:
            pair_corrs.append(corr_matrix.loc[first, second])

    # Mean over all pairs
    return sum(pair_corrs) / len(pair_corrs)

In [24]:
# Find the combination of tickers with the maximum avg correlation
def find_max_correlation(ticker_lst, start_date, end_date, result_size=10):
    """Return the ``result_size``-ticker subset with the highest average correlation.

    Every combination of ``result_size`` tickers is scored with
    ``find_avg_corr`` and the first combination with the strictly highest
    score is kept. The number of combinations grows combinatorially with
    ``len(ticker_lst)``, so keep the candidate list short.

    Args:
        ticker_lst: iterable of ticker symbols to choose from.
        start_date: lower date bound forwarded to ``find_avg_corr``.
        end_date: upper date bound forwarded to ``find_avg_corr``.
        result_size: number of tickers in the returned portfolio.

    Returns:
        The winning combination as a list of tickers, or an empty NumPy
        array when no combination could be scored (no combinations exist,
        or every score is NaN).
    """
    # Start below any attainable score: an average correlation of exactly -1
    # is a valid result and must still beat the initial value.
    best_avg_corr = float('-inf')
    best_port = np.array([])

    # A plain list keeps the tickers as ordinary Python values in the result.
    for candidate in combinations(list(ticker_lst), result_size):
        avg_corr = find_avg_corr(candidate, start_date, end_date)
        # NaN scores compare False here and are therefore skipped.
        if avg_corr > best_avg_corr:
            best_avg_corr = avg_corr
            best_port = candidate

    return best_port

In [25]:
### Applying the functions
# A hard-coded sample ticker list (not the output of the Step 1 filter).
# NOTE(review): the last five symbols are commented out, presumably to keep the
# number of 10-ticker combinations manageable (15 choose 10 = 3003) -- confirm.
ticker_lst = ['MSFT', 'AAPL', 'GOOGL', 'AMZN', 'TSLA', 'FB', 'NVDA', 'NFLX', 'TSM', 'JPM',
             'BABA', 'V', 'JNJ', 'UNH', 'WMT'] #, 'HD', 'BAC', 'MA', 'ASML', 'PG']

# Fetch closing prices for any ticker not yet in global_price_df
update_price_df(ticker_lst, start_date, end_date)

# Score every combination of the default size (10) and keep the one with the highest avg correlation
max_corr_port = find_max_correlation(ticker_lst, start_date, end_date)

# Display
print('The portfolio with maximum avg correlation is:', max_corr_port)

The portfolio with maximum avg correlation is: ['MSFT', 'AAPL', 'GOOGL', 'TSLA', 'FB', 'NVDA', 'TSM', 'V', 'JNJ', 'UNH']


## Contribution Declaration

The following team members made a meaningful contribution to this assignment:

Insert Names Here. 