In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats as st
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [2]:
alldata_df = pd.read_csv('all_stocks_5yr.csv')
alldata_df.head()

Unnamed: 0,date,open,high,low,close,volume,Name
0,2013-02-08,15.07,15.12,14.63,14.75,8407500,AAL
1,2013-02-11,14.89,15.01,14.26,14.46,8882000,AAL
2,2013-02-12,14.45,14.51,14.1,14.27,8126000,AAL
3,2013-02-13,14.3,14.94,14.25,14.66,10259500,AAL
4,2013-02-14,14.94,14.96,13.16,13.99,31879900,AAL


In [4]:
sp500 = pd.read_csv('S&P500IndexData.csv')
sp500 = sp500.reset_index().drop(['index','Adj Close'], axis = 1)
sp500['dailyrange'] = ((sp500['high'] - sp500['low'] )/ sp500['low'])*100
sp500['dailychange'] = (abs(sp500['close'].shift() - sp500['close'] )/ sp500['close'])*100
for i in range(len(sp500)):
    sp500.loc[i,'year'] = sp500.loc[i,'date'].split('/')[-1]
sp500['sma10'] = sp500.loc[:,'close'].rolling(window = 10).mean()
sp500['ema10'] = sp500['close'].ewm(span = 10, adjust = False).mean()
sp500values = sp500[['open','high','low','close','volume','dailyrange','dailychange','sma10','ema10']].dropna().values
rb = StandardScaler()
sp500_for_PCA = rb.fit_transform(sp500values)
sp500_pca = PCA(svd_solver = 'full', n_components = 1).fit_transform(sp500_for_PCA)
sp500_pca = sp500_pca.reshape(len(sp500_pca))

# Algo 1 (Random trial)

In [5]:
def get_stock_pc(stock, ticker):
    
    stock = stock.reset_index().drop(['index','Name'], axis = 1)

    stock['dailyrange'] = ((stock['high'] - stock['low'] )/ stock['low'])*100

    stock['dailychange'] = (abs(stock['close'].shift() - stock['close'] )/ stock['close'])*100

    for i in range(len(stock)):
        stock.loc[i,'year'] = stock.loc[i,'date'].split('-')[0]

    stock['sma10'] = stock.loc[:,'close'].rolling(window = 10).mean()

    stock['ema10'] = stock['close'].ewm(span = 10, adjust = False).mean()

    stockvalues = stock[['open','high','low','close','volume','dailyrange','dailychange','sma10','ema10']].dropna().values

    rb = StandardScaler()

    stock_for_PCA = rb.fit_transform(stockvalues)
    
    stock_pca = PCA(svd_solver = 'full').fit(stock_for_PCA)

    stock_pc = PCA(svd_solver = 'full', n_components = 1).fit_transform(stock_for_PCA)
    
    print('% Variance Explained for '+ ticker + ' ={}'.format(round(stock_pca.explained_variance_ratio_[0],2)))

    return stock_pc.reshape(len(stock_pc))

In [6]:
correlation_df = pd.DataFrame(index = range(len(sp500_pca)), columns = ['sp500'])
correlation_df['sp500'] = sp500_pca

In [193]:
tickers_all = alldata_df['Name'].unique().tolist()

#Just for now
tickers = [tickers_all[i] for i in range(len(tickers_all)) if i%5==0]

for ticker in tickers:
    stock = alldata_df[alldata_df['Name'] == ticker]
    if len(stock) != len(sp500):
        continue
    stock_pc = get_stock_pc(stock, ticker)
    correlation_df[ticker] = stock_pc

% Variance Explained for AAL =0.63
% Variance Explained for ABT =0.63
% Variance Explained for ADP =0.63
% Variance Explained for AES =0.64
% Variance Explained for AIV =0.62
% Variance Explained for ALGN =0.63
% Variance Explained for AMAT =0.63
% Variance Explained for AMP =0.64
% Variance Explained for ANTM =0.63
% Variance Explained for APD =0.63
% Variance Explained for ATVI =0.62
% Variance Explained for AXP =0.65
% Variance Explained for BAX =0.66
% Variance Explained for BEN =0.67
% Variance Explained for BK =0.64
% Variance Explained for BSX =0.68
% Variance Explained for CAT =0.65
% Variance Explained for CB =0.62
% Variance Explained for CERN =0.62
% Variance Explained for CHRW =0.62
% Variance Explained for CL =0.63
% Variance Explained for CMI =0.68
% Variance Explained for COG =0.67
% Variance Explained for CSX =0.62
% Variance Explained for CVS =0.63
% Variance Explained for DE =0.64
% Variance Explained for DLR =0.63
% Variance Explained for DRI =0.62
% Variance Explain

In [194]:
correlations_without_ema = correlation_df.corr()

In [197]:
correlations_without_ema['FB'].sort_values(ascending = False)

FB       1.000000
MSFT     0.946156
BSX      0.942546
XYL      0.904266
AIV      0.894974
ECL      0.888075
MDT      0.869644
CL       0.828337
APD      0.824877
SEE      0.752518
PFG      0.742242
AAL      0.724431
PRU      0.685221
CERN     0.545442
KSS      0.439973
IP       0.437735
CMI      0.427141
UTX      0.396352
RHI      0.340472
CVS      0.331406
EMN      0.301370
ETN      0.288852
HAL      0.160849
AXP      0.109194
GE       0.077494
JEC      0.067024
PNR     -0.007629
WDC     -0.011009
MAC     -0.041850
PXD     -0.057685
           ...   
LEN     -0.814204
INCY    -0.814819
ALGN    -0.830797
PKG     -0.834736
MCD     -0.848434
AMAT    -0.853706
DRI     -0.865850
ANTM    -0.888668
PCLN    -0.898442
MLM     -0.903883
BK      -0.905088
DLR     -0.907534
EXPD    -0.908624
RCL     -0.909003
TMK     -0.909308
TDG     -0.911387
HRS     -0.913434
VMC     -0.917984
ADP     -0.920754
ITW     -0.925190
TXN     -0.932272
sp500   -0.935750
ATVI    -0.942236
TRV     -0.945962
SPGI    -0

In [198]:
correlations_with_ema['FB'].sort_values(ascending = False)

FB       1.000000
EA       0.960265
NDAQ     0.960115
INTU     0.955307
PEP      0.954298
BSX      0.952783
GD       0.950849
PKI      0.936620
AIZ      0.933279
EW       0.932704
HIG      0.926245
MO       0.915192
ETFC     0.915059
EXPE     0.912751
BLL      0.910227
CHD      0.908395
MNST     0.906703
AAPL     0.905564
MCO      0.904588
ECL      0.900198
AIV      0.895725
ESS      0.890976
AMGN     0.878290
EIX      0.878036
HOLX     0.876831
MDT      0.872787
GLW      0.859540
D        0.854458
AVB      0.849281
HSIC     0.847800
           ...   
ACN     -0.949259
MHK     -0.949488
AET     -0.949872
EQIX    -0.950080
ATVI    -0.951154
IFF     -0.951393
TRV     -0.952047
CMCSA   -0.952507
AON     -0.952965
ADBE    -0.953189
HD      -0.954403
HON     -0.954858
SPGI    -0.955044
MSFT    -0.955716
IT      -0.958542
NEE     -0.959189
HII     -0.959670
GOOGL   -0.960722
CB      -0.960871
AOS     -0.961436
FIS     -0.964081
MMC     -0.964815
BDX     -0.966667
RTN     -0.969243
CTAS    -0

# Algo 2 - with S&P Index PC1

In [6]:
def get_stock_fit(stock, ticker, fitter):
    
    stock = stock.reset_index().drop(['index','Name'], axis = 1)

    stock['dailyrange'] = ((stock['high'] - stock['low'] )/ stock['low'])*100

    stock['dailychange'] = (abs(stock['close'].shift() - stock['close'] )/ stock['close'])*100

    for i in range(len(stock)):
        stock.loc[i,'year'] = stock.loc[i,'date'].split('-')[0]

    stock['sma10'] = stock.loc[:,'close'].rolling(window = 10).mean()

    stock['ema10'] = stock['close'].ewm(span = 10, adjust = False).mean()

    stockvalues = stock[['open','high','low','close','volume','dailyrange','dailychange','sma10','ema10']].dropna().values

    scaler = StandardScaler()

    stock_for_PCA = scaler.fit_transform(stockvalues)
    
    stock_pc = fitter.transform(stock_for_PCA)[:,0]
    
    #print('% Variance Explained for '+ ticker + ' ={}'.format(round(stock_pc.explained_variance_ratio_[0],2)))
    
    return stock_pc.reshape(len(stock_pc))

In [253]:
correlation_df = pd.DataFrame(index = range(len(sp500_pca)), columns = ['sp500'])
correlation_df['sp500'] = sp500_pca

tickers_all = alldata_df['Name'].unique().tolist()

tickers = tickers_all
#tickers = [tickers_all[i] for i in range(len(tickers_all)) if i%5==0]

fitter = PCA(svd_solver = 'full').fit(sp500_for_PCA)

for ticker in tickers:
    stock = alldata_df[alldata_df['Name'] == ticker]
    stock_pc = get_stock_fit(stock, ticker, fitter)
    if len(stock_pc) != len(correlation_df):
        continue
    correlation_df[ticker] = stock_pc

In [254]:
correlation_df

Unnamed: 0,sp500,AAL,AAPL,AAP,ABBV,ABC,ABT,ACN,ADBE,ADI,...,XLNX,XL,XOM,XRAY,XRX,XYL,YUM,ZBH,ZION,ZTS
0,-4.814659,-5.513222,-3.469487,-3.976937,-4.001831,-5.263549,-3.572409,-2.935093,-3.293052,-2.715101,...,-2.945472,-4.253263,0.527195,-3.758277,-2.926104,-2.726311,-3.331244,-4.879143,-2.383841,-2.300151
1,-5.483528,-6.000667,-3.807464,-4.123189,-4.393673,-5.471747,-3.556788,-3.128378,-3.530697,-3.057030,...,-3.273996,-4.453018,-0.018462,-4.315951,-2.907332,-3.542817,-3.554229,-5.267561,-2.745605,-2.608931
2,-4.912416,-5.864864,-3.862623,-4.395258,-4.126074,-5.262235,-3.567022,-2.893834,-3.398891,-2.771381,...,-3.408005,-4.363694,0.258436,-3.847227,-2.793417,-2.933409,-3.460004,-5.168964,-2.641959,-2.393216
3,-5.123697,-5.571103,-3.770569,-4.176610,-4.226945,-5.241224,-3.652781,-3.005019,-3.910615,-2.976935,...,-3.402860,-4.298157,0.369058,-3.946418,-2.619440,-3.174726,-3.351531,-5.148991,-2.573517,-2.685778
4,-4.606469,-5.350950,-3.562523,-4.262380,-4.187332,-5.199103,-3.807985,-2.783784,-3.324730,-3.205566,...,-3.033239,-4.115408,0.754467,-3.735755,-2.359680,-2.784176,-3.222822,-4.943168,-2.495971,-2.831689
5,-4.777379,-5.688852,-3.933926,-4.194158,-4.510259,-5.418826,-3.709926,-2.971758,-3.458156,-2.818835,...,-3.325721,-4.209755,0.714690,-3.944105,-2.433938,-3.042150,-3.446170,-4.917969,-2.567042,-2.872338
6,-4.676191,-5.694501,-4.068046,-4.243656,-4.095439,-5.077131,-4.084655,-2.847637,-3.485500,-2.627211,...,-3.188085,-4.060034,0.581968,-3.808898,-2.411615,-2.832100,-3.186645,-4.878973,-2.394221,-2.180235
7,-4.778663,-5.411176,-4.239702,-4.128670,-4.313485,-4.951394,-3.466880,-3.059090,-3.455409,-2.640019,...,-3.366356,-3.925410,0.697630,-4.029604,-2.229750,-3.363756,-3.240398,-4.917364,-2.401245,-2.194604
8,-4.381005,-5.558945,-3.945474,-4.290771,-4.025021,-4.967362,-3.612836,-2.561119,-3.167993,-2.566820,...,-3.088432,-3.667845,0.767994,-3.619335,-2.568314,-3.202094,-3.121487,-4.882596,-2.452320,-2.247283
9,-4.324335,-5.363832,-3.966530,-4.218261,-4.048918,-5.155341,-3.459235,-2.566199,-3.085365,-2.666899,...,-3.246441,-3.667233,0.526117,-3.576882,-1.968873,-3.425757,-2.789366,-4.899095,-2.071951,-2.511688


# --------------------------------------------------------------------------------------------------------------

# The Working Algo (self thought)

In [3]:
def relative_strength_index(stock, n = 14):
    
    difference = stock['close'].diff()
    up, down = difference.copy(), difference.copy()
    up[up < 0] = 0
    down[down > 0] = 0
    rs = up.rolling(n).mean() / down.abs().rolling(n).mean()
    rsi = 100.0 - (100.0 / (1.0 + rs))
    
    return rsi

In [4]:
def average_directional_movement_index(stock, n=14, n_ADX=14):
    i = 0
    UpI = []
    DoI = []
    while i + 1 <= stock.index[-1]:
        UpMove = stock.loc[i + 1, 'high'] - stock.loc[i, 'high']
        DoMove = stock.loc[i, 'low'] - stock.loc[i + 1, 'low']
        if UpMove > DoMove and UpMove > 0:
            UpD = UpMove
        else:
            UpD = 0
        UpI.append(UpD)
        if DoMove > UpMove and DoMove > 0:
            DoD = DoMove
        else:
            DoD = 0
        DoI.append(DoD)
        i = i + 1
    i = 0
    TR_l = [0]
    while i < stock.index[-1]:
        TR = max(stock.loc[i + 1, 'high'], stock.loc[i, 'close']) - min(stock.loc[i + 1, 'low'], stock.loc[i, 'close'])
        TR_l.append(TR)
        i = i + 1
    TR_s = pd.Series(TR_l)
    ATR = pd.Series(TR_s.ewm(span=n, min_periods=n).mean())
    UpI = pd.Series(UpI)
    DoI = pd.Series(DoI)
    PosDI = pd.Series(UpI.ewm(span=n, min_periods=n).mean() / ATR)
    NegDI = pd.Series(DoI.ewm(span=n, min_periods=n).mean() / ATR)
    ADX = pd.Series((abs(PosDI - NegDI) / (PosDI + NegDI)).ewm(span=n_ADX).mean())
    ADX = 100*ADX
    
    return ADX

In [5]:
def bollinger_bands(stock, n=14):
    
    MA = stock['close'].rolling(n).mean()
    MSD = stock['close'].rolling(n).std()
    bandwidth = 4 * MSD / MA * 100

    return bandwidth

In [213]:
def get_stock_indicators(stock):
    
    stock = stock.reset_index().drop(['index','Name'], axis = 1)

    #stock['dailyrange'] = ((stock['high'] - stock['low'] )/ stock['low'])*100

    #stock['dailychange'] = (abs(stock['close'].shift() - stock['close'] )/ stock['close'])*100

    #stock['sma10'] = stock.loc[:,'close'].rolling(window = 14).mean()

    stock['ema10'] = stock['close'].ewm(span = 14, adjust = False).mean()
    
    #stock['rsi'] = relative_strength_index(stock,14)
    
    #stock['adx'] = average_directional_movement_index(stock,14,14)
    
    #stock['bollingerbw'] = bollinger_bands(stock,14)

    return stock

In [7]:
def principal_components(stock, ticker = 'stockname_in_parameter'):
    
    stockvalues = stock.iloc[:,1:].dropna().values

    scaler = StandardScaler()

    stock_for_PCA = scaler.fit_transform(stockvalues)
      
    stock_pca = PCA(svd_solver = 'full').fit(stock_for_PCA)

    stock_pc = PCA(svd_solver = 'full', n_components = 1).fit_transform(stock_for_PCA)
    
    print('% Variance Explained for '+ ticker + ' = {}'.format(round(stock_pca.explained_variance_ratio_[0],2)))
    
    return stock_pc.reshape(len(stock_pc))

In [8]:
def principal_components_with_fitter(stock, fitter):
   
    '''
    Fits a given stock to the PCs of another stock to see similarity.
    Returns the transformed stock wrt the PC1 of the fitter stock.
    
    '''
    
    stockvalues = stock.iloc[:,1:].dropna().values

    scaler = StandardScaler()

    stock_for_PCA = scaler.fit_transform(stockvalues)
    
    stock_pc = fitter.transform(stock_for_PCA)[:,0]
    
    return stock_pc.reshape(len(stock_pc))

In [212]:
'''
This block basically takes only the stocks with over 10% or 15% returns and eliminates the rest.
The 15% data is already stored in the high_returns_df dataframe, which was stored in excel previously.
See the blogposttrial.ipynb for more info.

'''
#high_returns_df = pd.read_excel('selected_stocks_15_percent_YoY.xlsx').dropna()

high_returns_df = pd.read_excel('selected_stocks_10_percent_YoY.xlsx').dropna()

tickers = high_returns_df['tickers'].tolist()

selected_stocks = alldata_df[alldata_df['Name'] == tickers[0]]
for ticker in tickers[1:]:
    selected_stocks = pd.concat( [selected_stocks, alldata_df[alldata_df['Name'] == ticker] ] , axis = 0)
#selected_stocks = pd.concat( [selected_stocks, alldata_df[alldata_df['Name'] == 'WYNN'] ] , axis = 0)

#This part just removes incomplete data.
for ticker in selected_stocks['Name'].unique().tolist():
    if selected_stocks['Name'].value_counts()[ticker] != 1259:
        selected_stocks = selected_stocks[selected_stocks.Name != ticker]
selected_stocks['Name'].unique()

array(['AAL', 'AAPL', 'ABBV', 'ABC', 'ABT', 'ACN', 'ADBE', 'ADI', 'ADP',
       'ADSK', 'ADS', 'AET', 'AFL', 'AGN', 'AIZ', 'AJG', 'AKAM', 'ALB',
       'ALGN', 'ALK', 'ALL', 'AMAT', 'AMD', 'AME', 'AMGN', 'AMP', 'AMT',
       'AMZN', 'ANDV', 'ANSS', 'ANTM', 'AON', 'AOS', 'APD', 'APH', 'ARE',
       'ATVI', 'AVGO', 'AVY', 'AWK', 'AYI', 'AZO', 'BAC', 'BA', 'BBT',
       'BBY', 'BDX', 'BF.B', 'BIIB', 'BK', 'BLK', 'BLL', 'BRK.B', 'BSX',
       'CAT', 'CBG', 'CBOE', 'CB', 'CCL', 'CDNS', 'CELG', 'CHTR', 'CI',
       'CMA', 'CMCSA', 'CME', 'CMI', 'CNC', 'COF', 'COL', 'COO', 'COST',
       'CRM', 'CSCO', 'CSX', 'CTAS', 'CTSH', 'C', 'DAL', 'DE', 'DFS',
       'DG', 'DHI', 'DIS', 'DLR', 'DLTR', 'DPS', 'DRI', 'EA', 'ECL',
       'EFX', 'EL', 'EOG', 'EQIX', 'ETFC', 'EW', 'EXPE', 'EXR', 'FBHS',
       'FB', 'FDX', 'FISV', 'FIS', 'FITB', 'FLIR', 'FMC', 'GD', 'GILD',
       'GLW', 'GOOGL', 'GPN', 'GRMN', 'GS', 'GT', 'HAS', 'HBAN', 'HBI',
       'HCA', 'HD', 'HIG', 'HII', 'HOLX', 'HON', 'HRL', 'HRS', '

In [215]:
#Next 2 blocks are trial blocks.

facebook = selected_stocks[selected_stocks['Name'] == 'FB']
facebook = get_stock_indicators(facebook)
facebook_pc1 = principal_components(facebook, 'FB')
scaler = StandardScaler()
facebook_for_pca = scaler.fit_transform(facebook.iloc[:,1:].dropna().values)


transformed_stocks = pd.DataFrame(index = range(len(facebook_pc1)))

tickers = selected_stocks['Name'].unique().tolist()

fitter = PCA(svd_solver = 'full').fit(facebook_for_pca)

for ticker in tickers:
    stock = selected_stocks[selected_stocks['Name'] == ticker]
    
    stock = get_stock_indicators(stock)
    
    stock_pc = principal_components_with_fitter(stock, fitter)
    
    if len(stock_pc) != len(transformed_stocks):
        continue
    transformed_stocks[ticker] = stock_pc

% Variance Explained for FB = 0.88


In [216]:
transformed_stocks.corr()['FB'].sort_values()

NRG     -0.457644
STX     -0.357283
WYNN    -0.314002
OKE     -0.099197
WDC      0.008176
UAA      0.031152
MCK      0.059273
ADS      0.130356
TIF      0.156161
GILD     0.168093
FMC      0.181965
NTAP     0.238361
AKAM     0.246319
BIIB     0.265999
KR       0.340662
AGN      0.345303
HBI      0.353735
VFC      0.372753
EOG      0.416801
XEC      0.432573
GRMN     0.436977
CMI      0.443166
CAT      0.472085
MU       0.482476
LYB      0.507795
ABC      0.507990
UNP      0.592222
WHR      0.605606
AZO      0.606507
COF      0.621872
           ...   
AON      0.959581
SPGI     0.959592
HD       0.960617
FIS      0.961045
IT       0.961808
NEE      0.962323
ADBE     0.962626
INTU     0.962677
BSX      0.963307
HII      0.964680
AOS      0.965329
NDAQ     0.965454
MMC      0.966031
EA       0.966192
MSFT     0.966911
TMO      0.967191
UNH      0.970593
SYK      0.971060
BDX      0.971993
GOOGL    0.973151
CTAS     0.973595
AVGO     0.975430
V        0.976401
RTN      0.976650
STZ      0

In [91]:
def min_sum_row(mat):  
    
    idx = -1000  
    minSum = 1000 

    for i in range(0, len(mat[0])):  
        summation = 0
        
        for j in range(0, len(mat)):  
            summation += mat[j][i]  
            
        if (summation < minSum): 
            minSum = summation  
            idx = i  
    return idx, minSum

In [225]:
def correlation_minimized_selection_algorithm(selected_stocks, n=20 , first_stock = 'FB'):
    
    correlations = []
    diverse_stocks = [first_stock]
    
    for i in range(n):
        if i==0:
            stock = selected_stocks[selected_stocks['Name'] == first_stock]
            current_stock = first_stock
        else:
            stock = selected_stocks[selected_stocks['Name'] == next_stock]
            current_stock = next_stock
        
        stock = get_stock_indicators(stock)
        stock_pc1 = principal_components(stock, current_stock)
        scaler = StandardScaler()
        stock_for_pca = scaler.fit_transform(stock.iloc[:,1:].dropna().values)


        transformed_stocks = pd.DataFrame(index = range(len(stock_pc1)), columns = [current_stock])
        transformed_stocks[current_stock] = stock_pc1

        tickers = selected_stocks['Name'].unique().tolist()

        fitter = PCA(svd_solver = 'full').fit(stock_for_pca)

        for ticker in tickers:
            stock = selected_stocks[selected_stocks['Name'] == ticker]

            stock = get_stock_indicators(stock)

            stock_pc = principal_components_with_fitter(stock, fitter)

            if len(stock_pc) != len(transformed_stocks):
                continue
            transformed_stocks[ticker] = stock_pc
        
        correlations.append(transformed_stocks.corr()[current_stock].sort_index().tolist())
        min_index, min_sum = min_sum_row(correlations)
        print('Sum of the correlations is {}'.format(round(min_sum,2)))
        correlations[0][min_index] = 100.0
        next_stock = transformed_stocks.corr()[current_stock].index[min_index]
        if next_stock in diverse_stocks:
            print('Not adding ' + next_stock + ' again')
            continue
        diverse_stocks.append(next_stock)
        print('Next Most Diverse Stock is '+next_stock)
        
    return diverse_stocks, correlations

In [222]:
#First of 'seed' ticker = Facebook (FB)
diverse_stocks, correlations = correlation_minimized_selection_algorithm(selected_stocks, 5, 'FB')
diverse_stocks

% Variance Explained for FB = 0.88
Sum of the correlations is -0.46
Next Most Diverse Stock is NRG
% Variance Explained for NRG = 0.86
Sum of the correlations is -0.41
Next Most Diverse Stock is UAA
% Variance Explained for UAA = 0.83
Sum of the correlations is 0.06
Next Most Diverse Stock is FLIR
% Variance Explained for FLIR = 0.83
Sum of the correlations is 0.37
Next Most Diverse Stock is STX
% Variance Explained for STX = 0.85
Sum of the correlations is 0.18
Next Most Diverse Stock is AWK


['FB', 'NRG', 'UAA', 'FLIR', 'STX', 'AWK']

**(Facebook, Nuclear Energy, Under Armor, Thermal Imaging, Seagate Tech, American Water Works)**

**Clearly, the algorithm is self sorting industries! The stock's couldn't be more different!**

In [226]:
#First of 'seed' ticker = Lockheed Martin (LMT)
diverse_stocks, correlations = correlation_minimized_selection_algorithm(selected_stocks, 5, 'LMT')
diverse_stocks

% Variance Explained for LMT = 0.84
Sum of the correlations is -0.44
Next Most Diverse Stock is NRG
% Variance Explained for NRG = 0.86
Sum of the correlations is -0.37
Next Most Diverse Stock is UAA
% Variance Explained for UAA = 0.83
Sum of the correlations is 0.07
Next Most Diverse Stock is FLIR
% Variance Explained for FLIR = 0.83
Sum of the correlations is 0.41
Next Most Diverse Stock is WYN
% Variance Explained for WYN = 0.83
Sum of the correlations is 0.57
Next Most Diverse Stock is STT


['LMT', 'NRG', 'UAA', 'FLIR', 'WYN', 'STT']

**(Lockheed, Nuclear Energy, Under Armor, Thermal Imaging, Hotels, Financial Services)**

In [228]:
#First of 'seed' ticker = Wynn Resorts (WYNN)
diverse_stocks, correlations = correlation_minimized_selection_algorithm(selected_stocks, 5, 'WYNN')
diverse_stocks

% Variance Explained for WYNN = 0.85
Sum of the correlations is -0.67
Next Most Diverse Stock is OKE
% Variance Explained for OKE = 0.85
Sum of the correlations is -1.24
Next Most Diverse Stock is KMX
% Variance Explained for KMX = 0.83
Sum of the correlations is -0.82
Next Most Diverse Stock is UAA
% Variance Explained for UAA = 0.83
Sum of the correlations is -0.32
Next Most Diverse Stock is HON
% Variance Explained for HON = 0.83
Sum of the correlations is 0.51
Next Most Diverse Stock is AWK


['WYNN', 'OKE', 'KMX', 'UAA', 'HON', 'AWK']

**(Resort Company, Natural Gas, Car Retailer, Under Armor, Honeywell, Water)**

In [229]:
#First of 'seed' ticker = BestBuy (BBY)
diverse_stocks, correlations = correlation_minimized_selection_algorithm(selected_stocks, 5, 'BBY')
diverse_stocks

% Variance Explained for BBY = 0.84
Sum of the correlations is -0.31
Next Most Diverse Stock is UAA
% Variance Explained for UAA = 0.83
Sum of the correlations is -0.54
Next Most Diverse Stock is NOC
% Variance Explained for NOC = 0.87
Sum of the correlations is -0.79
Next Most Diverse Stock is WYN
% Variance Explained for WYN = 0.83
Sum of the correlations is -0.25
Next Most Diverse Stock is NVDA
% Variance Explained for NVDA = 0.88
Sum of the correlations is -0.54
Next Most Diverse Stock is STX


['BBY', 'UAA', 'NOC', 'WYN', 'NVDA', 'STX']

**(BestBuy, UnderArmor, Northrop Grumman, Hotels, NVIDIA, Seagate)**

Further Work: 

Distinguish between period specific and industry specific patterns (like UAA is clearly period specific).

Add more than 5 stocks.

What is the measure of a good 'five-pack'? Maybe the final correlation sum being close to zero?

Start calculating Sharpe Ratios for each portfolio.