In [2]:
import random
import numpy as np
import pandas as pd
import math

from deap import base
from deap import creator
from deap import tools
from deap import algorithms

In [646]:
def random_picks(num_stocks,max_select):
    weights = np.zeros(num_stocks)
    for i in range(max_select):
        stock_pick = random.randint(0,num_stocks-1)
        while (weights[stock_pick] != 0):
            stock_pick = random.randint(0,num_stocks-1)
        weights[stock_pick] = random.randint(1,10)/200
    #weights = weights/np.sum(weights)
    return weights

In [643]:
def mutPick(individual):
    
    stock_pick_mut = random.randint(0,len(individual)-1)
    if random.random() < 0.5:
        individual[stock_pick_mut] = 0
    else:
        individual[stock_pick_mut] = random.randint(1,10)/200
    return individual,

In [909]:
def evalPortfolio(individual,max_stock=50):
    weights = list(individual)/np.sum(individual)
    port_var = (weights * df_stockcov * weights.T).sum().sum()
    returns = (weights * df_returns['return']).sum()
    
    # ensure that we do not exceed 50 stocks
    num_stocks = sum([item > 0.00 for item in individual])
    
    if num_stocks > max_stock:
        port_var += 1000
        
    # ensure that no sector exceeds 25% of the total portfolio
    df_weight_sector = pd.merge(pd.DataFrame(weights),df_symbol_sector,left_index=True,right_index=True)

    df_sectorweights = df_weight_sector.groupby('sector').sum().reset_index()
    #df_sectorweights.columns = ['symbol','sec_weight']
    #print(df_sectorweights)
    #print(df_sectorweights.info())
    for index,row in df_sectorweights.iterrows():
        if row[1] > 0.2:
            port_var += 1000
    
    return port_var,returns

In [910]:
creator.create("Fitness", base.Fitness, weights=(-1.0,1.0))
creator.create("Individual", list, fitness=creator.Fitness)

toolbox = base.Toolbox()
NBR_STOCKS = 503

toolbox.register("attr_weights", random_picks, NBR_STOCKS, 20)

toolbox.register("individual", tools.initIterate, creator.Individual, 
    toolbox.attr_weights)

toolbox.register("population", tools.initRepeat, list, toolbox.individual)

In [911]:
toolbox.register("evaluate", evalPortfolio,max_stock=50)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", mutPick) #tools.mutGaussian,mu=2,sigma=0.8,indpb=0.2)
toolbox.register("select", tools.selNSGA2)

In [486]:
#import stock prices here
df_stocks = pd.read_csv('df_price_data.csv')

df_stocks = df_stocks[(df_stocks['dateYYMD'] >'2007') & (df_stocks['dateYYMD']<'2017')]
df_stocks = df_stocks.set_index(['symbol','dateYYMD']).sort_index()
df_stocks['PctChg'] = df_stocks['close'].astype(float)/df_stocks['close'].shift(1).astype(float) - 1
df_stocks = df_stocks.reset_index()


In [942]:
# get cumulative 10 year return of each stock
df_endprice = df_stocks[df_stocks['dateYYMD']=='2016-12-31'][['symbol','adjclose']] #/(df_stocks[df_stocks['dataYYMD']=='2007-01-04'])
df_endprice = df_endprice[df_endprice['adjclose'].notnull()]

df_startprice_10 = df_stocks[df_stocks['dateYYMD']=='2007-01-04'][['symbol','adjclose']]
df_startprice_10 = df_startprice_10[df_startprice_10['adjclose'].notnull()]

df_startprice_5 = df_stocks[df_stocks['dateYYMD']=='2012-01-04'][['symbol','adjclose']]
df_startprice_5 = df_startprice_5[df_startprice_5['adjclose'].notnull()]

df_startprice_3 = df_stocks[df_stocks['dateYYMD']=='2014-01-03'][['symbol','adjclose']]
df_startprice_3 = df_startprice_3[df_startprice_3['adjclose'].notnull()]

df_startprice_1 = df_stocks[df_stocks['dateYYMD']=='2016-01-05'][['symbol','adjclose']]
df_startprice_1 = df_startprice_1[df_startprice_1['adjclose'].notnull()]

df_returns_10 = pd.merge(df_endprice,df_startprice_10,on='symbol',how='outer',suffixes=['_end','_start'])
df_returns_10['return'] = df_returns_10['adjclose_end']/df_returns_10['adjclose_start']
df_returns_10 = df_returns_10.set_index('symbol').sort_index()
df_returns_10 = df_returns_10.fillna(-1000,axis=1)

df_returns_5 = pd.merge(df_endprice,df_startprice_5,on='symbol',how='outer',suffixes=['_end','_start'])
df_returns_5['return'] = df_returns_5['adjclose_end']/df_returns_5['adjclose_start']
df_returns_5 = df_returns_5.set_index('symbol').sort_index()
df_returns_5 = df_returns_5.fillna(-1000,axis=1)

df_returns_3 = pd.merge(df_endprice,df_startprice_3,on='symbol',how='outer',suffixes=['_end','_start'])
df_returns_3['return'] = df_returns_3['adjclose_end']/df_returns_3['adjclose_start']
df_returns_3 = df_returns_3.set_index('symbol').sort_index()
df_returns_3 = df_returns_3.fillna(-1000,axis=1)

df_returns_1 = pd.merge(df_endprice,df_startprice_1,on='symbol',how='outer',suffixes=['_end','_start'])
df_returns_1['return'] = df_returns_1['adjclose_end']/df_returns_1['adjclose_start']
df_returns_1 = df_returns_1.set_index('symbol').sort_index()
df_returns_1 = df_returns_1.fillna(-1000,axis=1)

#series_symbols = df_returns

In [830]:
NBR_STOCKS=503
df_stock_sector = pd.read_csv('sp500list.csv')
print(len(df_stock_sector))
dict_stock_sector = {}

for row in df_stock_sector.iterrows():
    symbol = row[1]['Symbol']
    sector = row[1]['Sector']
    dict_stock_sector[symbol] = {'symbol':symbol,
                                 'sector':sector,
                                 'value': 0,}
    
df_stocksector = pd.DataFrame.from_dict(dict_stock_sector,orient='index').reset_index().drop(['index','value'],axis=1)
df_stocksector = df_stocksector.set_index('symbol').reset_index()

505


In [943]:
# calculate covariance matrix to get portfolio variance in the fitness function
df_stockcol = pd.pivot_table(df_stocks.reset_index(),values='PctChg',index=['dateYYMD'],columns=['symbol'])
df_stockcov = df_stockcol.cov()

df_variance = df_stocks.groupby('symbol')['PctChg'].var()

df_symbol_sector = pd.merge(df_stocks.groupby('symbol').sum()['PctChg'].reset_index(),df_stocksector).drop('PctChg',axis=1)

In [949]:
def main():
    pop = toolbox.population(n=100)
    hof = tools.HallOfFame(3) #ParetoFront()
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean,axis=0)
    stats.register("std", np.std, axis=0)
    stats.register("min", np.min,axis=0)
    stats.register("max", np.max,axis=0)
    
    pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.05, ngen=100, stats=stats, halloffame=hof, verbose=True)
    
    return pop, logbook, hof

In [950]:
pop, log, hof = main()

gen	nevals	avg                            	std                                	min                      	max                            
0  	100   	[ 1300.00669354     2.56684097]	[  7.54983522e+02   5.26796834e-01]	[ 0.00554597  1.45375515]	[ 3000.0075984      4.11576263]
1  	64    	[ 1300.00676324     2.56887682]	[  7.68115020e+02   5.54887599e-01]	[ 0.00500878  1.32592064]	[ 3000.00986925     4.14546764]
2  	57    	[ 1150.00675264     2.56482908]	[  7.53326411e+02   5.35920214e-01]	[ 0.0044294   1.47175966]	[ 3000.01035773     4.14546764]
3  	50    	[ 1130.00677814     2.5666941 ]	[  7.70130309e+02   5.53545381e-01]	[ 0.00485355  1.55028476]	[ 3000.01113571     4.14546764]
4  	45    	[ 1190.00677842     2.55364088]	[  7.30685144e+02   5.55423928e-01]	[ 0.00501636  1.41085488]	[ 3000.01406699     4.56356259]
5  	57    	[ 1220.00676149     2.56230426]	[  6.56963082e+02   5.90027057e-01]	[ 0.00536916  1.41085488]	[ 3000.01406699     4.56356259]
6  	59    	[ 1130.00682299     2.56375489

59 	50    	[ 1230.0067294      2.46719117]	[ 881.53364913    0.88319798]      	[ 0.00301397  0.94909815]	[ 3000.01429893     6.63008022]
60 	56    	[ 1290.00673947     2.4617755 ]	[ 840.17935116    0.88894756]      	[ 0.00301397  0.94909815]	[ 3000.0107652      6.63008022]
61 	57    	[ 1240.00674091     2.4572349 ]	[ 849.94219646    0.8839858 ]      	[ 0.00301397  1.020052  ]	[ 3000.01383873     6.20665964]
62 	61    	[ 1280.0067328      2.46128016]	[ 849.47137072    0.88610013]      	[ 0.00301397  0.94460668]	[ 4000.00864656     6.20665964]
63 	49    	[ 1270.00673202     2.45435851]	[ 810.61804619    0.89199762]      	[ 0.00318412  1.020052  ]	[ 3000.01228199     6.23755107]
64 	60    	[ 1250.00675314     2.45408481]	[ 817.00800303    0.89008022]      	[ 0.00318412  1.020052  ]	[ 3000.01714292     6.23755107]
65 	53    	[ 1340.00673706     2.46237025]	[ 790.19088414    0.95589318]      	[ 0.00292578  1.03851011]	[ 3000.01714292     7.2663784 ]
66 	52    	[ 1310.00679152     2.463511  

In [917]:
hof_weights = hof.items[0]/np.sum(hof.items[0])

In [952]:
evalPortfolio(hof.items[0])

(0.002344368074570568, 2.6361923349326797)

In [956]:
def getStockWeightings(individual):
    df_weightings = pd.DataFrame(list(zip(df_returns_10.index,individual,df_returns_10['return'],df_returns_5['return'],df_returns_3['return'],df_returns_1['return'],df_variance)))
    df_weightings.columns = ['symbol','weighting','return10','return5','return3','return1','variance']
    df_weightings = pd.merge(df_weightings,df_stocksector,on='symbol')
    return(df_weightings)

In [958]:
weightings = getStockWeightings(hof.items[0]/np.sum(hof.items[0]))
df_weightings = pd.DataFrame(weightings)

In [959]:
df_weightings.sort_values('weighting',ascending=False).head(50).to_csv('weightings.csv',sep=',')

In [960]:
df_weightings.groupby('sector')['weighting'].sum()

sector
Consumer Discretionary        0.078049
Consumer Staples              0.102439
Energy                        0.034146
Financials                    0.087805
Health Care                   0.107317
Industrials                   0.092683
Information Technology        0.326829
Materials                     0.058537
Real Estate                   0.000000
Telecommunication Services    0.000000
Utilities                     0.112195
Name: weighting, dtype: float64

In [961]:
df_returns_10

Unnamed: 0_level_0,adjclose_end,adjclose_start,return
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,45.252983,23.154404,1.954401
AAL,46.290821,54.455921,0.850060
AAP,168.863815,34.199600,4.937596
AAPL,113.986984,10.731588,10.621632
ABBV,60.415474,-1000.000000,-1000.000000
ABC,76.828621,19.467813,3.946443
ABT,37.539089,17.547220,2.139318
ACN,114.815262,28.963879,3.964084
ADBE,102.949997,39.919998,2.578908
ADI,71.052734,24.162249,2.940651
