In [2]:
import random
import numpy as np
import pandas as pd
import math

from deap import base
from deap import creator
from deap import tools
from deap import algorithms

In [298]:
def evalPortfolio(individual):
    weights = list(individual)/np.sum(individual)
    port_var = (weights * df_stockcov * weights.T).sum().sum()
    returns = (weights * df_returns['return']).sum()
    return port_var,returns

In [299]:
creator.create("Fitness", base.Fitness, weights=(-1.0,1.0))
creator.create("Individual", list, fitness=creator.Fitness)

toolbox = base.Toolbox()
NBR_STOCKS = 503

toolbox.register("attr_weights", random.random)

toolbox.register("individual", tools.initRepeat, creator.Individual, 
    toolbox.attr_weights,n=NBR_STOCKS)

toolbox.register("population", tools.initRepeat, list, toolbox.individual)

In [300]:
toolbox.register("evaluate", evalPortfolio)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutGaussian,mu=0.2,sigma=0.8,indpb=0.2)
toolbox.register("select", tools.selTournament, tournsize=2)

In [364]:
#import stock prices here
df_stocks = pd.read_csv('df_price_data.csv')

df_stocks = df_stocks[(df_stocks['dateYYMD'] >'2007') & (df_stocks['dateYYMD']<'2017')]
df_stocks = df_stocks.set_index(['symbol','dateYYMD']).sort_index()
df_stocks['PctChg'] = df_stocks['close'].astype(float)/df_stocks['close'].shift(1).astype(float) - 1
df_stocks = df_stocks.reset_index()

In [366]:
# get cumulative 10 year return of each stock
df_endprice = df_stocks[df_stocks['dateYYMD']=='2016-12-31'][['symbol','adjclose']] #/(df_stocks[df_stocks['dataYYMD']=='2007-01-04'])
df_endprice = df_endprice[df_endprice['adjclose'].notnull()]
df_startprice = df_stocks[df_stocks['dateYYMD']=='2007-01-04'][['symbol','adjclose']]
df_startprice = df_startprice[df_startprice['adjclose'].notnull()]
df_returns = pd.merge(df_endprice,df_startprice,on='symbol',how='outer',suffixes=['_end','_start'])
df_returns['return'] = df_returns['adjclose_end']/df_returns['adjclose_start']
df_returns = df_returns.set_index('symbol').sort_index()

series_symbols = df_returns

In [367]:
NBR_STOCKS=503
df_stock_sector = pd.read_csv('sp500list.csv')
print(len(df_stock_sector))
dict_stock_sector = {}

for row in df_stock_sector.iterrows():
    symbol = row[1]['Symbol']
    sector = row[1]['Sector']
    dict_stock_sector[symbol] = {'symbol':symbol,
                                 'sector':sector,
                                 'value': 0,}

505


In [368]:
# calculate covariance matrix to get portfolio variance in the fitness function
df_stockcol = pd.pivot_table(df_stocks.reset_index(),values='PctChg',index=['dateYYMD'],columns=['symbol'])
df_stockcov = df_stockcol.cov()

In [369]:
def main():
    pop = toolbox.population(n=100)
    hof = tools.HallOfFame(3) #ParetoFront()
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean,axis=0)
    stats.register("std", np.std, axis=0)
    stats.register("min", np.min,axis=0)
    stats.register("max", np.max,axis=0)
    
    pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=100, stats=stats, halloffame=hof, verbose=True)
    
    return pop, logbook, hof

In [370]:
pop, log, hof = main()

gen	nevals	avg                                	std                                	min                                	max                                
0  	100   	[  2.78133106e-04   2.69159749e+00]	[  7.27644499e-06   7.27181813e-02]	[  2.59185235e-04   2.50008253e+00]	[  2.98311218e-04   2.84672241e+00]
1  	63    	[  2.98957887e-04   2.68622373e+00]	[  4.28105887e-05   8.26992046e-02]	[  2.59185235e-04   2.48097071e+00]	[  4.22907971e-04   2.93443402e+00]
2  	56    	[  3.00129814e-04   2.67830911e+00]	[  4.15519510e-05   8.54439043e-02]	[  2.59109712e-04   2.42168132e+00]	[  3.96398038e-04   3.04242194e+00]
3  	61    	[  2.96681935e-04   2.67650009e+00]	[  4.09024826e-05   6.69419463e-02]	[  2.59109712e-04   2.51931393e+00]	[  4.42333826e-04   2.83773049e+00]
4  	60    	[  2.90494503e-04   2.69358283e+00]	[  3.94826582e-05   9.23257901e-02]	[  2.56003331e-04   2.50546297e+00]	[  4.25944950e-04   3.15945740e+00]
5  	62    	[  2.83784585e-04   2.67233172e+00]	[  3.39296405e-05   7.9

52 	54    	[  2.48906347e-04   2.59061300e+00]	[  3.73670609e-05   4.03263904e-02]	[  2.25513023e-04   2.35502448e+00]	[  3.85645606e-04   2.72066920e+00]
53 	58    	[  2.45511508e-04   2.58815976e+00]	[  3.43213650e-05   4.29714028e-02]	[  2.25513023e-04   2.38751000e+00]	[  3.49217382e-04   2.69089214e+00]
54 	54    	[  2.48794425e-04   2.58009528e+00]	[  3.72540042e-05   6.18404352e-02]	[  2.25515594e-04   2.31472033e+00]	[  3.42051689e-04   2.76789366e+00]
55 	64    	[  2.55259747e-04   2.58896213e+00]	[  4.18859925e-05   6.16887548e-02]	[  2.22395739e-04   2.39606689e+00]	[  3.83329873e-04   2.77935293e+00]
56 	56    	[  2.46111749e-04   2.58966634e+00]	[  3.60377084e-05   4.97782825e-02]	[  2.25515594e-04   2.44406512e+00]	[  3.50219916e-04   2.75176102e+00]
57 	65    	[  2.45451101e-04   2.58565025e+00]	[  3.65827651e-05   3.99028825e-02]	[  2.25515594e-04   2.45387442e+00]	[  3.74759265e-04   2.72930413e+00]
58 	58    	[  2.44727994e-04   2.58740002e+00]	[  3.60585129e-05   3.5

In [332]:
hof_weights = hof.items[0]/np.sum(hof.items[0])

In [350]:
evalPortfolio(hof.items[0])

(0.00022666092017927506, 2.761944152913746)

In [339]:
def getStockWeightings(individual):
    return(list(zip(df_returns.index,individual)))

In [372]:
weightings = getStockWeightings(hof.items[0]/np.sum(hof.items[0]))
df_weightings = pd.DataFrame(weightings)

In [376]:
df_weightings.sort_values(1,ascending=False)

Unnamed: 0,0,1
182,FDX,0.014199
100,CLX,0.005732
205,GOOG,0.004288
204,GM,0.004277
171,EVHC,0.004116
92,CFG,0.003946
71,BLL,0.003688
103,CME,0.003684
282,LLL,0.003666
84,CBOE,0.003664
