In [None]:
def load_data(symbols, since='2016-06-01'):
    """Load closing prices for the given tickers into one DataFrame.

    Runs one query per ticker against the ``stock_quotes`` table through
    ``InvestDao.sql`` and stores the ``close`` values, ordered by
    ``dtyymmdd``, as a float column named after the ticker.

    Parameters
    ----------
    symbols : iterable of str
        Ticker names; one column is produced per ticker, in the given order.
        Any iterable is accepted because it is materialized once up front.
    since : str, optional
        Only rows whose ``dtyymmdd`` is strictly greater than this value are
        loaded. Defaults to ``'2016-06-01'``.

    Returns
    -------
    pandas.DataFrame
        One float column per ticker. Rows are aligned by position only (the
        query does not return the date), so every ticker must yield the same
        number of rows.

    Raises
    ------
    ValueError
        If the tickers do not all return the same number of rows.

    Notes
    -----
    ``since`` and the ticker names are interpolated directly into the SQL
    text, so they must come from trusted input only.
    """
    import pandas as pd
    from dao import InvestDao

    query = ("SELECT close FROM stock_quotes WHERE dtyymmdd > '%s' "
             "AND ticker = '%s' ORDER BY dtyymmdd")

    dao = InvestDao()
    # Materialize once so generators and other one-shot iterables work.
    symbols = list(symbols)

    closes = {}
    for sym in symbols:
        rows = dao.sql(query % (since, sym))
        closes[sym] = [float(r[0]) for r in rows]

    # Columns are combined positionally, so unequal lengths would either fail
    # with an obscure pandas error or silently produce NaN columns.
    row_counts = {sym: len(values) for sym, values in closes.items()}
    if len(set(row_counts.values())) > 1:
        raise ValueError(
            "tickers returned different numbers of rows: %r" % (row_counts,))

    # Build the frame in one step; dict order keeps the requested column order.
    return pd.DataFrame(closes)

In [None]:
# Normalization helpers.

def scale_function(x):
    """Standardize ``x``: subtract its mean, then divide by its standard deviation."""
    centered = x - x.mean()
    return centered / x.std()


def get_we(x):
    """Rescale ``x`` so that its entries sum to one (i.e. turn it into weights)."""
    total = x.sum()
    return x / total

In [None]:
def pca_index(n_comps, data):
    """Combine stock price columns into a single PCA-based index series.

    PCA converts a set of observations of possibly correlated variables (the
    stocks) into values of linearly uncorrelated variables, the principal
    components. Here the components are collapsed into one series by weighting
    each of them with its share of the summed eigenvalues.

    Parameters
    ----------
    n_comps : int
        Number of principal components to keep (``KernelPCA(n_components=...)``).
    data : pandas.DataFrame
        One column per stock, one row per observation.

    Returns
    -------
    numpy.ndarray
        Dot product of the transformed components with the normalized
        eigenvalue weights; presumably one value per row of ``data``.

    Notes
    -----
    Relies on the notebook-level helpers ``scale_function`` and ``get_we``.
    """
    import numpy as np
    from sklearn.decomposition import KernelPCA

    pca = KernelPCA(n_components=n_comps).fit(data.apply(scale_function))
    # NOTE(review): the model is fitted on standardized columns but applied to
    # the raw, negated prices. Kept as is to preserve results; confirm that the
    # sign flip and the missing scaling are intentional.
    pca_components = pca.transform(-data)

    # scikit-learn 1.0 renamed ``lambdas_`` to ``eigenvalues_`` and 1.2 removed
    # the old name; fall back to ``lambdas_`` only on older releases.
    eigenvalues = getattr(pca, 'eigenvalues_', None)
    if eigenvalues is None:
        eigenvalues = pca.lambdas_

    weights = get_we(eigenvalues)
    return np.dot(pca_components, weights)

In [None]:
# Tickers whose closing prices are the inputs to the PCA index.
component_tickers = [
    'ALIOR', 'ASSECOPOL', 'BZWBK', 'CCC', 'CYFRPLSAT',
    'ENERGA', 'EUROCASH', 'JSW', 'KGHM', 'LOTOS',
    'LPP', 'MBANK', 'ORANGEPL', 'PEKAO', 'PGE',
    'PGNIG', 'PKNORLEN', 'PKOBP', 'PZU', 'TAURONPE',
]
stocks_data = load_data(component_tickers)

# Reference series the PCA index is compared against.
index_data = load_data(['WIG20'])

# Store the five-component PCA index next to the WIG20 column.
pca_levels = pca_index(5, stocks_data)
index_data['PCA_5'] = pca_levels

In [None]:
import matplotlib.pyplot as plt 
%matplotlib inline

# Compare the original WIG20 index with the PCA index; both columns are
# standardized first so they can share one axis.
normalized_levels = index_data.apply(scale_function)
normalized_levels.plot(figsize=(8, 4))