# Build SET100 Dataset

1. Start from set100_union.csv  
2. Build table "stock"  
3. Build table "timeseries"  
4. Build table "set100_membership_periods"  


dataset/Datastream  

1_DY.xlsm
2_EPS.xlsm
3_MV.xlsm
4_P.xlsm
5_PH.xlsm
6_PL.xlsm
7_PO.xlsm
8_VA.xlsm
9_VO.xlsm
10_UPO.xlsm
11_UP.xlsm
12_UPH.xlsm
13_UPL.xlsm
14_UVO.xlsm
15_PTBV.xlsm
16_PE.xlsm
17_PI.xlsm
18_RI.xlsm
19_MTBV.xlsm
20_NOSH.xlsm
21_DWTA.xlsm


In [1]:
import sys, os, os.path
if not 'EXPERIMENT_HOME' in os.environ: # Configure local paths if not already defined
    ML_HOME = os.path.abspath(os.path.join("..", "ml_home")) # ML workspace
    EXPERIMENT_HOME = os.path.abspath(os.path.join(ML_HOME, "..")) # Experiment workspace
    DATA_HOME = os.path.abspath(os.path.join(EXPERIMENT_HOME,"dataset")) # Dataset location
    os.environ["EXPERIMENT_HOME"] = EXPERIMENT_HOME
    os.environ["DATA_HOME"] = DATA_HOME
    sys.path.insert(0, EXPERIMENT_HOME)
    sys.path.insert(0, ML_HOME) # Add to path so can load our library
    os.chdir(EXPERIMENT_HOME) # Change working directory to experiment workspace
print("Experiment Home: ", os.path.abspath(os.curdir), "; Data Home:", DATA_HOME)

Experiment Home:  /home/jovyan ; Data Home: /home/jovyan/dataset


In [2]:
from datetime import datetime
import pandas as pd
import pandas_datareader as pdr
import datetime as dt
import xlrd
import matplotlib.pyplot as plt
%matplotlib inline

df = pd.read_csv(os.path.join(DATA_HOME,"set100_union.csv"))
cols = df.columns
tickers = df[cols[1]]
tickers

0       CPALL
1      ADVANC
2       BLAND
3        GPSC
4         TTW
        ...  
158    SAMTEL
159       PSH
160        QH
161       SIM
162       AAV
Name: 0, Length: 163, dtype: object

In [3]:
os.chdir("/home/jovyan/dataset/eikon-data")

lookup = pd.read_csv('lookup-stock.csv')
lookup.tail()

Unnamed: 0.1,Unnamed: 0,ISIN,RIC,name,symbol
606,,,,JKN GLOBAL MEDIA,JKN
607,,,,JAS ASSET NVDR,J
608,,,,KIATNAKIN PHATRA BANK,KKP
609,,,,MONO NEXT,MONO
610,,,,SEAMICO CAPITAL,ZMICO


In [4]:
# Example lookup

# lookup.loc[lookup['symbol']=='LHFG']
lookup.loc[lookup['symbol']=='SAPPE']

Unnamed: 0.1,Unnamed: 0,ISIN,RIC,name,symbol
400,400.0,TH5614010004,TH:SAPPE,SAPPE,SAPPE


# Price Series

In [5]:
os.chdir("/home/jovyan/dataset/Datastream")

def price_value(x):
    if x.value == 'NA':
        return None
    else:
        return float(x.value)

def get_price_series():
    fname = '4_P.xlsm'
    wb = xlrd.open_workbook(fname)
    series = {}
    for i in range(2,8): # Note the sheet index 2014-2019
        s = wb.sheets()[i]

        # Date index
        s1 = pd.Series(s.col(0)[4:]) # observation starts at row 4
        s1 = s1.apply(lambda x: datetime(*xlrd.xldate_as_tuple(x.value, wb.datemode)))

        for j in range(1, s.ncols):
            c = s.col(j)
            # print("{}".format(c[3].value)) # Print column name
            name = c[3].value
            # df = pd.DataFrame()
            if not "#ERROR" in name:
                # Convert spreadsheet value to python
                s2 = pd.Series(c[4:]) # observation starts at row 4
                s2 = s2.apply(price_value)
                df = pd.concat([s1, s2], axis=1)

                if not name in series:
                    series[name] = []
                series[name].append(df)
    return series

def rekey_and_stack(sin):
    sout = {}
    # Change key - missing key due to name change. Add verbatim name to database.
    keys = sin.keys()
    for k in keys:
        newkey = k.replace('','')
        # TODO change this with lookup function
        # symbol = lookup.loc[lookup['name']==newkey].symbol.values[0]
        symbol = newkey
        sout[symbol] = sin[k]
    # Merge years
    for s in sout.keys():   
        stock = pd.concat(sout[s], axis=0)
        stock.columns = ['Date', 'Price']
        stock = stock.set_index('Date') # Set date index
        sout[s] = stock['Price']
    return sout

series = get_price_series()
price_series = rekey_and_stack(series)
price_series.keys()


dict_keys(['SEVEN UTILITIES AND POWER ORS', 'AREEYA PROPERTY', 'ASIA AVIATION', 'AMATA B GRIMM PWPL.IFCF.', 'ADVANCED CONNECTION CORPORATION', 'ABSOLUTE CLEAN ENERGY', 'AUTOCORP HOLDING', 'ADVANCED INFO SER.', 'AEC SECURITIES', 'AE.THANA SINSAP (THAI.)', 'ASIA FIBER', 'ASIA GREEN ENERGY', 'AAPICO HITECH', 'AIKCHOL HOSPITAL', 'ASIAN INSULATORS', 'AIM COML.GW.FH.&. LH. REIT UNT.', 'AIM INDL.GW.FH.&. LH. REIT', 'ADVANCED INFO.TECH.', 'AJ PLAST', 'AJ ADVANCE TECHNOLOGY ORS', 'EKARAT ENGINEERING', 'ALLA', 'ALT TELECOM', 'ALUCON', 'AMANAH LEASING', 'AMARIN PRINTING & PBL.', 'AMATA', 'AMATA NVDR', 'AMATA VN', 'ASIA METAL', 'ANANDA DEVELOPMENT', 'AIRPORTS OF THAILAND', 'AP (THAILAND)', 'ASIAN PHYTOCEUTICALS', 'ASIA PRECISION', 'APEX DEVELOPMENT', 'AGRIPURE HOLDINGS', 'AQ ESTATE', 'AQUA', 'ASIASOFT', 'SYNERGETIC AUTO PERFORMANCE', 'ASEFA', 'ASIA HOTEL', 'ASIAN SEA CORPORATION', 'ASIAN MARINE SERVICES', 'ASIA SERMKIJ LEASING', 'ASIA PLUS GROUP HDG.', 'ASSET WORLD', 'ALLIANZ AYUDHYA CAPITAL', 'BU

In [6]:
df = pd.DataFrame()
idx = None
for k,v in price_series.items():
    idx = v.index
    v = v.reset_index(drop=True)
    df[k] = v
df.index = idx

df

Unnamed: 0_level_0,SEVEN UTILITIES AND POWER ORS,AREEYA PROPERTY,ASIA AVIATION,AMATA B GRIMM PWPL.IFCF.,ADVANCED CONNECTION CORPORATION,ABSOLUTE CLEAN ENERGY,AUTOCORP HOLDING,ADVANCED INFO SER.,AEC SECURITIES,AE.THANA SINSAP (THAI.),...,WHA PRUM.GW.FH.&. LH. REIT,WHA UTILITIES AND POWER,WICE LOGISTICS,WIIK,WYNCOAST INDUSTRIAL PARK,WORKPOINT ENTERTAINMENT,WP ENERGY,WATTANAPAT HOSPITAL TRANG,ZEN CORPORATION,SEAMICO CAPITAL
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-01,0.44,6.65,4.20,3.23,0.51,,,172.5,0.27,182.0,...,11.04,5.55,3.64,1.65,0.54,23.30,4.98,2.32,,1.00
2019-01-02,0.43,6.65,4.12,3.20,0.50,,,173.5,0.27,183.0,...,10.94,5.60,3.66,1.64,0.56,24.00,4.98,2.26,,0.99
2019-01-03,0.43,6.65,3.96,3.20,0.50,,,172.5,0.26,186.5,...,10.84,5.55,3.68,1.67,0.55,23.50,4.92,2.30,,0.98
2019-01-04,0.43,6.60,4.00,3.25,0.50,,,177.0,0.26,186.5,...,10.74,5.55,3.74,1.67,0.55,23.30,5.00,2.28,,0.99
2019-01-07,0.42,6.65,4.02,3.28,0.48,,,177.0,0.27,187.0,...,10.84,5.65,3.78,1.68,0.55,23.40,5.10,2.36,,0.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2014-12-25,3.22,5.45,4.46,4.02,0.78,,,248.0,0.80,109.5,...,9.39,,,2.28,1.00,29.80,,,,1.55
2014-12-26,3.24,5.45,4.42,3.97,0.78,,,251.0,0.81,109.0,...,9.49,,,2.23,0.99,30.51,,,,1.55
2014-12-29,3.05,5.45,4.32,3.99,0.78,,,249.0,0.79,108.0,...,9.49,,,2.16,0.96,29.80,,,,1.54
2014-12-30,3.00,5.45,4.30,3.99,0.78,,,251.0,0.80,108.5,...,9.53,,,2.19,0.96,29.80,,,,1.55


In [7]:
# Change column names: Map company names to ticker symbols


rename = {}

for name in df.columns.values:
    try:
        rename[name] = lookup.loc[lookup['name']==name].symbol.values[0]
    except:
        print(name)
df=df.rename(columns=rename)
df

Unnamed: 0_level_0,7UP,A,AAV,ABPIF,ACC,ACE,ACG,ADVANC,AEC,AEONTS,...,WHART,WHAUP,WICE,WIIK,WIN,WORK,WP,WPH,ZEN,ZMICO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-01,0.44,6.65,4.20,3.23,0.51,,,172.5,0.27,182.0,...,11.04,5.55,3.64,1.65,0.54,23.30,4.98,2.32,,1.00
2019-01-02,0.43,6.65,4.12,3.20,0.50,,,173.5,0.27,183.0,...,10.94,5.60,3.66,1.64,0.56,24.00,4.98,2.26,,0.99
2019-01-03,0.43,6.65,3.96,3.20,0.50,,,172.5,0.26,186.5,...,10.84,5.55,3.68,1.67,0.55,23.50,4.92,2.30,,0.98
2019-01-04,0.43,6.60,4.00,3.25,0.50,,,177.0,0.26,186.5,...,10.74,5.55,3.74,1.67,0.55,23.30,5.00,2.28,,0.99
2019-01-07,0.42,6.65,4.02,3.28,0.48,,,177.0,0.27,187.0,...,10.84,5.65,3.78,1.68,0.55,23.40,5.10,2.36,,0.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2014-12-25,3.22,5.45,4.46,4.02,0.78,,,248.0,0.80,109.5,...,9.39,,,2.28,1.00,29.80,,,,1.55
2014-12-26,3.24,5.45,4.42,3.97,0.78,,,251.0,0.81,109.0,...,9.49,,,2.23,0.99,30.51,,,,1.55
2014-12-29,3.05,5.45,4.32,3.99,0.78,,,249.0,0.79,108.0,...,9.49,,,2.16,0.96,29.80,,,,1.54
2014-12-30,3.00,5.45,4.30,3.99,0.78,,,251.0,0.80,108.5,...,9.53,,,2.19,0.96,29.80,,,,1.55


In [8]:
# Save to database
from sqlalchemy import create_engine
engine = create_engine('postgresql://datauser:1234@172.18.0.1:5432/stockdb', echo=False)

df.to_sql('Price', con=engine, if_exists='replace')

# Saved to CSV
# df.to_csv('4_P.csv')



In [9]:
class DatastreamTable:
    def __init__(self, fname, key_replace, data_col, table_name):        
        os.chdir("/home/jovyan/dataset/Datastream")
        self.fname = fname
        self.key_replace = key_replace
        self.data_col = data_col
        self.table_name = table_name
        ds = self.get_data_series()
        dd = self.rekey_and_stack(ds)
        # Prepare final data frame
        df = pd.DataFrame()
        idx = None
        for k,v in dd.items():
            idx = v.index
            v = v.reset_index(drop=True)
            df[k] = v
        df.index = idx
        
        # Change column names: Map company names to ticker symbols
        rename = {}

        for name in df.columns.values:
            try:
                rename[name] = lookup.loc[lookup['name']==name].symbol.values[0]
            except:
                print(name)
        df=df.rename(columns=rename)
        
        self.df = df
        
    def get_data_series(self):
        fname = self.fname
        wb = xlrd.open_workbook(fname)
        series = {}
        for i in range(2,8): # Note the sheet index 2014-2019
            s = wb.sheets()[i]
            # Date index
            s1 = pd.Series(s.col(0)[4:]) # observation starts at row 4
            s1 = s1.apply(lambda x: datetime(*xlrd.xldate_as_tuple(x.value, wb.datemode)))

            for j in range(1, s.ncols):
                c = s.col(j)
                # print("{}".format(c[3].value)) # Print each column in data file
                name = c[3].value
                # df = pd.DataFrame()
                if not "#ERROR" in name:
                    # Convert spreadsheet market cap value to python
                    s2 = pd.Series(c[4:]) # observation starts at row 4
                    s2 = s2.apply(lambda x: x.value)
                    df = pd.concat([s1, s2], axis=1)

                    if not name in series:
                        series[name] = []
                    series[name].append(df)
        return series

    def rekey_and_stack(self, sin):
        sout = {}
        keys = sin.keys()
        # Change key - column name have attribute description that is to be removed
        for k in keys:
            newkey = k.replace(self.key_replace,'')
            symbol = newkey
            sout[symbol] = sin[k]
        # Merge years
        for s in sout.keys():
            stock = pd.concat(sout[s], axis=0)
            stock.columns = ['Date', self.data_col]
            stock = stock.set_index('Date') # Set date index
            stock.sort_index(inplace=True)
            sout[s] = stock[self.data_col]
        return sout
    
    def save(self):
        self.df.to_sql(self.table_name, con=engine, if_exists='replace')
        self.df.to_csv(self.fname.replace('.xlsm','.csv'))
        print("Saved. %s" % self.table_name)

# DY 

In [10]:
dat = DatastreamTable(fname='1_DY.xlsm', key_replace=' - DIVIDEND YIELD', data_col='DY', 
                       table_name='DividendYield')

In [11]:
dat.df.tail()['CPALL']

Date
2014-12-25    2.21
2014-12-26    2.16
2014-12-29    2.21
2014-12-30    2.12
2014-12-31    2.12
Name: CPALL, dtype: float64

In [12]:
dat.save()

Saved. DividendYield


# EPS

In [13]:
dat = DatastreamTable(fname='2_EPS.xlsm', key_replace=' - EARNINGS PER SHR', data_col='EPS', 
                       table_name='EarningsPerShare')
dat.df.tail()

Unnamed: 0_level_0,7UP,A,AAV,ABPIF,ACC,ACE,ACG,ADVANC,AEC,AEONTS,...,WHABT,WHART,WICE,WIIK,WIN,WORK,WP,WPH,ZEN,ZMICO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-12-25,0.0,0.02,0.0,1.4,0.01,,,12.02,0.0,9.74,...,,,,0.05,0.0,0.18,,,,0.08
2014-12-26,0.0,0.02,0.0,1.4,0.01,,,12.02,0.0,9.74,...,,,,0.05,0.0,0.18,,,,0.08
2014-12-29,0.0,0.02,0.0,1.4,0.01,,,12.02,0.0,9.74,...,,,,0.05,0.0,0.18,,,,0.08
2014-12-30,0.0,0.02,0.0,1.4,0.01,,,12.02,0.0,9.74,...,,,,0.05,0.0,0.18,,,,0.08
2014-12-31,0.0,0.02,0.0,1.4,0.01,,,12.02,0.0,9.74,...,,,,0.05,0.0,0.18,,,,0.08


In [14]:
dat.save()

Saved. EarningsPerShare


# MV

In [15]:

dat = DatastreamTable(fname='3_MV.xlsm', key_replace=' - MARKET VALUE', data_col='MV', 
                       table_name='MarketValue')
dat.df.tail()

Unnamed: 0_level_0,7UP,A,AAV,ABPIF,ACC,ACE,ACG,ADVANC,AEC,AEONTS,...,WHART,WHAUP,WICE,WIIK,WIN,WORK,WP,WPH,ZEN,ZMICO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-12-25,2106.0,5341.0,21631.0,5430.0,915.14,,,737327.1,1484.32,27375.0,...,3061.28,,,797.83,496.48,11173.7,,,,2026.99
2014-12-26,2116.8,5341.0,21437.0,5370.0,915.14,,,746246.6,1494.42,27250.0,...,3092.36,,,779.84,491.52,11439.74,,,,2026.99
2014-12-29,2099.52,5341.0,20951.99,5400.0,915.14,,,740299.9,1464.13,27000.0,...,3092.36,,,755.84,476.62,11173.7,,,,2013.91
2014-12-30,2060.64,5341.0,20855.0,5400.0,915.14,,,746246.6,1474.23,27125.0,...,3107.9,,,767.84,476.62,11173.7,,,,2026.99
2014-12-31,2060.64,5341.0,20855.0,5400.0,915.14,,,746246.6,1474.23,27125.0,...,3107.9,,,767.84,476.62,11173.7,,,,2026.99


In [16]:
dat.save()

Saved. MarketValue


# VA

In [17]:
dat = DatastreamTable(fname='8_VA.xlsm', key_replace=' - TURNOVER BY VALUE', data_col='VA', 
                       table_name='TurnoverByValue')
dat.df.tail()

Unnamed: 0_level_0,7UP,A,AAV,ABPIF,ACC,ACE,ACG,ADVANC,AEC,AEONTS,...,WHART,WHAUP,WICE,WIIK,WIN,WORK,WP,WPH,ZEN,ZMICO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-12-25,20139.5,132.9,105279.0,382.8,,,,441591.0,9628.6,10445.4,...,414.8,,,57782.4,10851.1,12347.5,,,,902.6
2014-12-26,38899.6,54.5,147574.0,149.4,,,,611976.0,8022.4,1785.6,...,5805.1,,,50541.8,3492.0,19288.1,,,,435.3
2014-12-29,29306.8,71.9,296742.0,148.8,,,,570719.0,2110.8,2135.6,...,1024.8,,,8295.3,2050.3,1975.3,,,,673.6
2014-12-30,6423.7,81.8,140091.0,55.1,,,,575501.0,4006.6,2239.9,...,386.3,,,4835.8,1581.0,3981.8,,,,1338.0
2014-12-31,,,,,,,,,,,...,,,,,,,,,,


In [18]:
dat.save()

Saved. TurnoverByValue


# VO

In [19]:
dat = DatastreamTable(fname='9_VO.xlsm', key_replace=' - TURNOVER BY VOLUME', data_col='VO', 
                       table_name='TurnoverByVolume')
dat.df.tail()

Unnamed: 0_level_0,7UP,A,AAV,ABPIF,ACC,ACE,ACG,ADVANC,AEC,AEONTS,...,WHART,WHAUP,WICE,WIIK,WIN,WORK,WP,WPH,ZEN,ZMICO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-12-25,6309.1,24.3,23459.2,96.2,,,,1774.2,11830.9,95.3,...,44.2,,,24971.8,10649.8,416.6,,,,583.1
2014-12-26,12083.5,10.0,33451.5,37.4,,,,2453.1,10006.2,16.5,...,613.3,,,21615.4,3507.5,638.1,,,,280.6
2014-12-29,9612.5,13.2,69141.7,37.4,,,,2282.7,2648.4,19.8,...,108.0,,,3864.2,2089.8,65.5,,,,437.5
2014-12-30,2167.5,15.0,32556.0,13.7,,,,2303.0,5094.5,20.7,...,40.6,,,2252.7,1655.9,134.2,,,,865.7
2014-12-31,,,,,,,,,,,...,,,,,,,,,,


In [20]:
dat.save()

Saved. TurnoverByVolume


# UPO

In [21]:
dat = DatastreamTable(fname='10_UPO.xlsm', key_replace=' - UNADJ. PRICE OPEN', data_col='UPO', 
                       table_name='UnadjustedPriceOpen')
dat.df.tail()

Unnamed: 0_level_0,7UP,A,AAV,ABPIF,ACC,ACE,ACG,ADVANC,AEC,AEONTS,...,WHART,WHAUP,WICE,WIIK,WIN,WORK,WP,WPH,ZEN,ZMICO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-12-25,3.92,5.5,4.52,9.0,,,,251.0,1.48,112.0,...,9.9,,,2.68,1.0,42.0,,,,1.57
2014-12-26,3.88,5.45,4.46,9.0,,,,248.0,1.48,108.5,...,9.95,,,2.74,1.0,41.75,,,,1.56
2014-12-29,3.32,5.45,4.28,8.95,,,,252.0,1.5,108.0,...,9.85,,,2.6,1.0,43.0,,,,1.56
2014-12-30,3.2,5.45,4.34,9.05,,,,249.0,1.45,108.0,...,9.95,,,2.5,0.96,41.75,,,,1.55
2014-12-31,,,,,,,,,,,...,,,,,,,,,,


In [22]:
dat.save()

Saved. UnadjustedPriceOpen


# UP

In [23]:
dat = DatastreamTable(fname='11_UP.xlsm', key_replace=' - UNADJUSTED PRICE', data_col='UP', 
                       table_name='UnadjustedPrice')
dat.df.tail()

Unnamed: 0_level_0,7UP,A,AAV,ABPIF,ACC,ACE,ACG,ADVANC,AEC,AEONTS,...,WHART,WHAUP,WICE,WIIK,WIN,WORK,WP,WPH,ZEN,ZMICO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-12-25,3.9,5.45,4.46,9.05,4.86,,,248.0,1.47,109.5,...,9.85,,,2.66,1.0,42.0,,,,1.55
2014-12-26,3.92,5.45,4.42,8.95,4.86,,,251.0,1.48,109.0,...,9.95,,,2.6,0.99,43.0,,,,1.55
2014-12-29,3.24,5.45,4.32,9.0,4.86,,,249.0,1.45,108.0,...,9.95,,,2.52,0.96,42.0,,,,1.54
2014-12-30,3.18,5.45,4.3,9.0,4.86,,,251.0,1.46,108.5,...,10.0,,,2.56,0.96,42.0,,,,1.55
2014-12-31,3.18,5.45,4.3,9.0,4.86,,,251.0,1.46,108.5,...,10.0,,,2.56,0.96,42.0,,,,1.55


In [24]:
dat.save()

Saved. UnadjustedPrice


# UPH

In [25]:
dat = DatastreamTable(fname='12_UPH.xlsm', key_replace=' - UNADJ. PRICE HIGH', data_col='UPH', 
                       table_name='UnadjustedPriceHigh')
dat.df.tail()

Unnamed: 0_level_0,7UP,A,AAV,ABPIF,ACC,ACE,ACG,ADVANC,AEC,AEONTS,...,WHART,WHAUP,WICE,WIIK,WIN,WORK,WP,WPH,ZEN,ZMICO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-12-25,3.92,5.5,4.58,9.05,,,,252.0,1.54,112.0,...,9.9,,,2.78,1.05,42.5,,,,1.57
2014-12-26,3.98,5.45,4.46,9.05,,,,251.0,1.5,109.0,...,9.95,,,2.84,1.02,43.0,,,,1.56
2014-12-29,3.32,5.45,4.34,9.0,,,,253.0,1.5,108.0,...,10.0,,,2.6,1.0,43.5,,,,1.56
2014-12-30,3.24,5.45,4.36,9.05,,,,252.0,1.46,109.0,...,10.0,,,2.58,0.97,42.0,,,,1.56
2014-12-31,,,,,,,,,,,...,,,,,,,,,,


In [26]:
dat.save()

Saved. UnadjustedPriceHigh


In [27]:
# UPL

In [28]:
dat = DatastreamTable(fname='13_UPL.xlsm', key_replace=' - UNADJ. PRICE LOW', data_col='UPL', 
                       table_name='UnadjustedPriceLow')
dat.df.tail()

Unnamed: 0_level_0,7UP,A,AAV,ABPIF,ACC,ACE,ACG,ADVANC,AEC,AEONTS,...,WHART,WHAUP,WICE,WIIK,WIN,WORK,WP,WPH,ZEN,ZMICO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-12-25,3.82,5.45,4.44,8.95,,,,246.0,1.46,109.5,...,9.85,,,2.62,0.99,41.25,,,,1.54
2014-12-26,3.82,5.45,4.38,8.95,,,,246.0,1.43,106.5,...,9.9,,,2.6,0.98,41.5,,,,1.54
2014-12-29,3.18,5.45,4.24,8.95,,,,248.0,1.45,107.5,...,9.85,,,2.46,0.96,42.0,,,,1.53
2014-12-30,3.04,5.45,4.28,9.0,,,,247.0,1.42,108.0,...,9.95,,,2.44,0.95,41.5,,,,1.53
2014-12-31,,,,,,,,,,,...,,,,,,,,,,


In [29]:
dat.save()

Saved. UnadjustedPriceLow


# UVO

In [30]:
dat = DatastreamTable(fname='14_UVO.xlsm', key_replace=' - UNADJ. VOLUME', data_col='UVO', 
                       table_name='UnadjustedVolume')
dat.df.tail()

Unnamed: 0_level_0,7UP,A,AAV,ABPIF,ACC,ACE,ACG,ADVANC,AEC,AEONTS,...,WHART,WHAUP,WICE,WIIK,WIN,WORK,WP,WPH,ZEN,ZMICO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-12-25,5207.0,24.3,23459.2,42.7,,,,1774.2,6459.2,95.3,...,42.1,,,21404.3,10649.8,295.6,,,,583.1
2014-12-26,9972.8,10.0,33451.5,16.6,,,,2453.1,5463.0,16.5,...,584.8,,,18527.4,3507.5,452.8,,,,280.6
2014-12-29,9058.0,13.2,69141.7,16.6,,,,2282.7,1445.9,19.8,...,103.0,,,3312.2,2089.8,46.5,,,,437.5
2014-12-30,2042.5,15.0,32556.0,6.1,,,,2303.0,2781.4,20.7,...,38.7,,,1930.9,1655.9,95.2,,,,865.7
2014-12-31,,,,,,,,,,,...,,,,,,,,,,


In [31]:
dat.save()

Saved. UnadjustedVolume


# PTBV

In [32]:
dat = DatastreamTable(fname='15_PTBV.xlsm', key_replace=' - PRICE TO BOOK VAL', data_col='PTBV', 
                       table_name='PriceToBookValue')
dat.df.tail()

Unnamed: 0_level_0,7UP,A,AAV,ABPIF,ACC,ACE,ACG,ADVANC,AEC,AEONTS,...,WHABT,WHART,WICE,WIIK,WIN,WORK,WP,WPH,ZEN,ZMICO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-12-25,1866.57,1.86,1.11,0.92,1.18,,,15.77,1.34,2.62,...,,,,1.84,3.07,6.59,,,,0.84
2014-12-26,1876.14,1.86,1.1,0.91,1.18,,,15.96,1.35,2.61,...,,,,1.8,3.04,6.75,,,,0.84
2014-12-29,1770.49,1.86,1.07,0.91,1.18,,,15.83,1.32,2.59,...,,,,1.74,2.95,6.59,,,,0.84
2014-12-30,1737.7,1.86,1.07,0.91,1.18,,,15.96,1.33,2.6,...,,,,1.77,2.95,6.59,,,,0.84
2014-12-31,1737.7,1.86,1.07,0.91,1.18,,,15.96,1.33,2.6,...,,,,1.77,2.95,6.59,,,,0.84


In [33]:
dat.save()

Saved. PriceToBookValue


# PE

In [34]:
dat = DatastreamTable(fname='16_PE.xlsm', key_replace=' - PER', data_col='PE', 
                       table_name='PriceToEarningRatio')
dat.df.tail()

ValueError: Length mismatch: Expected axis has 522 elements, new values have 261 elements

# PI

In [35]:
dat = DatastreamTable(fname='17_PI.xlsm', key_replace=' - PRICE INDEX', data_col='PI', 
                       table_name='PriceIndex')
dat.df.tail()

Unnamed: 0_level_0,7UP,A,AAV,ABPIF,ACC,ACE,ACG,ADVANC,AEC,AEONTS,...,WHART,WHAUP,WICE,WIIK,WIN,WORK,WP,WPH,ZEN,ZMICO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-12-25,110.5,140.8,120.5,97.5,27.6,,,5829.8,130.7,862.2,...,99.5,,,103.3,0.4,305.2,,,,10.9
2014-12-26,111.1,140.8,119.5,96.4,27.6,,,5900.3,131.6,858.3,...,100.5,,,101.0,0.4,312.5,,,,10.9
2014-12-29,104.8,140.8,116.8,97.0,27.6,,,5853.3,128.9,850.4,...,100.5,,,97.9,0.4,305.2,,,,10.8
2014-12-30,102.9,140.8,116.2,97.0,27.6,,,5900.3,129.8,854.3,...,101.0,,,99.4,0.4,305.2,,,,10.9
2014-12-31,102.9,140.8,116.2,97.0,27.6,,,5900.3,129.8,854.3,...,101.0,,,99.4,0.4,305.2,,,,10.9


In [36]:
dat.save()

Saved. PriceIndex


# RI

In [37]:
dat = DatastreamTable(fname='18_RI.xlsm', key_replace=' - TOT RETURN IND', data_col='PI', 
                       table_name='TotalReturnIndex')
dat.df.tail()

Unnamed: 0_level_0,7UP,A,AAV,ABPIF,ACC,ACE,ACG,ADVANC,AEC,AEONTS,...,WHART,WHAUP,WICE,WIIK,WIN,WORK,WP,WPH,ZEN,ZMICO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-12-25,161.38,146.75,120.54,109.61,103.25,,,16144.92,147.52,1502.58,...,99.49,,,209.78,0.41,516.65,,,,33.39
2014-12-26,162.2,146.75,119.46,108.4,103.25,,,16340.22,148.53,1495.72,...,100.51,,,205.05,0.41,528.95,,,,33.39
2014-12-29,153.07,146.75,116.76,109.0,103.25,,,16210.02,145.52,1482.0,...,100.51,,,198.74,0.4,516.65,,,,33.18
2014-12-30,150.23,146.75,116.22,109.0,103.25,,,16340.22,146.52,1488.86,...,101.01,,,201.89,0.4,516.65,,,,33.39
2014-12-31,150.23,146.75,116.22,109.0,103.25,,,16340.22,146.52,1488.86,...,101.01,,,201.89,0.4,516.65,,,,33.39


In [38]:
dat.save()

Saved. TotalReturnIndex


# MTBV

In [39]:
dat = DatastreamTable(fname='19_MTBV.xlsm', key_replace=' - MRKT VALUE TO BOOK', data_col='MTBV', 
                       table_name='MarketValueToBook')
dat.df.tail()

Unnamed: 0_level_0,7UP,A,AAV,ABPIF,ACC,ACE,ACG,ADVANC,AEC,AEONTS,...,WHABT,WHART,WICE,WIIK,WIN,WORK,WP,WPH,ZEN,ZMICO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-12-25,2133.41,1.86,1.11,0.92,1.15,,,15.77,1.34,2.62,...,,,,1.84,3.07,6.59,,,,0.84
2014-12-26,2144.35,1.86,1.1,0.91,1.15,,,15.96,1.35,2.61,...,,,,1.8,3.04,6.75,,,,0.84
2014-12-29,2126.84,1.86,1.07,0.91,1.15,,,15.84,1.32,2.59,...,,,,1.74,2.95,6.59,,,,0.84
2014-12-30,2087.46,1.86,1.07,0.91,1.15,,,15.96,1.33,2.6,...,,,,1.77,2.95,6.59,,,,0.84
2014-12-31,2087.46,1.86,1.07,0.91,1.15,,,15.96,1.33,2.6,...,,,,1.77,2.95,6.59,,,,0.84


In [40]:
dat.save()

Saved. MarketValueToBook


# NOSH

In [42]:
dat = DatastreamTable(fname='20_NOSH.xlsm', key_replace=' - NUMBER OF SHARES', data_col='NOSH', 
                       table_name='NumberOfShares')
dat.df.tail()

Unnamed: 0_level_0,7UP,A,AAV,ABPIF,ACC,ACE,ACG,ADVANC,AEC,AEONTS,...,WHART,WHAUP,WICE,WIIK,WIN,WORK,WP,WPH,ZEN,ZMICO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-12-25,540000.0,980000.0,4849999.0,600000.0,188300.0,,,2973094.0,1009744.0,250000.0,...,310790,,,299938.0,496484.0,266040.0,,,,1307736.0
2014-12-26,540000.0,980000.0,4849999.0,600000.0,188300.0,,,2973094.0,1009744.0,250000.0,...,310790,,,299938.0,496484.0,266040.0,,,,1307736.0
2014-12-29,648000.0,980000.0,4849999.0,600000.0,188300.0,,,2973094.0,1009744.0,250000.0,...,310790,,,299938.0,496484.0,266040.0,,,,1307736.0
2014-12-30,648000.0,980000.0,4849999.0,600000.0,188300.0,,,2973094.0,1009744.0,250000.0,...,310790,,,299938.0,496484.0,266040.0,,,,1307736.0
2014-12-31,648000.0,980000.0,4849999.0,600000.0,188300.0,,,2973094.0,1009744.0,250000.0,...,310790,,,299938.0,496484.0,266040.0,,,,1307736.0


In [43]:
dat.save()

Saved. NumberOfShares


# DWTA

In [45]:
dat = DatastreamTable(fname='21_DWTA.xlsm', key_replace=' - TOTAL ASSETS', data_col='DWTA', 
                       table_name='TotalAssets')
dat.df.tail()

EARTH TECH ENVIRONMENT
KERRY EXPRESS (THAILAND)
MICRO LEASING
NEXT CAPITAL
NR INSTANT PRODUCE
RIGHT TUNNELLING
SIAMESE ASSET
SUCCESSMORE BEING
SIAMRAJATHANEE
WELL GRADED ENGINEERING


Unnamed: 0_level_0,7UP,A,AAV,ABPIF,ACC,ACE,ACG,ADVANC,AEC,AEONTS,...,WHABT,WHART,WICE,WIIK,WIN,WORK,WP,WPH,ZEN,ZMICO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-12-25,1059741.0,11123819.0,48288505.0,5808714.0,737013.0,,,118771381.0,3283644.0,64260681.0,...,,,291987.0,942729.0,407540.0,5075291.0,7859722.0,554108.0,,2475131.0
2014-12-26,1059741.0,11123819.0,48288505.0,5808714.0,737013.0,,,118771381.0,3283644.0,64260681.0,...,,,291987.0,942729.0,407540.0,5075291.0,7859722.0,554108.0,,2475131.0
2014-12-29,1059741.0,11123819.0,48288505.0,5808714.0,737013.0,,,118771381.0,3283644.0,64260681.0,...,,,291987.0,942729.0,407540.0,5075291.0,7859722.0,554108.0,,2475131.0
2014-12-30,1059741.0,11123819.0,48288505.0,5808714.0,737013.0,,,118771381.0,3283644.0,64260681.0,...,,,291987.0,942729.0,407540.0,5075291.0,7859722.0,554108.0,,2475131.0
2014-12-31,922593.0,11948544.0,49501801.0,5943741.0,737013.0,,,124908707.0,3318383.0,64260681.0,...,,,357981.0,911939.0,402480.0,5207929.0,5772582.0,485069.0,1036620.0,2577531.0


In [46]:
dat.save()

KeyError: 'KERRY EXPRESS (THAILAND'


# Save Ticker Lookup Table

In [None]:
companies = []
for k,v in rename.items():
    companies.append({'name':k, 'ticker':v})

pd.DataFrame(data=companies).to_sql('Company', if_exists='replace', con=engine)

* ======

# PER

In [None]:
def get_PE_series():
    fname = 'SET_PE_2014-2020.xlsm'
    wb = xlrd.open_workbook(fname)
    series = {}
    for i in range(2,8): # Note the sheet index 2014-2019
        s = wb.sheets()[i]

        # Date index
        s1 = pd.Series(s.col(0)[4:]) # observation starts at row 4
        s1 = s1.apply(lambda x: datetime(*xlrd.xldate_as_tuple(x.value, wb.datemode)))

        for j in range(1, s.ncols):
            c = s.col(j)
            print("{}".format(c[3].value))
            name = c[3].value
            # df = pd.DataFrame()
            if not "#ERROR" in name:
                # Convert spreadsheet market cap value to python
                s2 = pd.Series(c[4:]) # observation starts at row 4
                s2 = s2.apply(lambda x: x.value)
                df = pd.concat([s1, s2], axis=1)

                if not name in series:
                    series[name] = []
                series[name].append(df)
    return series

def rekey_and_stack(sin):
    sout = {}
    keys = sin.keys()
    # Change key
    for k in keys:
        if k == 'PER':
            # Add fix for PERM. It looks like the loader table messed up column name too.
            sout['PERM'] = sin[k]
        else:
            newkey = k[:-len(' - PER')]
            print(k," ==> ",newkey) # Debug above erro.
            symbol = lookup.loc[lookup['name']==newkey].symbol.values[0]
            sout[symbol] = sin[k]
    # Merge years
    for s in sout.keys():
        stock = pd.concat(sout[s], axis=0)
        stock.columns = ['Date', 'PE']
        stock = stock.set_index('Date') # Set date index
        sout[s] = stock 
    return sout

pe = get_PE_series()
per = rekey_and_stack(pe)

In [None]:
per_data = {}
for t in tickers:
    if t in per:
        # print("Has price data for %s" % t)
        per_data[t] = per[t]
    else:
        print("No PE data for %s" % t)

# DIPE

In [None]:
def get_DIPE_series():
    fname = 'SET_DIPE_2014-2020.xlsm'
    wb = xlrd.open_workbook(fname)
    series = {}
    for i in range(2,8): # Note the sheet index 2014-2019
        s = wb.sheets()[i]

        # Date index
        s1 = pd.Series(s.col(0)[4:]) # observation starts at row 4
        s1 = s1.apply(lambda x: datetime(*xlrd.xldate_as_tuple(x.value, wb.datemode)))

        for j in range(1, s.ncols):
            c = s.col(j)
            print("{}".format(c[3].value))
            name = c[3].value
            # df = pd.DataFrame()
            if not "#ERROR" in name:
                # Convert spreadsheet market cap value to python
                s2 = pd.Series(c[4:]) # observation starts at row 4
                s2 = s2.apply(lambda x: x.value)
                df = pd.concat([s1, s2], axis=1)

                if not name in series:
                    series[name] = []
                series[name].append(df)
    return series

def rekey_and_stack(sin):
    sout = {}
    keys = sin.keys()
    # Change key
    for k in keys:
        newkey = k.replace(' - PE RATIO 12M FWD','')
        symbol = lookup.loc[lookup['name']==newkey].symbol.values[0]
        sout[symbol] = sin[k]
    # Merge years
    for s in sout.keys():
        stock = pd.concat(sout[s], axis=0)
        stock.columns = ['Date', 'DIPE']
        stock = stock.set_index('Date') # Set date index
        sout[s] = stock 
    return sout

dipe = get_DIPE_series()
dipe_p = rekey_and_stack(dipe)

In [None]:
dipe_data = {}
for t in tickers:
    if t in dipe_p:
        # print("Has price data for %s" % t)
        dipe_data[t] = dipe_p[t]
    else:
        print("No DIPE data for %s" % t)

In [None]:
price['AAV'].plot()
price['AAV']

In [None]:
turnover_va['AAV']

In [None]:
(turnover_va['AAV']['VA'] == 'NA').sum()

In [None]:
(price['AAV']['Price']=='NA').sum()

In [None]:

# This will cause error in get_df because value will be assigned to the price data frame.
"""
df = price['AAV']
df['VA'] = turnover_va['AAV']
df['VO'] = turnover_vo['AAV']
df['MV'] = market_val['AAV']
df['PE'] = per_data['AAV']
df['DY'] = dy_data['AAV']
df['DIPE'] = dipe_data['AAV']
df = df.sort_index()
df
"""

In [None]:
# Check data loaded from Eikon
# df[['Price', 'DY']].plot()

In [None]:
df

# Process all 150 stocks

In [None]:
os.chdir(os.environ['DATA_HOME'])
def get_df_old(t):
    df = pd.concat([price[t], 
                    turnover_va[t], 
                    turnover_vo[t], 
                    market_val[t], 
                    dy_data[t]], axis=1)
    return df[['Price', 'MV', 'VA', 'VO', 'DY']]


def get_df(t):
    df_price = price[t]
    df_va = turnover_va[t]
    df_vo = turnover_vo[t]
    df_mv = market_val[t]
    df_dy = dy_data[t]

    # pd.concat([price['BIG'], turnover_va['BIG']], axis=1)
    df2 =  df_price.merge(df_va, how="left", left_on=df_price.index, right_on=df_va.index)
    df2 = df2.set_index('key_0')
    df2 = df2.sort_index()
    df2 = df2.merge(df_vo, how="left", left_on=df2.index, right_on=df_vo.index)
    df2 = df2.set_index('key_0')
    df2 = df2.sort_index()
    df2 = df2.merge(df_mv, how="left", left_on=df2.index, right_on=df_mv.index)
    df2 = df2.set_index('key_0')
    df2 = df2.sort_index()
    df2 = df2.merge(df_dy, how="left", left_on=df2.index, right_on=df_dy.index)
    df2 = df2.set_index('key_0')
    df2 = df2.sort_index()
    df2.index.rename('Date', inplace=True)
    return df2[['Price', 'MV', 'VA', 'VO', 'DY']]

stocks_with_data = []

for t in tickers:
    print("Preparing %s" % t)
    try:
        df = get_df(t)
        df = df.sort_index()
        df.to_csv('set100.data/%s.csv' % t)
        print("wrote %s." % t)
        stocks_with_data.append(t)
    except KeyError:
        print("No data %s" % t)

In [None]:
# Check individual stock: BLAND

pd.concat([price['BLAND'], turnover_va['BLAND'], turnover_vo['BLAND'], market_val['BLAND'], dy_data['BLAND']], axis=1)
# get_df('BLAND')

In [None]:
pd.concat([price['BIG'], market_val['BIG'], dy_data['BIG']], axis=1)
# get_df('BIG') 

df_price = price['BIG']
df_va = turnover_va['BIG']
df_vo = turnover_vo['BIG']
df_mv = market_val['BIG']
df_dy = dy_data['BIG']

# pd.concat([price['BIG'], turnover_va['BIG']], axis=1)
df2 =  df_price.merge(df_va, how="left", left_on=df_price.index, right_on=df_va.index)
df2 = df2.set_index('key_0')
df2 = df2.sort_index()
df2 = df2.merge(df_vo, how="left", left_on=df2.index, right_on=df_vo.index)
df2 = df2.set_index('key_0')
df2 = df2.sort_index()
df2 = df2.merge(df_mv, how="left", left_on=df2.index, right_on=df_mv.index)
df2 = df2.set_index('key_0')
df2 = df2.sort_index()
df2 = df2.merge(df_dy, how="left", left_on=df2.index, right_on=df_dy.index)
df2 = df2.set_index('key_0')
df2 = df2.sort_index()
df2.index.rename('Date', inplace=True)

df2

In [None]:
# U missing PER data
# UV missing DIPE data
# WHAUP missing PER data

# get_df('WHAUP') 

In [None]:
# pd.concat([price['CBG'], turnover_va['CBG'], turnover_vo['CBG'], market_val['CBG'], dy_data['CBG']], axis=1)

In [None]:
# pd.concat([price['CBG'], turnover_va['CBG'], turnover_vo['CBG'], 
#            market_val['CBG'], dy_data['CBG'], per_data['CBG'], 
#            dipe_data['CBG']], axis=1, sort=False)

# df2 = dipe_data['BLAND']
# df['DIPE'] = dipe_data['BLAND'].sort_index()

# df_out = pd.merge(df1,df2, left_on=df1.index, right_on=df2.index, how='left')
# df_out


In [None]:
price.keys()

# per_data['CBG'].plot()

In [None]:
df1 = price['BLAND']
df2 = dipe_data['BLAND']
df_tmp = pd.merge(df1,df2, left_on=df1.index, right_on=df2.index, how='left')
df_tmp

In [None]:


# df_tmp.index = df1.index
df3 = turnover_va['BLAND']
df4 = turnover_vo['BLAND']
df_tmp2 = pd.merge(df3,df4, left_on=df3.index, right_on=df4.index, how='left')
# df_tmp2.index = df3.index

df5 = market_val['BLAND']
df6 = per_data['BLAND']
df_tmp3 = pd.merge(df5,df6, left_on=df5.index, right_on=df6.index, how='left')
# df_tmp3.index = df5.index

df_tmp4 = pd.merge(df_tmp, df_tmp2, left_on=df_tmp.index, right_on=df_tmp2.index, how='left')


df_tmp5 = pd.merge(df_tmp4, df_tmp3, left_on=df_tmp4.index, right_on=df_tmp3.index, how='left')



df_tmp5

In [None]:
price['BIG']

In [None]:
turnover_va['BIG'].sort_index()

In [None]:
market_val['MBK']

In [None]:
sector_lookup = os.path.join(os.environ['EXPERIMENT_HOME'],"""1.0 Data Acquisition/stock_ticker.csv""")
sector_lookup = pd.read_csv(sector_lookup)

df_lookup = pd.DataFrame()

for t in stocks_with_data:
    print(t)
    s = t
    if t == 'GOLD':
        t = 'FPT'
    # symbol = lookup.loc[lookup['name']==newkey].symbol.values[0]
    name = sector_lookup.loc[sector_lookup['symbol']==t].name.values[0]
    sector = sector_lookup.loc[sector_lookup['symbol']==t].sector.values[0]
    industry = sector_lookup.loc[sector_lookup['symbol']==t].industry.values[0]
    print(t,sector,industry, name)
    dat = {'ticker': t, 'company_name': name, 'industry': industry, 'sector': sector, 'symbol': s}
    df_lookup = df_lookup.append(dat,ignore_index=True)

In [None]:
df_lookup = df_lookup.set_index('ticker')

In [None]:
df_lookup = df_lookup.sort_index()

In [None]:
df_lookup

In [None]:
outfile = os.path.join(os.environ['DATA_HOME'],"set100.data","_directory.csv")
df_lookup.to_csv(outfile)

In [None]:
sorted(stocks_with_data)
len(stocks_with_data)

In [None]:
get_df('AAV')