SET100 Data: Feature Set 01

See Check2015-2019 for how to obtain the 129 stocks with complete observations.

In [2]:
import sys, os, os.path
# Configure the workspace paths once per process. When EXPERIMENT_HOME is not
# yet exported, derive every path from the fixed ML workspace location, export
# them for other code to read, extend sys.path and move into the experiment
# workspace. Otherwise reuse the values that are already exported.
if 'EXPERIMENT_HOME' not in os.environ:
    ML_HOME = os.path.abspath(os.path.join("/home/jovyan", "ml_home")) # ML workspace
    EXPERIMENT_HOME = os.path.abspath(os.path.join(ML_HOME, "..")) # Experiment workspace
    DATA_HOME = os.path.abspath(os.path.join(EXPERIMENT_HOME, "dataset")) # Dataset location
    os.environ["EXPERIMENT_HOME"] = EXPERIMENT_HOME
    os.environ["DATA_HOME"] = DATA_HOME
    sys.path.insert(0, EXPERIMENT_HOME)
    sys.path.insert(0, ML_HOME) # Add to path so can load our library
    os.chdir(EXPERIMENT_HOME) # Change working directory to experiment workspace
else:
    # The environment was configured earlier (previous run of this cell or a
    # parent process). Read the paths back so DATA_HOME is always defined for
    # the print below instead of raising NameError on a fresh kernel.
    EXPERIMENT_HOME = os.environ["EXPERIMENT_HOME"]
    DATA_HOME = os.environ.get(
        "DATA_HOME", os.path.abspath(os.path.join(EXPERIMENT_HOME, "dataset"))
    )
print("Experiment Home: ", os.path.abspath(os.curdir), "; Data Home:", DATA_HOME)

Experiment Home:  /home/jovyan ; Data Home: /home/jovyan/dataset


In [3]:
from datetime import datetime
import pandas as pd
import pandas_datareader as pdr
import datetime as dt
import xlrd
import matplotlib.pyplot as plt
%matplotlib inline


In [4]:
# Locate the SET100 data folder from the DATA_HOME environment variable (the
# path-configuration cell exports it) rather than repeating an absolute path;
# the previous hard-coded location is kept as the fallback.
SET100_DIR = os.path.join(os.environ.get("DATA_HOME", "/home/jovyan/dataset"), "set100.data")
# Later cells read the per-ticker CSV files by relative name, so the working
# directory has to be the data folder.
os.chdir(SET100_DIR)
# Directory of stocks; the first CSV column becomes the index.
stocks = pd.read_csv('_directory.csv', index_col=0)
print("Stocks count:", len(stocks))
stocks.tail()

Stocks count: 150


Unnamed: 0_level_0,company_name,industry,sector,symbol
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
VIBHA,VIBHAVADI MEDICAL CENTER PUBLIC COMPANY LIMITED,Services,Health Care Services,VIBHA
VNG,VANACHAI GROUP PUBLIC COMPANY LIMITED,Property & Construction,Construction Materials,VNG
WHA,WHA CORPORATION PUBLIC COMPANY LIMITED,Property & Construction,Property Development,WHA
WHAUP,WHA UTILITIES AND POWER PUBLIC COMPANY LIMITED,Resources,Energy & Utilities,WHAUP
WORK,WORKPOINT ENTERTAINMENT PUBLIC COMPANY LIMITED,Services,Media & Publishing,WORK


In [5]:
# Build `series`: one DataFrame per ticker, restricted to the analysis period
# and tagged with its ticker, sector and industry.
period_start='2014-01-01'
period_end='2014-12-31'

# Resolve sector/industry through one lookup table instead of filtering the
# whole `stocks` frame twice per ticker. Keeping the first row per symbol
# matches the previous `.values[0]` behaviour.
stock_meta = stocks.drop_duplicates(subset='symbol').set_index('symbol')

series = {}
for s in stocks.symbol.values:
    # Each ticker's CSV is read relative to the current working directory.
    df = pd.read_csv("%s.csv" % s, index_col=0)

    # Keep the index values as a regular column and tag every row with the ticker
    df['Date'] = df.index
    df['Ticker'] = s

    # Add sector column
    sector = stock_meta.at[s, 'sector']
    df['Sector'] = sector

    # Add industry column
    industry = stock_meta.at[s, 'industry']
    df['Industry'] = industry

    # Label-based, end-inclusive slice on the index.
    # NOTE(review): the index is not parsed as dates here, so this assumes the
    # labels are ascending ISO `YYYY-MM-DD` strings - confirm against the CSVs.
    # A ticker with no rows in the period ends up with an empty frame.
    # `.copy()` stores an independent frame rather than a slice of `df`.
    series[s] = df.loc[period_start:period_end].copy()

In [6]:
# Select single stock series for analysis
ticker = "SCB"
# Work on a copy: assigning a new index to `series[ticker]` directly would
# change that one entry of `series` in place, leaving it with a datetime index
# while every other entry keeps the labels it was loaded with.
df = series[ticker].copy()
df.index = pd.to_datetime(df.index)
df

Unnamed: 0_level_0,Price,MV,VA,VO,DY,Date,Ticker,Sector,Industry
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2014-01-01,143.5,487040.3,,,3.48,2014-01-01,SCB,Banking,Financials
2014-01-02,131.5,446312.2,2001558.0,14727.9,3.80,2014-01-02,SCB,Banking,Financials
2014-01-03,132.5,449706.3,2051395.0,15411.3,3.77,2014-01-03,SCB,Banking,Financials
2014-01-06,140.0,475161.3,1740385.0,12798.9,3.57,2014-01-06,SCB,Banking,Financials
2014-01-07,144.0,488737.4,1322406.0,9353.2,3.47,2014-01-07,SCB,Banking,Financials
...,...,...,...,...,...,...,...,...,...
2014-12-25,183.0,621158.6,460363.9,2521.7,2.87,2014-12-25,SCB,Banking,Financials
2014-12-26,184.5,626250.0,450219.8,2458.3,2.85,2014-12-26,SCB,Banking,Financials
2014-12-29,183.0,621158.6,485902.8,2633.2,2.87,2014-12-29,SCB,Banking,Financials
2014-12-30,182.0,617764.2,547944.4,2993.7,2.88,2014-12-30,SCB,Banking,Financials


In [7]:
# Combine all per-ticker frames into one long table. The original row labels
# are discarded in favour of a fresh 0..n-1 index.
# https://stackoverflow.com/questions/29351840/stack-two-pandas-data-frames
rows = [frame for frame in series.values()]
df = pd.concat(rows, ignore_index=True)
df

Unnamed: 0,Price,MV,VA,VO,DY,Date,Ticker,Sector,Industry
0,3.66,17751.00,,,0.0,2014-01-01,AAV,Transportation & Logistics,Services
1,3.44,16683.99,65373.3,19045.1,0.0,2014-01-02,AAV,Transportation & Logistics,Services
2,3.42,16586.99,34585.3,10163.8,0.0,2014-01-03,AAV,Transportation & Logistics,Services
3,3.54,17168.99,41467.2,11936.8,0.0,2014-01-06,AAV,Transportation & Logistics,Services
4,3.72,18041.99,92143.9,25572.0,0.0,2014-01-07,AAV,Transportation & Logistics,Services
...,...,...,...,...,...,...,...,...,...
33664,29.80,11173.70,12347.5,416.6,0.0,2014-12-25,WORK,Media & Publishing,Services
33665,30.51,11439.74,19288.1,638.1,0.0,2014-12-26,WORK,Media & Publishing,Services
33666,29.80,11173.70,1975.3,65.5,0.0,2014-12-29,WORK,Media & Publishing,Services
33667,29.80,11173.70,3981.8,134.2,0.0,2014-12-30,WORK,Media & Publishing,Services


In [8]:
# Number of ticker entries held in `series`.
len(series)

150

In [9]:
# Wide price table: one row per Date, one column per
# (Industry, Sector, Ticker) combination, cells filled from 'Price'.
ptable = pd.pivot_table(
    df,
    index='Date',
    columns=['Industry', 'Sector', 'Ticker'],
    values='Price',
)
ptable

Industry,Agro & Food Industry,Agro & Food Industry,Agro & Food Industry,Agro & Food Industry,Agro & Food Industry,Agro & Food Industry,Agro & Food Industry,Agro & Food Industry,Agro & Food Industry,Agro & Food Industry,...,Technology,Technology,Technology,Technology,Technology,Technology,Technology,Technology,Technology,Technology
Sector,Agribusiness,Agribusiness,Food & Beverage,Food & Beverage,Food & Beverage,Food & Beverage,Food & Beverage,Food & Beverage,Food & Beverage,Food & Beverage,...,Electronic Components,Information & Communication Technology,Information & Communication Technology,Information & Communication Technology,Information & Communication Technology,Information & Communication Technology,Information & Communication Technology,Information & Communication Technology,Information & Communication Technology,Information & Communication Technology
Ticker,GFPT,STA,CBG,CPF,ICHI,KTIS,M,MALEE,MINT,SAPPE,...,SVI,ADVANC,DTAC,INTUCH,JAS,JMART,SAMART,SAMTEL,THCOM,TRUE
Date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2014-01-01,12.8,12.53,,32.00,,,51.25,15.00,18.54,,...,4.14,199.5,97.00,67.75,6.04,8.61,14.80,12.9,40.50,6.51
2014-01-02,12.2,11.95,,30.50,,,48.25,14.62,17.01,,...,4.00,187.0,93.00,64.00,5.59,8.18,14.00,12.1,38.00,5.90
2014-01-03,12.0,11.56,,30.50,,,48.75,15.37,16.75,,...,4.10,196.0,92.50,65.50,5.41,7.92,14.10,11.7,36.75,5.82
2014-01-06,11.9,11.47,,29.25,,,48.00,15.12,18.00,,...,3.94,201.0,95.25,66.00,5.72,7.92,14.90,12.0,36.25,5.99
2014-01-07,12.2,11.95,,30.50,,,48.75,16.12,18.80,,...,4.06,210.0,96.25,67.75,5.90,7.80,14.80,12.4,36.75,6.17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2014-12-25,18.3,12.33,32.75,27.25,20.0,11.1,57.50,13.75,30.00,34.75,...,4.06,248.0,97.50,78.25,6.58,5.46,39.00,21.6,35.50,11.10
2014-12-26,18.1,12.33,33.25,27.75,20.1,11.5,57.75,13.87,29.55,33.75,...,4.08,251.0,97.00,79.00,6.67,5.46,38.25,21.8,35.25,10.80
2014-12-29,18.1,12.33,33.00,27.50,20.0,11.7,56.25,13.75,28.65,33.50,...,4.00,249.0,96.75,78.25,6.44,5.35,39.00,22.2,34.75,10.90
2014-12-30,17.8,12.33,33.50,27.25,20.0,11.6,58.50,13.75,29.10,33.00,...,4.00,251.0,96.50,78.75,6.62,5.41,39.25,22.7,34.75,10.90


In [10]:
# Re-display the number of entries in `series`, for comparison with the
# ticker columns of the pivot table.
len(series)

150

In [20]:
# Each pivot column key is an (Industry, Sector, Ticker) tuple; keep the ticker.
cols = [symbol for _industry, _sector, symbol in ptable.columns]
# Build both sides of the comparison as sets to determine what is missing
cols_set = set(cols)
series_set = set(series)

In [23]:
# Tickers present as keys of `series` that have no column in the pivot table

series_set - cols_set

{'AWC',
 'BCPG',
 'BEM',
 'BGRIM',
 'BPP',
 'COM7',
 'GGC',
 'GPSC',
 'GULF',
 'JWD',
 'ORI',
 'OSP',
 'PLANB',
 'PLAT',
 'PRM',
 'SCN',
 'SPRC',
 'TKN',
 'TOA',
 'TPIPP',
 'WHAUP'}

In [24]:
# TOA is one of the tickers without a pivot column; display the frame stored
# for it to see what was loaded for the selected period.
series['TOA']

Unnamed: 0_level_0,Price,MV,VA,VO,DY,Date,Ticker,Sector,Industry
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


In [25]:
# Same inspection for SPRC, another ticker without a pivot column.
series['SPRC']

Unnamed: 0_level_0,Price,MV,VA,VO,DY,Date,Ticker,Sector,Industry
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
