In [27]:
import pandas as pd
import numpy as np

print("Preprocessing and cleaning data...\n")
##Read the Excel spreadsheet of data; header=None means no row of the sheet
##is used as column labels, so the names are assigned explicitly below
data = pd.read_excel('cleaned_data.xls', header=None)

##Declare the column names
data.columns = ['date', 's_and_p_comp', 'dividend', 'earnings',
                'CPI', 'fraction_date', 'long_interest_rate', 'real_price',
                'real_dividend', 'real_total_return_price','real_earnings',
                'real_scaled_earnings', 'CAPE', 'TR_CAPE', 'excess_CAPE', 'montly_bond_return',
                'real_bond_return','10_year_stock_return', '10_year_bond_return',
                '10_year_excess_return']

##Turn literal 'NA' strings into real missing values so dropna() below can find them
##(np.nan is used because the capitalised alias np.NaN was removed in NumPy 2.0)
data = data.replace('NA', np.nan)

##Drop "10 year" columns so there are no rows with missing data after 2011
print("Dropping unfinished columns...")
data = data.drop(columns=['10_year_stock_return',
                          '10_year_bond_return',
                          '10_year_excess_return'])

##Drop rows with missing data, reporting the row count before and after
print("Dropping unfinished rows...")
print('\n\nNumber of rows in original data = %d' % (data.shape[0]))
data = data.dropna()
print('Number of rows after discarding missing values = %d\n' % (data.shape[0]))

##Report the final size of the cleaned data (rows = instances, columns = attributes)
print('Number of instances = %d' % (data.shape[0]))
print('Number of attributes = %d\n' % (data.shape[1]))

##Check to make sure there are no missing values in each column
print('Number of missing values:')
for col in data.columns:
    print('\t%s: %d' % (col,data[col].isna().sum()))

print("\n\nPreprocessing done.")

Preprocessing and cleaning data...

Dropping unfinished columns...
Dropping unfinished rows...


Number of rows in original data = 1810
Number of rows after discarding missing values = 1687

Number of instances = 1687
Number of attributes = 17

Number of missing values:
	date: 0
	s_and_p_comp: 0
	dividend: 0
	earnings: 0
	CPI: 0
	fraction_date: 0
	long_interest_rate: 0
	real_price: 0
	real_dividend: 0
	real_total_return_price: 0
	real_earnings: 0
	real_scaled_earnings: 0
	CAPE: 0
	TR_CAPE: 0
	excess_CAPE: 0
	montly_bond_return: 0
	real_bond_return: 0


Preprocessing done.


## Sampling
For our data set, random sampling of individual rows does not make sense, since we are looking at financial change over *time*. Instead, our sample is a block of 10 consecutive rows starting at a randomly chosen position.

In [26]:
import random

##Take a block of consecutive rows starting at a random position
sample_size = 10
##random.randint includes BOTH endpoints, so the largest allowed start is capped
##at (rows - sample_size); otherwise the window could run past the end of the
##frame and return fewer than sample_size rows (or none at all)
last_start = max(data.shape[0] - sample_size, 0)
randomIndex = random.randint(0, last_start)
##.iloc makes it explicit that the slice is by row position, not by index label
sample = data.iloc[randomIndex:randomIndex + sample_size]
sample

Unnamed: 0,date,s_and_p_comp,dividend,earnings,CPI,fraction_date,long_interest_rate,real_price,real_dividend,real_total_return_price,real_earnings,real_scaled_earnings,CAPE,TR_CAPE,excess_CAPE,montly_bond_return,real_bond_return
398,1904-02,6.5,0.3433,0.5233,8.46793,1904.12,3.40667,210.31,11.1076,1125.33,16.9316,90.5978,15.0215,17.803,0.055357,1.00228,6.16161
399,1904-03,6.48,0.34,0.52,8.37284,1904.21,3.41333,212.044,11.1258,1139.57,17.0159,91.4469,15.0819,17.8965,0.0567927,1.00228,6.24578
400,1904-04,6.64,0.3367,0.5167,8.27768,1904.29,3.42,219.778,11.1444,1186.12,17.1023,92.2996,15.5655,18.4903,0.0534956,1.00229,6.33202
401,1904-05,6.5,0.3333,0.5133,8.08738,1904.37,3.42667,220.206,11.2915,1193.51,17.3895,94.2508,15.5258,18.4648,0.0512156,1.0023,6.49586
402,1904-06,6.51,0.33,0.51,8.08738,1904.46,3.43333,220.545,11.1797,1200.4,17.2777,94.0405,15.4744,18.4247,0.0513628,1.0023,6.51077
403,1904-07,6.78,0.3267,0.5067,8.08738,1904.54,3.44,229.692,11.0679,1255.21,17.1659,93.8072,16.0364,19.1128,0.0490316,1.00231,6.52576
404,1904-08,7.01,0.3233,0.5033,8.18251,1904.62,3.44667,234.723,10.8254,1287.63,16.8525,92.4483,16.3047,19.4498,0.046217,1.00231,6.46477
405,1904-09,7.32,0.32,0.5,8.27768,1904.71,3.45333,242.285,10.5917,1333.95,16.5495,91.117,16.7426,19.9882,0.0442991,1.00232,6.40523
406,1904-10,7.75,0.3167,0.4967,8.27768,1904.79,3.46,256.518,10.4825,1417.12,16.4403,90.8239,17.6332,21.0655,0.0440907,1.00232,6.42008
407,1904-11,8.17,0.3133,0.4933,8.46793,1904.87,3.46667,264.344,10.1369,1465.03,15.9609,88.4574,18.0762,21.6054,0.0449591,1.00233,6.29043
