In [1]:
# IMPORTS
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.metrics import confusion_matrix, mean_squared_error
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from ISLP import load_data
from datetime import datetime

In [2]:
# load the data
stocks = pd.read_csv('./data/dow_jones_index.data')

# get rid of $ signs
dollar_sign_list = []
for col in stocks.columns:
    # Check if any value in the column contains a dollar sign
    if stocks[col].astype(str).str.contains('\$').any():
        dollar_sign_list.append(col)
        
# Change dates to numeric
date_format = "%m/%d/%Y"
stocks['date'] = stocks['date'].apply(lambda x: pd.to_datetime(x, format='%m/%d/%Y'))
        
for col in dollar_sign_list:
    stocks[col] = stocks[col].replace('[\$,]', '', regex=True).astype(float)

# we can either use close or this for our dep var. Not both
stocks.drop('percent_change_price', axis = 1)

# seperate quarters
train = stocks[stocks['quarter'] == 1]
test = stocks[stocks['quarter'] == 2]

# set Xs and ys
X_train = train['close']
X_test = test['close']

y_train = train.drop('close', axis = 1)
y_test = test.drop('close', axis = 1)

In [7]:
# don't use future 
# next_weeks_close next_weeks_open percent_return_next_dividend
stocks

Unnamed: 0,quarter,stock,date,open,high,low,close,volume,percent_change_price,percent_change_volume_over_last_wk,previous_weeks_volume,next_weeks_open,next_weeks_close,percent_change_next_weeks_price,days_to_next_dividend,percent_return_next_dividend
0,1,AA,2011-01-07,15.82,16.72,15.78,16.42,239655616,3.79267,,,16.71,15.97,-4.428490,26,0.182704
1,1,AA,2011-01-14,16.71,16.71,15.64,15.97,242963398,-4.42849,1.380223,239655616.0,16.19,15.79,-2.470660,19,0.187852
2,1,AA,2011-01-21,16.19,16.38,15.60,15.79,138428495,-2.47066,-43.024959,242963398.0,15.87,16.13,1.638310,12,0.189994
3,1,AA,2011-01-28,15.87,16.63,15.82,16.13,151379173,1.63831,9.355500,138428495.0,16.18,17.14,5.933250,5,0.185989
4,1,AA,2011-02-04,16.18,17.39,16.18,17.14,154387761,5.93325,1.987452,151379173.0,17.33,17.37,0.230814,97,0.175029
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,2,XOM,2011-05-27,80.22,82.63,80.07,82.63,68230855,3.00424,-21.355713,86758820.0,83.28,81.18,-2.521610,75,0.568801
746,2,XOM,2011-06-03,83.28,83.75,80.18,81.18,78616295,-2.52161,15.221032,68230855.0,80.93,79.78,-1.420980,68,0.578960
747,2,XOM,2011-06-10,80.93,81.87,79.72,79.78,92380844,-1.42098,17.508519,78616295.0,80.00,79.02,-1.225000,61,0.589120
748,2,XOM,2011-06-17,80.00,80.82,78.33,79.02,100521400,-1.22500,8.811952,92380844.0,78.65,76.78,-2.377620,54,0.594786


In [None]:
# make lag plots of 1,2,3
# see what is the most correlated with close.

In [4]:
def return_stock(df, tic):
    df = df[df['stock'] == tic]
    df = df.drop('stock', axis=1)
    
    # Make lag variables
    df['lag1'] = df['close'].shift(1)
#     df['lag2'] = df['close'].shift(2)
#     df['lag3'] = df['close'].shift(3)

    
    train = df[df['quarter'] == 1]
    test = df[df['quarter'] == 2]
    
    y_train = train['close']
    y_test = test['close']
    X_train = train.drop('close', axis = 1)
    X_test = test.drop('close', axis = 1)
    
    return X_train, X_test, y_train, y_test

In [5]:
unique_tics = stocks['stock'].unique()
results_dict = {'Tic':[],'MSE':[]}
for tic in unique_tics:
    X_train, X_test, y_train, y_test = return_stock(stocks, tic)
    reg = DecisionTreeRegressor(random_state=123)
    reg.fit(X_train.drop('date', axis = 1), y_train)

    y_pred = reg.predict(X_test.drop('date', axis = 1))
    mse = mean_squared_error(y_test, y_pred)
    
    results_dict['Tic'].append(tic)
    results_dict['MSE'].append(mse)

In [6]:
pd.DataFrame(results_dict)

Unnamed: 0,Tic,MSE
0,AA,0.48902
1,AXP,9.219515
2,BA,18.182794
3,BAC,5.371472
4,CAT,38.707548
5,CSCO,3.518037
6,CVX,31.319445
7,DD,3.0344
8,DIS,2.344212
9,GE,0.391578


In [None]:
# linear regression
# SVM 

In [None]:
# last week in dataset (q2)
# use to predict future 1 week.w/ best model
# compute beta for risk meausure
# run lin reg to compute betas
    # regress stock prices w/ index fund. 
    # make picture of future returns w/ betas on scatter