In [1]:
# IMPORTS
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.metrics import confusion_matrix, mean_squared_error
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from ISLP import load_data
from datetime import datetime

In [2]:
# load the data
stocks = pd.read_csv('./data/dow_jones_index.data')

# get rid of $ signs
dollar_sign_list = []
for col in stocks.columns:
    # Check if any value in the column contains a dollar sign
    if stocks[col].astype(str).str.contains('\$').any():
        dollar_sign_list.append(col)
        
# Change dates to numeric
date_format = "%m/%d/%Y"
stocks['date'] = stocks['date'].apply(lambda x: pd.to_datetime(x, format='%m/%d/%Y'))
        
for col in dollar_sign_list:
    stocks[col] = stocks[col].replace('[\$,]', '', regex=True).astype(float)

# we can either use close or this for our dep var. Not both
stocks.drop('percent_change_price', axis = 1)

# seperate quarters
train = stocks[stocks['quarter'] == 1]
test = stocks[stocks['quarter'] == 2]

# set Xs and ys
X_train = train['close']
X_test = test['close']

y_train = train.drop('close', axis = 1)
y_test = test.drop('close', axis = 1)

In [3]:
stocks['date']

0     2011-01-07
1     2011-01-14
2     2011-01-21
3     2011-01-28
4     2011-02-04
         ...    
745   2011-05-27
746   2011-06-03
747   2011-06-10
748   2011-06-17
749   2011-06-24
Name: date, Length: 750, dtype: datetime64[ns]

In [4]:
def return_stock(df, tic):
    df = df[df['stock'] == tic]
    df = df.drop('stock', axis=1)
    
    # Make lag variables
    df['lag1'] = df['close'].shift(1)
    df['lag2'] = df['close'].shift(2)
    df['lag3'] = df['close'].shift(3)

    
    train = df[df['quarter'] == 1]
    test = df[df['quarter'] == 2]
    
    y_train = train['close']
    y_test = test['close']
    X_train = train.drop('close', axis = 1)
    X_test = test.drop('close', axis = 1)
    
    return X_train, X_test, y_train, y_test

In [5]:
unique_tics = stocks['stock'].unique()
results_dict = {'Tic':[],'MSE':[]}
for tic in unique_tics:
    X_train, X_test, y_train, y_test = return_stock(stocks, tic)
    reg = DecisionTreeRegressor(random_state=123)
    reg.fit(X_train.drop('date', axis = 1), y_train)

    y_pred = reg.predict(X_test.drop('date', axis = 1))
    mse = mean_squared_error(y_test, y_pred)
    
    results_dict['Tic'].append(tic)
    results_dict['MSE'].append(mse)

In [6]:
pd.DataFrame(results_dict)

Unnamed: 0,Tic,MSE
0,AA,0.48902
1,AXP,9.219515
2,BA,18.182794
3,BAC,5.371472
4,CAT,38.707548
5,CSCO,3.518037
6,CVX,31.319445
7,DD,3.0344
8,DIS,2.344212
9,GE,0.391578
