In [1]:
import warnings
warnings.filterwarnings('ignore')

import yfinance as yf
import pandas as pd
import numpy as np
import datetime
import project_functions2 as pf
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [2]:
stock_list = ['AMZN', 'AAPL', 'FB','GOOGL', 'MSFT', 'TSLA']
stock_objects = {}
for stock in stock_list:
    stock_objects[stock] = yf.Ticker(stock)

In [4]:
stock_dfs = {}
for key in stock_objects:
    stock_dfs[key] = stock_objects[key].history(period='max')
    for key in stock_dfs:
        stock_dfs[key] = pf.date_time_prep(stock_dfs[key]) 
        stock_dfs[key] = pf.rolling_aves(stock_dfs[key])
stock_dfs['FB'].columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits',
       'Day', 'Month', 'Year', 'Days From IPO', 'Mt', 'Price Change',
       '5 Day Open Mean', '5 Day High Mean', '5 Day Low Mean',
       '5 Day Close Mean', '5 Day Volume Mean', '5 Day Open Var',
       '5 Day High Var', '5 Day Low Var', '5 Day Close Var',
       '5 Day Volume Var', '5 Day High', '5 Day Low', '5 Day SMt', '5 Day Dt',
       '10 Day Open Mean', '10 Day High Mean', '10 Day Low Mean',
       '10 Day Close Mean', '10 Day Volume Mean', '10 Day Open Var',
       '10 Day High Var', '10 Day Low Var', '10 Day Close Var',
       '10 Day Volume Var', '10 Day High', '10 Day Low', '10 Day SMt',
       '10 Day Dt', '20 Day Open Mean', '20 Day High Mean', '20 Day Low Mean',
       '20 Day Close Mean', '20 Day Volume Mean', '20 Day Open Var',
       '20 Day High Var', '20 Day Low Var', '20 Day Close Var',
       '20 Day Volume Var', '20 Day High', '20 Day Low', '20 Day SMt',
       '20 Day Dt', '5 Day Wei

In [28]:
def feature_reduction(stock_objects, split_time):
    drop_list = []
    i = 4
    
    stock_dfs = {}
    for key in stock_objects:
        stock_dfs[key] = stock_objects[key].history(period='max')

    for key in stock_dfs:
        stock_dfs[key] = pf.date_time_prep(stock_dfs[key]) 
        stock_dfs[key] = pf.rolling_aves(stock_dfs[key])
        stock_dfs[key] = pf.future_close_setup(stock_dfs[key], 5)
        
    combine_df = pf.combiner(stock_dfs)
    
    X = combine_df.iloc[:,:-1]
    y = combine_df.iloc[:,-1:]
    
    split_mark = int(len(combine_df)-(split_time*len(stock_dfs)))
    X_train = X.head(split_mark)
    X_test = X.tail(len(combine_df) - split_mark)
    y_train = y.head(split_mark)
    y_test = y.tail(len(combine_df) - split_mark)
    
    stock_model = LinearRegression().fit(X_train, y_train)
    stock_close_pred = stock_model.predict(X_test)
    max_score = r2_score(y_test.dropna(), stock_close_pred[:len(y_test.dropna())])
    
    total_cols = len(combine_df.columns)

    while i + len(drop_list) < total_cols:
        stock_dfs = {}
        for key in stock_objects:
            stock_dfs[key] = stock_objects[key].history(period='max')
    
        for key in stock_dfs:
            stock_dfs[key] = pf.date_time_prep(stock_dfs[key]) 
            stock_dfs[key] = pf.rolling_aves(stock_dfs[key])
            stock_dfs[key] = pf.future_close_setup(stock_dfs[key], 5)
            
        combine_df = pf.combiner(stock_dfs)
        
        if len(drop_list) > 0:
            combine_df.drop(drop_list, inplace=True, axis=1)
            
        curr_col = combine_df.columns[i]
        combine_df.drop(curr_col, inplace=True, axis=1)
            
        X = combine_df.iloc[:,:-1]
        y = combine_df.iloc[:,-1:]
        
        split_mark = int(len(combine_df)-(split_time*len(stock_dfs)))
        X_train = X.head(split_mark)
        X_test = X.tail(len(combine_df) - split_mark)
        y_train = y.head(split_mark)
        y_test = y.tail(len(combine_df) - split_mark)
        
        stock_model = LinearRegression().fit(X_train, y_train)
        stock_close_pred = stock_model.predict(X_test)
        curr_score = r2_score(y_test.dropna(), stock_close_pred[:len(y_test.dropna())])
        
        if curr_score >= max_score:
            drop_list.append(curr_col)
            max_score = curr_score
        else:
            i += 1
    
    return max_score, drop_list
        

In [29]:
max_score, drop_list = feature_reduction(stock_objects, 365)

In [30]:
max_score

0.9964896323194726

In [31]:
drop_list

['Stock Splits',
 'Day',
 'Days From IPO',
 '5 Day Volume Mean',
 '5 Day High Var',
 '5 Day Low Var',
 '5 Day Close Var',
 '5 Day Low',
 '10 Day Low Var',
 '10 Day Close Var',
 '10 Day High',
 '10 Day Low',
 '10 Day SMt',
 '20 Day Low Mean',
 '20 Day Low',
 '5 Day Stochastic K',
 '20 Day Stochastic K',
 '5 Day RSI']

In [42]:
def feature_reduction2(stock_objects, split_time):
    drop_list = []
    i = 0
    drop_flag = True
    drop_col = ''
    
    stock_dfs = {}
    for key in stock_objects:
        stock_dfs[key] = stock_objects[key].history(period='max')

    for key in stock_dfs:
        stock_dfs[key] = pf.date_time_prep(stock_dfs[key]) 
        stock_dfs[key] = pf.rolling_aves(stock_dfs[key])
        stock_dfs[key] = pf.future_close_setup(stock_dfs[key], 5)
        
    combine_df = pf.combiner(stock_dfs)
    
    X = combine_df.iloc[:,:-1]
    y = combine_df.iloc[:,-1:]
    
    split_mark = int(len(combine_df)-(split_time*len(stock_dfs)))
    X_train = X.head(split_mark)
    X_test = X.tail(len(combine_df) - split_mark)
    y_train = y.head(split_mark)
    y_test = y.tail(len(combine_df) - split_mark)
    
    stock_model = LinearRegression().fit(X_train, y_train)
    stock_close_pred = stock_model.predict(X_test)
    max_score = r2_score(y_test.dropna(), stock_close_pred[:len(y_test.dropna())])
    
    total_cols = len(combine_df.columns)

    while drop_flag:
        i = 0
        drop_flag = False
        
        while i + len(drop_list) < total_cols:
            stock_dfs = {}
            
            for key in stock_objects:
                stock_dfs[key] = stock_objects[key].history(period='max')
        
            for key in stock_dfs:
                stock_dfs[key] = pf.date_time_prep(stock_dfs[key]) 
                stock_dfs[key] = pf.rolling_aves(stock_dfs[key])
                stock_dfs[key] = pf.future_close_setup(stock_dfs[key], 5)
                
            combine_df = pf.combiner(stock_dfs)
            
            if len(drop_list) > 0:
                combine_df.drop(drop_list, inplace=True, axis=1)
                
            curr_col = combine_df.columns[i]
            combine_df.drop(curr_col, inplace=True, axis=1)
                
            X = combine_df.iloc[:,:-1]
            y = combine_df.iloc[:,-1:]
            
            split_mark = int(len(combine_df)-(split_time*len(stock_dfs)))
            X_train = X.head(split_mark)
            X_test = X.tail(len(combine_df) - split_mark)
            y_train = y.head(split_mark)
            y_test = y.tail(len(combine_df) - split_mark)
            
            stock_model = LinearRegression().fit(X_train, y_train)
            stock_close_pred = stock_model.predict(X_test)
            curr_score = r2_score(y_test.dropna(), stock_close_pred[:len(y_test.dropna())])
            
            if curr_score >= max_score:
                drop_col = curr_col
                max_score = curr_score
                drop_flag = True
            
            i += 1
        drop_list.append(drop_col)
    
    return max_score, drop_list

In [43]:
max_score, drop_list = feature_reduction2(stock_objects, 365)

In [44]:
max_score

0.9964578645451901

In [45]:
drop_list

['10 Day Low Var',
 '20 Day CCI',
 '5 Day CCI',
 '5 Day High Var',
 '5 Day Close Var',
 '10 Day Stochastic K',
 '5 Day Low Var',
 '5 Day Low Var']