In [8]:
import time
import datetime
import pandas as pd
import yfinance
import statistics
import pandas as pd
import numpy as np
from scipy import stats
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import joblib
%matplotlib inline

In [3]:
def date_to_sec(date):
    i = str(date).split('-')
    return int(time.mktime(datetime.datetime(int(i[0]), int(i[1]), int(i[2]), 0).timetuple()))

In [4]:
# Gets historical Stock price data from yahoo finance. 
# Imputs: ticker - the stock ticker. String. 1-4 characters.
#         time_period - the time, in days history back from the present that data should be retrived. Int.
# output: a pandas dataframe.
def get_historical_data(ticker, time_period):
    period2 = int(time.mktime(datetime.date.today().timetuple()))
    period1 = int(time.mktime(datetime.date.today().timetuple())) - 3600*24*7*time_period
    query_string = f'https://query1.finance.yahoo.com/v7/finance/download/{ticker}?period1={period1}&period2={period2}&interval=1d&events=history&includeAdjustedClose=true'
    df = pd.read_csv(query_string) 
    df['Time'] = df['Date'].apply(date_to_sec)
    df.drop(columns ='Date', inplace = True)
    return df

In [19]:
# Inputs: lookback: the maximum number of weeks backwards from today that the function should look at. 
#         df: the dataframe with the stock data on it.
# Outputs: tuple: [of days backwards the model uses, mean R^2]
#
def find_best_lin_reg(lookback, df):
    starttime = int(time.mktime(datetime.datetime.now().timetuple()))
    #stores the max mean adjusted R^2 found, and the number of weeks back it was aquired.
    maxscore = [0, 0]
    for l in range(10, lookback+1):
        #cuts out all data that is outside of the timeframe the given model is looking at.
        df2 = df[df.Time >= (int(time.mktime(datetime.date.today().timetuple())) - 3600*24*7*l)]
        # sets the dependent and independent variables.
        X = pd.DataFrame(df2['Time'])
        y = pd.DataFrame(df2['Open'])
        #selects the model to be used.
        model = LinearRegression()
        #stores the adjusted R^2 values for each k fold.
        scores = []
        # folds that dough, fits the model, etc.
        kfold = KFold(n_splits=3, shuffle=True, random_state=42)
        for i, (train, test) in enumerate(kfold.split(X, y)):
         model.fit(X.iloc[train,:], y.iloc[train,:])
         #calculates the ADJUSTED R^2.
         score = 1 - ( 1-model.score(X.iloc[test,:], y.iloc[test,:]) ) * ( len(y) - 1 ) / ( len(y) - X.shape[1] - 1 )
         scores.append(score)
        # If it's the best score, keep it.
        if statistics.mean(scores) >= maxscore[0]:
            maxscore[0] = statistics.mean(scores)
            maxscore[1] = l
    endtime = int(time.mktime(datetime.datetime.now().timetuple()))
    print("Total time: "+str(endtime-starttime)+" seconds")
    return maxscore

In [20]:
ticker = 'TSLA'
period = 300
df = get_historical_data(ticker, period)
print(find_best_lin_reg(period, df))


Total time: 4 seconds
[0.8581014898462324, 99]


In [14]:
def make_lin_reg_model(df, period, ticker):
    # cut the dataset to the appropriate length.
    df2 = df[df.Time >= (int(time.mktime(datetime.date.today().timetuple())) - 3600*24*7*period)]
    # sets the dependent and independent variables.
    X = pd.DataFrame(df2['Time'])
    y = pd.DataFrame(df2['Open'])
    # build the model using all data. 
    model = LinearRegression()
    model.fit(X,y)
    # save model to a file.
    filename = f'lin_reg_model_{ticker}.sav'
    joblib.dump(model, filename)
    return filename

In [12]:
ticker = 'TSLA'
period = 300
df = get_historical_data(ticker, period)
best_model = find_best_lin_reg(period, df)
make_lin_reg_model(df, best_model[1], ticker)

Total time: 7 seconds


In [33]:
# Inputs: lookforward is in number of days
def predict_price(ticker, filename, lookforward):
    # load the model
    loaded_model = joblib.load(filename)
    predictions = []
    for i in range(0, lookforward+1):
        prediction = loaded_model.predict([[(int(time.mktime(datetime.date.today().timetuple())) + 3600*24*i)]])
        predictions.append(prediction[0][0])
    return predictions

In [35]:
ticker = 'TSLA'
period = 300
lookforward = 30
df = get_historical_data(ticker, period)
best_model = find_best_lin_reg(period, df)
filename = make_lin_reg_model(df, best_model[1], ticker)
predict_price(ticker, filename, lookforward)

Total time: 4 seconds


[787.2518706852898,
 788.4722855761356,
 789.6927004669815,
 790.9131153578273,
 792.1335302486732,
 793.3539451395227,
 794.5743600303686,
 795.7947749212144,
 797.0151898120603,
 798.2356047029061,
 799.456019593752,
 800.6764344845978,
 801.8968493754437,
 803.1172642662896,
 804.3376791571391,
 805.5580940479849,
 806.7785089388308,
 807.9989238296766,
 809.2193387205225,
 810.4397536113684,
 811.6601685022142,
 812.8805833930601,
 814.1009982839096,
 815.3214131747554,
 816.5418280656013,
 817.7622429564472,
 818.982657847293,
 820.2030727381389,
 821.4234876289847,
 822.6439025198306,
 823.8643174106764]

In [None]:
def make_models(ticker):
    