In [153]:
import yfinance as yf
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd
# Download daily META price history from Yahoo Finance for the given start/end
# dates and turn the date index into a regular 'Date' column.
df = yf.download('META', start='2010-01-01', end='2024-01-01').reset_index()
# yfinance can label the columns with two levels (field, ticker). Keep the field
# name from the first level instead of assigning a hard-coded list by position:
# a positional relabel silently mislabels the prices if the field order differs
# and raises if the column count is not exactly six.
if isinstance(df.columns, pd.MultiIndex):
    df.columns = list(df.columns.get_level_values(0))
print(df.head())

[*********************100%***********************]  1 of 1 completed

        Date      Close       High        Low       Open     Volume
0 2012-05-18  38.050671  44.788914  37.821750  41.852752  573576400
1 2012-05-21  33.870369  36.488033  32.845202  36.358642  168192700
2 2012-05-22  30.854582  33.432433  30.794864  32.457030  101786600
3 2012-05-23  31.849894  32.347548  31.212896  31.222850   73600000
4 2012-05-24  32.875061  33.054217  31.620973  32.795438   50237200





In [154]:
# Keep only the closing price. The explicit .copy() makes df an independent
# frame, so adding a column below does not write into a slice of the original
# download (which would raise pandas' SettingWithCopyWarning).
df = df[['Close']].copy()
print(df.head())
# A variable for predicting 'x' days out into the future
# (one row per trading day in the downloaded data)
future_day = 100

# Create the target column: the close price `future_day` rows later.
# The last `future_day` rows have no future value and become NaN.
df['Prediction'] = df['Close'].shift(-future_day)
print(df.head())

       Close
0  38.050671
1  33.870369
2  30.854582
3  31.849894
4  32.875061
       Close  Prediction
0  38.050671   19.547871
1  33.870369   19.657354
2  30.854582   19.428436
3  31.849894   19.428436
4  32.875061   19.388620


In [155]:
# Independent data set X: every column except the target, as a 2-D numpy array,
# with the trailing `future_day` rows dropped (their target is NaN).
X = np.array(df.drop(columns=['Prediction']))[:-future_day]
print(X)

# Dependent data set y: every column except 'Close', as a 2-D numpy array.
# At this point the NaN tail is still included.
y = np.array(df.drop(columns=['Close']))
print(y)
# Trim the same trailing `future_day` rows so y lines up with X
y = y[:-future_day]
print(y)

[[ 38.05067062]
 [ 33.87036896]
 [ 30.85458183]
 ...
 [309.2723999 ]
 [315.07504272]
 [311.17346191]]
[[19.54787064]
 [19.65735435]
 [19.42843628]
 ...
 [        nan]
 [        nan]
 [        nan]]
[[ 19.54787064]
 [ 19.65735435]
 [ 19.42843628]
 ...
 [356.15145874]
 [356.63919067]
 [352.29959106]]


In [156]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state fixes the shuffle so the split is reproducible.
# NOTE(review): train_test_split shuffles by default; since the rows are
# consecutive trading days, later dates end up in the training set. Consider
# shuffle=False for a chronological hold-out -- confirm before relying on scores.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=1,
)

In [157]:
# Tune a support vector regressor with a cross-validated grid search
# over the kernel type and the regularisation strength C.
parameters_svr = {'kernel':['linear', 'poly', 'rbf'], 'C':[0.01, 0.1, 1]}
svr = SVR()
clf_svr = GridSearchCV(svr, parameters_svr)
# y_train is an (n, 1) column; flatten it to 1-D for the SVR fit
clf_svr.fit(x_train, y_train.ravel())

# Collect the winning configuration from the search
best_model_svr = clf_svr.best_estimator_
best_params_svr = clf_svr.best_params_
best_score_svr = clf_svr.best_score_

print(best_model_svr)
# best_params came out as C=1, kernel=rbf
print(best_params_svr)
# best cross-validated score came out as 0.7744334702078793
print(best_score_svr)
# score of the refit best model on the held-out test rows
print(clf_svr.score(x_test, y_test))
print(clf_svr.cv_results_)

# Side-by-side table: one row per parameter combination with its mean CV score
hyperparameter_grid = pd.DataFrame(clf_svr.cv_results_['params'])
grid_scores = pd.DataFrame({'score': clf_svr.cv_results_['mean_test_score']})
df_svr_params_scores = pd.concat([hyperparameter_grid, grid_scores], axis=1)
print(df_svr_params_scores)


SVR(C=1)
{'C': 1, 'kernel': 'rbf'}
0.7744334702078793
0.7990393348106678
{'mean_fit_time': array([0.07177   , 0.04397163, 0.04401937, 0.20447474, 0.05119858,
       0.04344006, 1.3057426 , 0.07395597, 0.04205661]), 'std_fit_time': array([0.00382882, 0.00079516, 0.00037227, 0.01700184, 0.00525086,
       0.00033899, 0.08474279, 0.00436401, 0.00024629]), 'mean_score_time': array([0.00640244, 0.00501103, 0.01864424, 0.0062119 , 0.00495596,
       0.01846051, 0.00632734, 0.00517521, 0.01834507]), 'std_score_time': array([1.94001618e-04, 1.46433066e-04, 3.49798379e-04, 1.42864630e-04,
       7.49443916e-05, 4.19709103e-04, 1.82894583e-04, 9.71993219e-05,
       3.09531098e-04]), 'param_C': masked_array(data=[0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0],
             mask=[False, False, False, False, False, False, False, False,
                   False],
       fill_value=1e+20), 'param_kernel': masked_array(data=['linear', 'poly', 'rbf', 'linear', 'poly', 'rbf',
                   'linea

In [158]:
# Create and train the Linear Regression model, tuned with a cross-validated grid search
lr = LinearRegression()
# NOTE(review): n_jobs only controls parallelism, so both of its values score
# identically in cv_results_; fit_intercept is the only setting that changes the fit.
parameters_lr = {'fit_intercept': [True, False], 'n_jobs':[1,-1]}
# Build the search once (the original constructed the same object twice)
clf_lr = GridSearchCV(lr, parameters_lr)
# y_train is left as an (n, 1) column here, so predictions come back 2-D
clf_lr.fit(x_train, y_train)
best_model_lr = clf_lr.best_estimator_
print(best_model_lr)
best_params_lr = clf_lr.best_params_
print(best_params_lr)
# prints out best_params as fit_intercept=True, n_jobs=1
best_score_lr = clf_lr.best_score_
print(best_score_lr)
# prints out best score as 0.7530130357372388
# score of the refit best model on the held-out test rows
print(clf_lr.score(x_test, y_test))
print(clf_lr.cv_results_)
# Side-by-side table: one row per parameter combination with its mean CV score
hyperparameter_grid = pd.DataFrame(clf_lr.cv_results_['params'])
grid_scores = pd.DataFrame(clf_lr.cv_results_['mean_test_score'], columns=['score'])
df_lr_params_scores = pd.concat([hyperparameter_grid, grid_scores], axis = 1)
print(df_lr_params_scores)

# Note to self: hyperparameter tuning is not as useful for linear regression as it is for support vector regression

LinearRegression(n_jobs=1)
{'fit_intercept': True, 'n_jobs': 1}
0.7530130357372388
0.7632502357138924
{'mean_fit_time': array([0.00030799, 0.00019178, 0.00017109, 0.00016618]), 'std_fit_time': array([1.72108848e-04, 4.41325606e-06, 2.37750070e-06, 6.64499569e-06]), 'mean_score_time': array([0.0001688 , 0.00015225, 0.00014615, 0.00014586]), 'std_score_time': array([3.17778965e-05, 2.27486810e-06, 1.55246931e-06, 5.81350351e-06]), 'param_fit_intercept': masked_array(data=[True, True, False, False],
             mask=[False, False, False, False],
       fill_value=True), 'param_n_jobs': masked_array(data=[1, -1, 1, -1],
             mask=[False, False, False, False],
       fill_value=999999), 'params': [{'fit_intercept': True, 'n_jobs': 1}, {'fit_intercept': True, 'n_jobs': -1}, {'fit_intercept': False, 'n_jobs': 1}, {'fit_intercept': False, 'n_jobs': -1}], 'split0_test_score': array([0.77776096, 0.77776096, 0.75880883, 0.75880883]), 'split1_test_score': array([0.69767877, 0.69767877, 0.

In [159]:
# Feature rows to forecast from: the last `future_day` rows of the
# non-target columns (the rows that were trimmed off when building X)
x_forecast = np.array(df.drop(columns=['Prediction']))[-future_day:]

In [160]:
# Run both tuned models on the forecast rows, then print the predictions
# for the next `future_day` days: linear regression first, then SVR
lr_predict = clf_lr.predict(x_forecast)
svr_predict = clf_svr.predict(x_forecast)
print(lr_predict)
print(svr_predict)

[[295.69288574]
 [296.15887031]
 [292.55437035]
 [296.55447333]
 [292.82692024]
 [286.09266553]
 [278.00455676]
 [276.38691344]
 [282.2332178 ]
 [280.21118387]
 [286.04870065]
 [279.46391574]
 [278.3649825 ]
 [282.54973257]
 [289.34550038]
 [286.80478333]
 [287.499299  ]
 [287.93007932]
 [291.24445422]
 [290.38289357]
 [289.94332567]
 [289.25757062]
 [297.7589115 ]
 [292.57197247]
 [295.56107197]
 [301.41611001]
 [291.38510948]
 [293.3544179 ]
 [295.5698326 ]
 [290.82246148]
 [287.35861678]
 [290.3037514 ]
 [291.84225255]
 [290.19824648]
 [289.1257299 ]
 [294.5940064 ]
 [291.29720668]
 [297.1083607 ]
 [291.93899685]
 [296.01821505]
 [295.32369937]
 [304.67775939]
 [307.25365374]
 [310.31305391]
 [315.57034752]
 [312.35268997]
 [304.0272086 ]
 [309.70644104]
 [312.21198079]
 [306.03164041]
 [302.37438799]
 [298.71713557]
 [303.42938331]
 [302.14585688]
 [290.69938139]
 [280.87052225]
 [288.23780651]
 [293.45113524]
 [292.229068  ]
 [301.53040251]
 [300.66886883]
 [303.94809338]
 [305.00

In [161]:
'''
Note to self: support vector regression (SVR) is more accurate than linear regression at capturing non-linear relationships (using kernel=rbf for SVR).
SVR handles outliers better and is more flexible (more hyperparameters to control).
Linear regression is better at handling linear relationships; it is computationally less expensive, faster, and easier to understand.
'''

'\nNote to self: support vector regression (SVR) is more accurate than linear regression at capturing non-linear relationships (using kernel=rbf for SVR).\nSVR handles outliers better and is more flexible (more hyperparameters to control).\nLinear regression is better at handling linear relationships; it is computationally less expensive, faster, and easier to understand.\n'