In [127]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt 
import csv
import yfinance as yf
import statsmodels.api as sm
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV as rcv
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

# CPI and PPI

In [192]:
cpippi = pd.read_excel("desco.xls")
cpippi = cpippi[-24:]
cpippi.tail(5)

Unnamed: 0,DATE,CPIAUCSL,PPIACO
264,2022-01-01,281.933,246.453
265,2022-02-01,284.182,252.617
266,2022-03-01,287.708,259.825
267,2022-04-01,288.663,265.183
268,2022-05-01,291.474,273.209


In [66]:
cpi = cpippi['CPIAUCSL']
ppi = cpippi['PPIACO']
model = sm.OLS(cpi, ppi).fit()
predictions = model.predict(ppi)
model.summary()

0,1,2,3
Dep. Variable:,CPIAUCSL,R-squared (uncentered):,0.995
Model:,OLS,Adj. R-squared (uncentered):,0.995
Method:,Least Squares,F-statistic:,4606.0
Date:,"Fri, 08 Jul 2022",Prob (F-statistic):,5.3e-28
Time:,21:03:28,Log-Likelihood:,-104.87
No. Observations:,24,AIC:,211.7
Df Residuals:,23,BIC:,212.9
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
PPIACO,1.1964,0.018,67.866,0.000,1.160,1.233

0,1,2,3
Omnibus:,2.69,Durbin-Watson:,0.036
Prob(Omnibus):,0.261,Jarque-Bera (JB):,1.265
Skew:,-0.11,Prob(JB):,0.531
Kurtosis:,1.897,Cond. No.,1.0


In [124]:
print(predictions)
model.predict(240)[0]

245    228.752509
246    230.906037
247    232.461363
248    233.897048
249    235.093452
250    237.246980
251    239.879069
252    245.023608
253    251.962753
254    257.226932
255    260.696505
256    269.071335
257    273.856953
258    277.386345
259    279.258718
260    281.966181
261    287.693369
262    291.069622
263    288.737830
264    294.857438
265    302.232074
266    310.855757
267    317.266091
268    326.868432
dtype: float64


287.1370406615597

# Semiconductors LASSO Multiple Regression

Below, I try both a multiple regression in order to make a prediction. Both Lasso and Ridge estimators shrink the estimator to zero, however, Ridge penalizes large squared values of the coefficients. Then it minimizes the penalized sum of squared residuals:

$$\min\{\sum_{i=1}^n(Y_i - X_i^{'}b)^2 + \lambda_{Ridge}\sum_{j=1}^{k}b_j^2\}$$

Similarly, Lasso shrinks the estimate towards zero by penalizing absolute values of the coefficients as shown below:

$$\min\{\sum_{i=1}^n(Y_i - X_i^{'}b)^2 + \lambda_{Lasso}\sum_{j=1}^{k}|b_j|\}$$

In both cases we $10$-fold cross validation in order to determine the $\lambda_{Ridge}$ and $\lambda_{Lasso}$ which yield the lowest MSPE$(\lambda_{Ridge})$ and MSPE$(\lambda_{Lasso})$ respectively.

It is worth noting that Lasso looks very similar to Ridge, but turns out to have one very special property: Lasso
tends to set many the $\hat{\beta}$’s exactly to $0$. Ridge shrinks them relative to OLS, but doesn’t shrink
them all the way to zero.


This feature means that Lasso can work especially well when in reality many of the predictors are irrelevant (but we don’t know which ones). Since the dataframe is very small and likely all of them have, at least to some extent, relevance on the desired price prediction, Ridge would be the prudent decision here


In [120]:
# list of securities: SOXX, TATAELXSI, TSMC, RENESAS

# features in the dataframe will be CPI, SP500, VT (vanguard total world etf), Fed Funds Rate (risk-free)

# trying to emulate what consumers might be thinking at any given time to make decisions using public and 
# accesible data that could realistically act as a proxy for how a consumer feels about making purchases
# for discretionary electronics and durable goods, which is what semiconductors are primarily used for

# also trying to avoid multicolinearity concerns that would arise by including the price of companies that 
# use semiconductors or manufacture goods with semiconductor parts -- just want how a consumer might price in
# an adjusted CPI into making decisions

In [96]:
cpi = pd.read_excel("desco.xls").drop(['PPIACO'],axis=1).rename(columns={"CPIAUCSL":"CPI","DATE":"Date"})
fedfunds = pd.read_csv("FEDFUNDS.csv").rename(columns={"DATE":"Date"})
sp500 = pd.read_csv("SPY.csv").drop(['Open','High','Low','Adj Close','Volume'],axis=1).rename(columns={'Close':'SP500'})
vt = pd.read_csv("VT.csv").drop(['Open','High','Low','Adj Close','Volume'],axis=1).rename(columns={'Close':'VT'})

In [210]:
# need to convert CPI and FedFunds date to object to merge 
cpi['Date'] = cpi['Date'].astype(str)
cpi = cpi.replace("2022-05-01","2022-05-02")
fedfunds['Date'] = fedfunds['Date'].astype(str)
fedfunds = fedfunds.replace("2022-05-01","2022-05-02")

## SOXX

In [275]:
soxx = pd.read_csv("soxx.csv")
soxx = soxx.drop(['Open','High','Low','Adj Close','Volume'], axis = 1)
soxx = soxx.rename(columns={"Close": "soxx"})
soxx = soxx.merge(sp500,on='Date').merge(vt,on='Date').merge(fedfunds,how='left',on='Date').merge(cpi,how='left',on='Date')

# going to use basic fill forward here -- other interpolation methods may work better, but in a real setting
# consumers would be making decision with the most recent release of CPI (effectively filling forward)
soxx = soxx.ffill()[-200:] # only taking the last 200 values, since that is the most relevant during inflationary times
soxx

Unnamed: 0,Date,soxx,SP500,VT,FEDFUNDS,CPI
1059,2021-09-22,468.26001,437.859985,104.110001,0.08,274.214
1060,2021-09-23,474.5,443.179993,105.279999,0.08,274.214
1061,2021-09-24,472.799988,443.910004,104.900002,0.08,274.214
1062,2021-09-27,471.649994,442.640015,104.830002,0.08,274.214
1063,2021-09-28,453.200012,433.720001,102.650002,0.08,274.214
1064,2021-09-29,446.049988,434.450012,102.529999,0.08,274.214
1065,2021-09-30,445.869995,429.140015,101.830002,0.08,274.214
1066,2021-10-01,446.160004,434.23999,102.650002,0.08,276.59
1067,2021-10-04,434.670013,428.640015,101.449997,0.08,276.59
1068,2021-10-05,441.25,433.100006,102.239998,0.08,276.59


In [276]:
cv = RepeatedKFold(n_splits = 10, n_repeats = 3, random_state = 1)
ridge = make_pipeline(StandardScaler(with_mean=False), RidgeCV(alphas = np.arange(0.01, 20, 0.01), cv = cv))
X = soxx.drop(['soxx','Date'],axis=1)
y = soxx['soxx']
ridge.fit(X,y)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=False, with_std=True)),
                ('ridgecv',
                 RidgeCV(alphas=array([1.000e-02, 2.000e-02, 3.000e-02, ..., 1.997e+01, 1.998e+01,
       1.999e+01]),
                         cv=RepeatedKFold(n_repeats=3, n_splits=10, random_state=1),
                         fit_intercept=True, gcv_mode=None, normalize=False,
                         scoring=None, store_cv_values=False))],
         verbose=False)

In [277]:
print(ridge[1].coef_) # coefficients of ridge regression

[ 72.2844146  -25.9306061    5.50713729 -12.19887563]


In [278]:
print('alpha: %f' % ridge[1].alpha_)
adj_vect = soxx[-1:].drop(['Date','soxx'], axis=1).replace(291.474, 287.137)
adj_pred = ridge.predict(adj_vect)[0]
print("Adjusted Prediction: " + str(adj_pred))
old_pred = ridge.predict(soxx[-1:].drop(['Date','soxx'], axis=1))[0]
print("Old Prediction: " + str(old_pred))
soxx_price_pred = (adj_pred/old_pred)*(list(y[-1:])[0])
print("New Price Prediction: " + str(soxx_price_pred))

alpha: 0.040000
Adjusted Prediction: 389.59824574109746
Old Prediction: 380.2860511741659
New Price Prediction: 367.32992415573824


## Renesas Electronics

In [279]:
renesas = pd.read_csv("renesas.csv")
renesas = renesas.drop(['Open','High','Low','Adj Close','Volume'], axis = 1)
renesas = renesas.rename(columns={"Close": "renesas"})
renesas = renesas.merge(sp500,on='Date').merge(vt,on='Date').merge(fedfunds,how='left',on='Date').merge(cpi,how='left',on='Date')
renesas = renesas.ffill()[-200:] # only taking the last 200 values, since that is the most relevant during inflationary times
renesas

Unnamed: 0,Date,renesas,SP500,VT,FEDFUNDS,CPI
1059,2021-09-22,6.23,437.859985,104.110001,0.08,274.214
1060,2021-09-23,6.27,443.179993,105.279999,0.08,274.214
1061,2021-09-24,6.5,443.910004,104.900002,0.08,274.214
1062,2021-09-27,6.46,442.640015,104.830002,0.08,274.214
1063,2021-09-28,6.22,433.720001,102.650002,0.08,274.214
1064,2021-09-29,6.31,434.450012,102.529999,0.08,274.214
1065,2021-09-30,6.21,429.140015,101.830002,0.08,274.214
1066,2021-10-01,6.2,434.23999,102.650002,0.08,276.59
1067,2021-10-04,5.79,428.640015,101.449997,0.08,276.59
1068,2021-10-05,5.85,433.100006,102.239998,0.08,276.59


In [280]:
X = renesas.drop(['renesas','Date'],axis=1)
y = renesas['renesas']
ridge.fit(X,y)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=False, with_std=True)),
                ('ridgecv',
                 RidgeCV(alphas=array([1.000e-02, 2.000e-02, 3.000e-02, ..., 1.997e+01, 1.998e+01,
       1.999e+01]),
                         cv=RepeatedKFold(n_repeats=3, n_splits=10, random_state=1),
                         fit_intercept=True, gcv_mode=None, normalize=False,
                         scoring=None, store_cv_values=False))],
         verbose=False)

In [281]:
print(ridge[1].coef_) # coefficients of ridge regression

[-0.39159208  1.00146094  0.14123104  0.04565423]


In [282]:
print('alpha: %f' % ridge[1].alpha_)
adj_vect = renesas[-1:].drop(['Date','renesas'], axis=1).replace(291.474, 287.137)
adj_pred = ridge.predict(adj_vect)[0]
print("Adjusted Prediction: " + str(adj_pred))
old_pred = ridge.predict(renesas[-1:].drop(['Date','renesas'], axis=1))[0]
print("Old Prediction: " + str(old_pred))
renesas_price_pred = (adj_pred/old_pred)*(list(y[-1:])[0])
print("New Price Prediction: " + str(renesas_price_pred))

alpha: 0.140000
Adjusted Prediction: 4.878894454999991
Old Prediction: 4.913745296433885
New Price Prediction: 4.527658104490297


## TATAELXSI

In [283]:
tata = pd.read_csv("tata.csv")
tata = tata.drop(['Open','High','Low','Adj Close','Volume'], axis = 1)
tata = tata.rename(columns={"Close": "tata"})
tata = tata.merge(sp500,on='Date').merge(vt,on='Date').merge(fedfunds,how='left',on='Date').merge(cpi,how='left',on='Date')
tata = tata.ffill()[-200:] # only taking the last 200 values, since that is the most relevant during inflationary times
tata

Unnamed: 0,Date,tata,SP500,VT,FEDFUNDS,CPI
1000,2021-09-09,4928.75,448.980011,106.540001,0.08,274.214
1001,2021-09-13,5009.299805,446.579987,106.379997,0.08,274.214
1002,2021-09-14,4987.0,444.170013,105.75,0.08,274.214
1003,2021-09-15,5113.399902,447.880005,106.389999,0.08,274.214
1004,2021-09-16,5153.600098,447.170013,106.150002,0.08,274.214
1005,2021-09-17,5490.649902,441.399994,105.099998,0.08,274.214
1006,2021-09-20,5831.700195,434.040009,102.800003,0.08,274.214
1007,2021-09-21,5619.950195,433.630005,103.120003,0.08,274.214
1008,2021-09-22,5580.149902,437.859985,104.110001,0.08,274.214
1009,2021-09-23,5726.799805,443.179993,105.279999,0.08,274.214


In [284]:
X = tata.drop(['tata','Date'],axis=1)
y = tata['tata']
ridge.fit(X,y)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=False, with_std=True)),
                ('ridgecv',
                 RidgeCV(alphas=array([1.000e-02, 2.000e-02, 3.000e-02, ..., 1.997e+01, 1.998e+01,
       1.999e+01]),
                         cv=RepeatedKFold(n_repeats=3, n_splits=10, random_state=1),
                         fit_intercept=True, gcv_mode=None, normalize=False,
                         scoring=None, store_cv_values=False))],
         verbose=False)

In [285]:
print(ridge[1].coef_) # coefficients of ridge regression

[-752.22003139 1005.90989833 -214.42225218 1410.44177904]


In [286]:
print('alpha: %f' % ridge[1].alpha_)
adj_vect = tata[-1:].drop(['Date','tata'], axis=1).replace(291.474, 287.137)
adj_pred = ridge.predict(adj_vect)[0]
print("Adjusted Prediction: " + str(adj_pred))
old_pred = ridge.predict(tata[-1:].drop(['Date','tata'], axis=1))[0]
print("Old Prediction: " + str(old_pred))
tata_price_pred = (adj_pred/old_pred)*(list(y[-1:])[0])
print("New Price Prediction: " + str(tata_price_pred))

alpha: 0.160000
Adjusted Prediction: 6814.819157661208
Old Prediction: 7876.941328211244
New Price Prediction: 6915.358235196771


## TSMC

In [287]:
tsm = pd.read_csv("tsm.csv")
tsm = tsm.drop(['Open','High','Low','Adj Close','Volume'], axis = 1)
tsm = tsm.rename(columns={"Close": "tsm"})
tsm = tsm.merge(sp500,on='Date').merge(vt,on='Date').merge(fedfunds,how='left',on='Date').merge(cpi,how='left',on='Date')
tsm = tsm.ffill()[-200:] # only taking the last 200 values, since that is the most relevant during inflationary times
tsm

Unnamed: 0,Date,tsm,SP500,VT,FEDFUNDS,CPI
1059,2021-09-22,115.870003,437.859985,104.110001,0.08,274.214
1060,2021-09-23,116.190002,443.179993,105.279999,0.08,274.214
1061,2021-09-24,115.639999,443.910004,104.900002,0.08,274.214
1062,2021-09-27,116.150002,442.640015,104.830002,0.08,274.214
1063,2021-09-28,111.900002,433.720001,102.650002,0.08,274.214
1064,2021-09-29,111.620003,434.450012,102.529999,0.08,274.214
1065,2021-09-30,111.650002,429.140015,101.830002,0.08,274.214
1066,2021-10-01,111.559998,434.23999,102.650002,0.08,276.59
1067,2021-10-04,109.019997,428.640015,101.449997,0.08,276.59
1068,2021-10-05,109.260002,433.100006,102.239998,0.08,276.59


In [288]:
X = tsm.drop(['tsm','Date'],axis=1)
y = tsm['tsm']
ridge.fit(X,y)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=False, with_std=True)),
                ('ridgecv',
                 RidgeCV(alphas=array([1.000e-02, 2.000e-02, 3.000e-02, ..., 1.997e+01, 1.998e+01,
       1.999e+01]),
                         cv=RepeatedKFold(n_repeats=3, n_splits=10, random_state=1),
                         fit_intercept=True, gcv_mode=None, normalize=False,
                         scoring=None, store_cv_values=False))],
         verbose=False)

In [289]:
print(ridge[1].coef_) # coefficients of ridge regression

[ 3.86839477  3.24723447 -3.26429858 -2.98837915]


In [290]:
print('alpha: %f' % ridge[1].alpha_)
adj_vect = tsm[-1:].drop(['Date','tsm'], axis=1).replace(291.474, 287.137)
adj_pred = ridge.predict(adj_vect)[0]
print("Adjusted Prediction: " + str(adj_pred))
old_pred = ridge.predict(tsm[-1:].drop(['Date','tsm'], axis=1))[0]
print("Old Prediction: " + str(old_pred))
tsm_price_pred = (adj_pred/old_pred)*(list(y[-1:])[0])
print("New Price Prediction: " + str(tsm_price_pred))

alpha: 15.450000
Adjusted Prediction: 86.90262516769211
Old Prediction: 84.62140115161655
New Price Prediction: 83.70734890731026


## Conclusion

In [299]:
print("SOXX Current vs. Altered: "  + '\t' + str(list(soxx['soxx'][-1:])[0]) + '\t' + str(soxx_price_pred))
print("Renesas Electronics Current vs. Altered: "  + '\t' + str(list(renesas['renesas'][-1:])[0]) + '\t' + str(renesas_price_pred))
print("TATA ELXSI Current vs. Altered: "  + '\t' + str(list(tata['tata'][-1:])[0]) + '\t' + str(tata_price_pred))
print("TSMC Current vs. Altered: "  + '\t' + str(list(tsm['tsm'][-1:])[0]) + '\t' + str(tsm_price_pred))

SOXX Current vs. Altered: 	358.549988	367.32992415573824
Renesas Electronics Current vs. Altered: 	4.56	4.527658104490297
TATA ELXSI Current vs. Altered: 	7993.149902	6915.358235196771
TSMC Current vs. Altered: 	81.510002	83.70734890731026
