# Time Series on Stocks - Lineer Regression
## 4 stocks/tickers on time series
- 1259 rows, 4 stocks 
- Percent change, rolling
- Porfolio decisions with lineer regression

In [None]:
import pandas as pd
stocks = pd.read_csv('data/tickers.csv', index_col=0)

In [3]:
stocks.head()

Unnamed: 0_level_0,cmg,tan,flsr,rgse
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-05-19,501.290009,33.851028,59.830002,27720.0
2014-05-20,495.649994,33.168442,58.630001,26280.0
2014-05-21,504.179993,34.892876,60.389999,24960.0
2014-05-22,520.47998,35.764076,60.630001,27840.0
2014-05-23,522.320007,36.356853,60.669998,28080.0


In [27]:
stocks.describe()

Unnamed: 0,cmg,tan,flsr,rgse
count,1259.0,1259.0,1259.0,1259.0
mean,502.587093,25.363513,51.912296,2909.743074
std,133.282875,7.233387,11.836525,7116.603034
min,251.330002,15.893574,26.33,0.07
25%,408.044998,20.073106,43.765001,0.93
50%,469.290009,23.266693,50.830002,21.9
75%,636.424988,29.658448,61.5,846.0
max,757.77002,45.710979,77.949997,35760.0


In [5]:
stocks.memory_usage()

Index    10072
cmg      10072
tan      10072
flsr     10072
rgse     10072
dtype: int64

In [7]:
# percent change per past day
stocks.pct_change()

Unnamed: 0_level_0,cmg,tan,flsr,rgse
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-05-19,,,,
2014-05-20,-0.011251,-0.020164,-0.020057,-0.051948
2014-05-21,0.017210,0.051990,0.030019,-0.050228
2014-05-22,0.032330,0.024968,0.003974,0.115385
2014-05-23,0.003535,0.016575,0.000660,0.008621
...,...,...,...,...
2019-05-13,-0.025757,-0.037810,-0.026504,0.000000
2019-05-14,0.023150,0.011871,0.015925,0.363636
2019-05-15,0.003854,0.020227,0.008933,0.000000
2019-05-16,0.004307,0.024980,0.013532,0.066667


In [9]:
# percent change per past 5 days, week period
stocks.pct_change(periods=5)

Unnamed: 0_level_0,cmg,tan,flsr,rgse
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-05-19,,,,
2014-05-20,,,,
2014-05-21,,,,
2014-05-22,,,,
2014-05-23,,,,
...,...,...,...,...
2019-05-13,-0.041023,-0.043836,-0.031348,0.571429
2019-05-14,-0.004727,-0.054685,-0.038256,1.142857
2019-05-15,-0.005186,-0.020963,-0.007791,0.875000
2019-05-16,0.005389,0.041499,0.029701,1.000000


In [11]:
# percent change per past 20 days, month period
stocks.rolling(window=20).mean()

Unnamed: 0_level_0,cmg,tan,flsr,rgse
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-05-19,,,,
2014-05-20,,,,
2014-05-21,,,,
2014-05-22,,,,
2014-05-23,,,,
...,...,...,...,...
2019-05-13,698.909503,25.3525,60.6460,0.0940
2019-05-14,698.455002,25.3525,60.6010,0.0930
2019-05-15,698.425504,25.3670,60.5685,0.0945
2019-05-16,699.461502,25.4015,60.5690,0.0970


A function called up_or_down that takes in a dataframe object and returns a dataframe object whose values are a count of the periods, by column, of positive percent change.

For example, if a stock tan had 222 days of positive percent change, we'd see a returned Series object that may look like this:

cmg     111
tan     222
flsr    333
rgse    444
dtype: int64

In [23]:
def up_or_down(df):
    import pandas as pd
    '''
    This function takes in a value
    and returns a 0 if the value is
    negative and a 1 where the value is
    positive.
    '''
    return df.pct_change()\
        .apply(lambda x: np.where(x>0, 1, 0))\
        .sum()

up_or_down

<function __main__.up_or_down(df)>

In [24]:
stocks.pct_change().corr()

Unnamed: 0,cmg,tan,flsr,rgse
cmg,1.0,0.1418,0.087468,0.031374
tan,0.1418,1.0,0.653843,0.1328
flsr,0.087468,0.653843,1.0,0.092919
rgse,0.031374,0.1328,0.092919,1.0


## Making portfolio decisions
Correlated stocks are not good to have in a single portfolio, it's time to determine whether there are any symbol(s) that should be dropped if one owns tan.

In [16]:
def stocks_to_drop(df, stock='tan'):
    s = df.pct_change().corr()[stock] > 0.5
    return set([ x for x in list(s[s == True].index) if x != stock])

stocks_to_drop(pd.read_csv('data/tickers.csv', index_col=0))

{'flsr'}

## Regression with statsmodel

In [18]:
# Select the non-tan features
# percent change values to X and
# the tan percent change series to y
# Also, use .dropna() to drop the missing values
# from each prior to forming the X and y.
X = pd.read_csv('data/tickers.csv', index_col=0).pct_change().dropna().drop('tan', axis=1)
y = pd.read_csv('data/tickers.csv', index_col=0).pct_change().dropna()['tan']

In [21]:
import statsmodels.api as sm
# Prepare the input features by adding a constant term for the intercept term
X_const = sm.add_constant(X)

In [22]:
# Instantinate an empty model class with sm.OLS()
model = sm.regression.linear_model.OLS(y, X)

In [29]:
# Fit the model
res = model.fit()
print(res.summary())

                                 OLS Regression Results                                
Dep. Variable:                    tan   R-squared (uncentered):                   0.440
Model:                            OLS   Adj. R-squared (uncentered):              0.438
Method:                 Least Squares   F-statistic:                              328.1
Date:                Wed, 01 Jul 2020   Prob (F-statistic):                   2.94e-157
Time:                        18:18:26   Log-Likelihood:                          3614.0
No. Observations:                1258   AIC:                                     -7222.
Df Residuals:                    1255   BIC:                                     -7207.
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

## p-value is significantly low and R-square is moderately high
## flsr is correlated strong enough to avoid to match with tan 