In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the news-sentiment table from a CSV expected in the working directory.
df_news = pd.read_csv("Reliance_sentiment.csv")

In [3]:
# Load the price/indicator table from a CSV expected in the working directory.
df_price = pd.read_csv("reliance_prices.csv")

In [4]:
# Summarise the price frame: column names, non-null counts and dtypes.
df_price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1384 entries, 0 to 1383
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         1384 non-null   object 
 1   Open         1384 non-null   float64
 2   High         1384 non-null   float64
 3   Low          1384 non-null   float64
 4   Close        1384 non-null   float64
 5   Adj Close    1384 non-null   float64
 6   Volume       1384 non-null   int64  
 7   1d_diff      1384 non-null   float64
 8   5d_diff      1384 non-null   float64
 9   10d_diff     1384 non-null   float64
 10  SMA3         1384 non-null   float64
 11  SMA5         1384 non-null   float64
 12  SMA9         1384 non-null   float64
 13  SMA15        1384 non-null   float64
 14  SMA30        1384 non-null   float64
 15  EMA3         1384 non-null   float64
 16  EMA5         1384 non-null   float64
 17  EMA9         1384 non-null   float64
 18  EMA15        1384 non-null   float64
 19  EMA30 

In [5]:
# Summarise the sentiment frame: column names, non-null counts and dtypes.
df_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 654 entries, 0 to 653
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   date              654 non-null    object 
 1   title_senti_comp  654 non-null    float64
 2   news_senti_comp   654 non-null    float64
 3   avg_senti_comp    654 non-null    float64
dtypes: float64(3), object(1)
memory usage: 20.6+ KB


In [6]:
# Both CSVs load their date column as plain strings (dtype `object`);
# convert them to datetimes so the two frames can be joined on the date.
for frame, date_col in ((df_news, 'date'), (df_price, 'Date')):
    frame[date_col] = pd.to_datetime(frame[date_col])

In [7]:
# Left join: keep every price row and attach sentiment where a news row
# exists for the same date; price rows without news get NaN sentiment.
df_combined = pd.merge(
    df_price,
    df_news,
    how="left",
    left_on='Date',
    right_on='date',
)

In [8]:
# The merge keeps both join keys; 'date' (from the news frame) duplicates
# 'Date', so remove it. Reassigning instead of mutating in place, together with
# errors="ignore", lets this cell be re-run without raising KeyError once the
# column is already gone.
df_combined = df_combined.drop(columns=['date'], errors="ignore")

In [9]:
# Peek at the last rows to see how the sentiment columns joined onto prices.
df_combined.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,1d_diff,5d_diff,10d_diff,...,SMA30,EMA3,EMA5,EMA9,EMA15,EMA30,Volume_100k,title_senti_comp,news_senti_comp,avg_senti_comp
1379,2022-07-11,2376.5,2428.800049,2370.0,2423.899902,2423.899902,6390604,3.449951,1.649902,3.5,...,2574.979997,2411.709763,2417.68741,2439.967735,2472.975022,2523.815943,63.906,,,
1380,2022-07-12,2404.0,2439.699951,2404.0,2420.449951,2420.449951,4974502,42.899902,-16.650146,-1.050049,...,2567.906665,2416.079857,2418.608257,2436.064178,2466.409388,2517.147169,49.745,,,
1381,2022-07-13,2427.300049,2434.0,2373.0,2377.550049,2377.550049,6564435,-19.599854,-125.449951,-41.649902,...,2559.375,2396.814953,2404.922187,2424.361352,2455.301971,2508.140903,65.644,0.0,-0.0772,-0.04
1382,2022-07-14,2388.0,2433.949951,2376.949951,2397.149902,2397.149902,7831798,-4.650146,-89.150146,-59.950195,...,2548.469995,2396.982428,2402.331426,2418.919062,2448.032962,2500.980193,78.318,0.0,0.4703,0.24
1383,2022-07-15,2415.0,2415.0,2383.100098,2401.800049,2401.800049,4431880,-20.449951,-101.300049,-107.649902,...,2535.879997,2399.391238,2402.1543,2415.49526,2442.253848,2494.581474,44.319,,,


In [10]:
# Show Close next to 1d_diff on the first rows to see how 1d_diff relates to
# consecutive closing prices.
df_combined[['Date','Close','1d_diff']].head()

Unnamed: 0,Date,Close,1d_diff
0,2016-12-13,515.018127,-9.757507
1,2016-12-14,524.775635,3.244202
2,2016-12-15,521.531433,-2.080261
3,2016-12-16,523.611694,-2.674622
4,2016-12-19,526.286316,2.303162


In [11]:
# Count missing values per column after the left merge.
df_combined.isna().sum()

Date                  0
Open                  0
High                  0
Low                   0
Close                 0
Adj Close             0
Volume                0
1d_diff               0
5d_diff               0
10d_diff              0
SMA3                  0
SMA5                  0
SMA9                  0
SMA15                 0
SMA30                 0
EMA3                  0
EMA5                  0
EMA9                  0
EMA15                 0
EMA30                 0
Volume_100k           0
title_senti_comp    846
news_senti_comp     846
avg_senti_comp      846
dtype: int64

In [12]:
# Price rows with no matching news row have NaN sentiment after the left merge.
# Impute those with 0.0 (presumably "neutral" on the compound-score scale;
# TODO confirm). The fill is restricted to the sentiment columns so that a
# missing price or indicator value is never silently turned into 0.
sentiment_cols = ['title_senti_comp', 'news_senti_comp', 'avg_senti_comp']
df_combined[sentiment_cols] = df_combined[sentiment_cols].fillna(0.0)

In [13]:
# Build the regression target. In the rows displayed in this notebook,
# 1d_diff equals today's Close minus the following row's Close, so
# subtracting it from Close yields the next trading day's close.
# NOTE(review): this relies on how 1d_diff was computed upstream; confirm.
close_today = df_combined['Close']
one_day_change = df_combined['1d_diff']
df_combined['next_day_close'] = close_today - one_day_change

In [14]:
# Sanity-check the derived next_day_close against Close and 1d_diff on the
# last rows.
df_combined[['Date','Close','next_day_close','1d_diff']].tail()

Unnamed: 0,Date,Close,next_day_close,1d_diff
1379,2022-07-11,2423.899902,2420.449951,3.449951
1380,2022-07-12,2420.449951,2377.550049,42.899902
1381,2022-07-13,2377.550049,2397.149902,-19.599854
1382,2022-07-14,2397.149902,2401.800049,-4.650146
1383,2022-07-15,2401.800049,2422.25,-20.449951


## Model creation

## 1-day-ahead prediction (next-day close)

In [15]:
# Input columns for the 1-day model: traded volume (in units of 100k, judging
# by the column name and displayed values), the two shortest EMAs, and the
# averaged sentiment score.
x_features_1d = ['Volume_100k','EMA3','EMA5','avg_senti_comp']

In [16]:
# Confirm the selected feature columns are numeric and have no missing values.
df_combined[x_features_1d].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1384 entries, 0 to 1383
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Volume_100k     1384 non-null   float64
 1   EMA3            1384 non-null   float64
 2   EMA5            1384 non-null   float64
 3   avg_senti_comp  1384 non-null   float64
dtypes: float64(4)
memory usage: 54.1 KB


In [17]:
# Separate the model inputs from the regression target.
target_col = 'next_day_close'
X1d = df_combined.loc[:, x_features_1d]
y1d = df_combined.loc[:, target_col]

In [19]:
## train test split
# Ordered (non-shuffled) split: with shuffle=False the first 70% of rows become
# the training set and the remaining 30% the test set, so the model is
# evaluated on the later rows (the frame appears to be sorted by date).
# random_state is omitted because it has no effect when shuffle=False.
X_train, X_test, y_train, y_test = train_test_split(
    X1d,
    y1d,
    train_size=0.7,
    shuffle=False,
)

In [20]:
# Shapes of the train/test feature matrices and target vectors.
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((968, 4), (416, 4), (968,), (416,))

## Defining transformations

In [22]:
# Standardising transformer; used as the first step of the model pipeline.
scaler = StandardScaler()

In [23]:
# Standardise the features, then fit a linear model by stochastic gradient
# descent. SGDRegressor shuffles the training data each epoch by default, so a
# fixed random_state is required for the coefficients and metrics below to be
# reproducible across kernel restarts.
lreg_v1 = Pipeline(steps=[('scaler', scaler),
                          ('regressor', SGDRegressor(max_iter=100,
                                                     eta0=0.01,
                                                     random_state=80))])

In [24]:
# Fit the scaler and the regressor on the training split only.
lreg_v1.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('regressor', SGDRegressor(max_iter=100))])

In [25]:
# Fitted intercept of the regressor step (learned on standardised features).
lreg_v1['regressor'].intercept_

array([1169.62116002])

In [26]:
# Fitted coefficients of the regressor step, one per input feature.
lreg_v1['regressor'].coef_

array([  4.50464322, 208.61240288, 204.04474956,   0.63259265])

In [27]:
# Pair each feature name with its fitted coefficient, rounded to 2 decimals.
{
    feature: weight
    for feature, weight in zip(x_features_1d,
                               np.round(lreg_v1['regressor'].coef_, 2))
}

{'Volume_100k': 4.5, 'EMA3': 208.61, 'EMA5': 204.04, 'avg_senti_comp': 0.63}

In [28]:
# Predict the target for the held-out test rows.
y_pred = lreg_v1.predict(X_test)

In [29]:
# Collect actuals, predictions and their difference in one frame.
# Sign convention here is predicted - actual, so a positive value means the
# model over-predicted.
prediction_error = y_pred - y_test
y_df = pd.DataFrame(
    {
        "actual": y_test,
        "predicted": y_pred,
        "residual": prediction_error,
    }
)

In [30]:
# Inspect the first ten test rows: actual vs. predicted and their difference.
y_df.head(10)

Unnamed: 0,actual,predicted,residual
968,1996.400024,2006.178308,9.778283
969,2002.300049,2006.501019,4.20097
970,1993.25,1995.522154,2.272154
971,1987.199951,2004.063128,16.863177
972,1973.150024,1994.243181,21.093157
973,1899.5,1985.224156,85.724156
974,1950.699951,1958.114312,7.414361
975,1964.050049,1955.502045,-8.548004
976,1947.800049,1959.21928,11.419231
977,1952.599976,1954.39431,1.794335


In [31]:
# Mean squared error of the predictions on the test split.
mse_v1 = mean_squared_error(y_test, y_pred)

In [32]:
# Display the test-set MSE.
mse_v1

2483.545937513821

In [33]:
# Root mean squared error: same units as the target, easier to interpret.
rmse_v1 = np.sqrt(mse_v1)

In [34]:
# Display the test-set RMSE.
rmse_v1

49.83518774434206

In [35]:
# Coefficient of determination (R^2) of the predictions on the test split.
r2_score(y_test, y_pred)

0.9620005130275666