Stock Market Prediction using Numerical and Textual Analysis

Create a hybrid model for stock price or performance prediction using numerical analysis of historical stock prices, and sentimental analysis of news headlines.

In [3]:
# Importing the important libraries

import os 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use("fivethirtyeight")

import warnings
warnings.filterwarnings('ignore')

import nltk
import re
from nltk.corpus import stopwords
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.preprocessing import MinMaxScaler
from nltk.stem.porter import PorterStemmer

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost

In [5]:
#Importing the BSESN Dataset

df_stocks = pd.read_csv('./^BSESN.csv')
df_stocks

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2019-12-17,41052.359375,41401.648438,41005.179688,41352.171875,41352.171875,19000.0
1,2019-12-18,41442.750000,41614.769531,41358.468750,41558.570313,41558.570313,24300.0
2,2019-12-19,41571.820313,41719.289063,41456.398438,41673.921875,41673.921875,33300.0
3,2019-12-20,41746.199219,41809.960938,41636.109375,41681.539063,41681.539063,33600.0
4,2019-12-23,41548.261719,41701.621094,41474.609375,41642.660156,41642.660156,6200.0
...,...,...,...,...,...,...,...
247,2020-12-11,46060.320313,46309.628906,45706.218750,46099.011719,46099.011719,26300.0
248,2020-12-14,46284.699219,46373.339844,45951.531250,46253.460938,46253.460938,18400.0
249,2020-12-15,46287.390625,46350.300781,45841.671875,46263.171875,46263.171875,16400.0
250,2020-12-16,46573.308594,46704.968750,46402.199219,46666.460938,46666.460938,13100.0


In [6]:
#Importing the News Headlines Dataset

df_news = pd.read_csv('./india-news-headlines.csv', names = ['Date', 'Category', 'News'])
df_news = df_news.dropna(axis = 0, how = 'any')
df_news

Unnamed: 0,Date,Category,News
0,publish_date,headline_category,headline_text
1,20010102,unknown,Status quo will not be disturbed at Ayodhya; s...
2,20010102,unknown,Fissures in Hurriyat over Pak visit
3,20010102,unknown,America's unwanted heading for India?
4,20010102,unknown,For bigwigs; it is destination Goa
...,...,...,...
3650966,20220331,city.srinagar,J&K sacks 2 cops; 3 other employees over terro...
3650967,20220331,entertainment.hindi.bollywood,Ranbir Kapoor says 'Rishi Kapoor enjoyed his a...
3650968,20220331,city.trichy,As Covid-19 cases drop to nil in southern dist...
3650969,20220331,city.erode,Tamil Nadu sees marginal rise of Covid cases w...


In [7]:
#Cleaning and Pre-preocessing

df_news.drop(0, inplace = True)
df_news.drop('Category', axis = 1, inplace = True)
df_news

Unnamed: 0,Date,News
1,20010102,Status quo will not be disturbed at Ayodhya; s...
2,20010102,Fissures in Hurriyat over Pak visit
3,20010102,America's unwanted heading for India?
4,20010102,For bigwigs; it is destination Goa
5,20010102,Extra buses to clear tourist traffic
...,...,...
3650966,20220331,J&K sacks 2 cops; 3 other employees over terro...
3650967,20220331,Ranbir Kapoor says 'Rishi Kapoor enjoyed his a...
3650968,20220331,As Covid-19 cases drop to nil in southern dist...
3650969,20220331,Tamil Nadu sees marginal rise of Covid cases w...


In [8]:
#Converting the date columns to type datetime

df_news['Date'] = pd.to_datetime(df_news['Date'], format = '%Y%m%d')
df_stocks['Date'] = pd.to_datetime(df_stocks['Date'])
df_stocks

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2019-12-17,41052.359375,41401.648438,41005.179688,41352.171875,41352.171875,19000.0
1,2019-12-18,41442.750000,41614.769531,41358.468750,41558.570313,41558.570313,24300.0
2,2019-12-19,41571.820313,41719.289063,41456.398438,41673.921875,41673.921875,33300.0
3,2019-12-20,41746.199219,41809.960938,41636.109375,41681.539063,41681.539063,33600.0
4,2019-12-23,41548.261719,41701.621094,41474.609375,41642.660156,41642.660156,6200.0
...,...,...,...,...,...,...,...
247,2020-12-11,46060.320313,46309.628906,45706.218750,46099.011719,46099.011719,26300.0
248,2020-12-14,46284.699219,46373.339844,45951.531250,46253.460938,46253.460938,18400.0
249,2020-12-15,46287.390625,46350.300781,45841.671875,46263.171875,46263.171875,16400.0
250,2020-12-16,46573.308594,46704.968750,46402.199219,46666.460938,46666.460938,13100.0


In [11]:
df_news['News'] = df_news.groupby(['Date']).transform(lambda x : ' '.join(x))
df_news = df_news.drop_duplicates()
df_news.reset_index(inplace = True, drop = True)
df_news

Unnamed: 0,Date,News
0,2001-01-02,Status quo will not be disturbed at Ayodhya; s...
1,2001-01-03,Powerless north India gropes in the dark Think...
2,2001-01-04,The string that pulled Stephen Hawking to Indi...
3,2001-01-05,Light combat craft takes India into club class...
4,2001-01-06,Light combat craft takes India into club class...
...,...,...
7712,2022-03-27,Playing a dead man in Body God meant sitting i...
7713,2022-03-28,As fashion becomes democratic; demand for styl...
7714,2022-03-29,Bobby Deol: Like my dad; I hope to be working ...
7715,2022-03-30,I will not give in to trends: Dossmode The tim...


In [None]:
ps = PorterStemmer()

corpus = []

for i in range(0, len(df_news['News'])):
    news = re.sub('[^a-zA-Z]',' ',df_news['News'][i])
    news = news.lower()
    news = news.split()
    news = [ps.stem(word) for word in news if not word in set(stopwords.words('english'))]
    print(i)
    news = ' '.join(news)
    corpus.append(news)

In [13]:
#Subjectivity and Polarity

def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

def getPolarity(text):
    return TextBlob(text).sentiment.polarity

In [14]:
df_news['Subjectivity'] = df_news['News'].apply(getSubjectivity)
df_news['Polarity'] = df_news['News'].apply(getPolarity)
df_news

Unnamed: 0,Date,News,Subjectivity,Polarity
0,2001-01-02,Status quo will not be disturbed at Ayodhya; s...,0.282333,0.151333
1,2001-01-03,Powerless north India gropes in the dark Think...,0.407692,0.088462
2,2001-01-04,The string that pulled Stephen Hawking to Indi...,0.446847,0.087961
3,2001-01-05,Light combat craft takes India into club class...,0.476612,0.262024
4,2001-01-06,Light combat craft takes India into club class...,0.439394,0.248485
...,...,...,...,...
7712,2022-03-27,Playing a dead man in Body God meant sitting i...,0.369592,0.025109
7713,2022-03-28,As fashion becomes democratic; demand for styl...,0.389939,0.055878
7714,2022-03-29,Bobby Deol: Like my dad; I hope to be working ...,0.404240,0.095198
7715,2022-03-30,I will not give in to trends: Dossmode The tim...,0.350965,0.037729


In [16]:
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()
df_news['Compound'] = [sia.polarity_scores(v)['compound'] for v in df_news['News']]
df_news['Negative'] = [sia.polarity_scores(v)['neg'] for v in df_news['News']]
df_news['Neutral'] = [sia.polarity_scores(v)['neu'] for v in df_news['News']]
df_news['Positive'] = [sia.polarity_scores(v)['pos'] for v in df_news['News']]
df_news

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\farza\AppData\Roaming\nltk_data...


Unnamed: 0,Date,News,Subjectivity,Polarity,Compound,Negative,Neutral,Positive
0,2001-01-02,Status quo will not be disturbed at Ayodhya; s...,0.282333,0.151333,-0.9811,0.122,0.807,0.071
1,2001-01-03,Powerless north India gropes in the dark Think...,0.407692,0.088462,-0.2640,0.122,0.766,0.111
2,2001-01-04,The string that pulled Stephen Hawking to Indi...,0.446847,0.087961,0.8738,0.100,0.797,0.103
3,2001-01-05,Light combat craft takes India into club class...,0.476612,0.262024,0.9769,0.124,0.719,0.157
4,2001-01-06,Light combat craft takes India into club class...,0.439394,0.248485,-0.4215,0.152,0.704,0.143
...,...,...,...,...,...,...,...,...
7712,2022-03-27,Playing a dead man in Body God meant sitting i...,0.369592,0.025109,-0.9999,0.154,0.764,0.082
7713,2022-03-28,As fashion becomes democratic; demand for styl...,0.389939,0.055878,-0.9999,0.158,0.742,0.100
7714,2022-03-29,Bobby Deol: Like my dad; I hope to be working ...,0.404240,0.095198,-0.9999,0.152,0.766,0.082
7715,2022-03-30,I will not give in to trends: Dossmode The tim...,0.350965,0.037729,-0.9999,0.151,0.775,0.074


In [18]:
#Merging the updated News dataframe with the Stocks dataframe

df_merge = pd.merge(df_stocks, df_news, how='inner', on='Date')
df_merge

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,News,Subjectivity,Polarity,Compound,Negative,Neutral,Positive
0,2019-12-17,41052.359375,41401.648438,41005.179688,41352.171875,41352.171875,19000.0,After Jagannath Temple; Puri beach devpt plan ...,0.361641,0.035806,-1.0000,0.195,0.728,0.076
1,2019-12-18,41442.750000,41614.769531,41358.468750,41558.570313,41558.570313,24300.0,What is Citizenship (Amendment) Bill 2019: All...,0.399387,0.037550,-0.9999,0.159,0.761,0.080
2,2019-12-19,41571.820313,41719.289063,41456.398438,41673.921875,41673.921875,33300.0,Only 61% of CCA covered after revamp of canals...,0.375940,0.028690,-0.9998,0.129,0.804,0.067
3,2019-12-20,41746.199219,41809.960938,41636.109375,41681.539063,41681.539063,33600.0,I'm done playing the good girl; I want to be a...,0.374114,0.041297,-0.9999,0.168,0.746,0.086
4,2019-12-23,41548.261719,41701.621094,41474.609375,41642.660156,41642.660156,6200.0,Weekly Horoscope; December 22-28: Check predic...,0.404214,0.032329,-0.9999,0.153,0.760,0.088
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,2020-12-11,46060.320313,46309.628906,45706.218750,46099.011719,46099.011719,26300.0,Ways to get you in the mood for sex when you'r...,0.383091,0.054693,-0.9999,0.162,0.751,0.088
248,2020-12-14,46284.699219,46373.339844,45951.531250,46253.460938,46253.460938,18400.0,No threat from Covid claims to balance sheets ...,0.386875,0.072025,-0.9996,0.125,0.783,0.092
249,2020-12-15,46287.390625,46350.300781,45841.671875,46263.171875,46263.171875,16400.0,1;147 RT-PCR tests in a day; only 3 found +ve ...,0.376347,0.041861,-0.9999,0.141,0.775,0.083
250,2020-12-16,46573.308594,46704.968750,46402.199219,46666.460938,46666.460938,13100.0,Asmita Sood's favourite holiday destinations a...,0.429164,0.066554,-0.9997,0.120,0.800,0.081


In [19]:
#Creating merged Dataset for Model training

df_merge1 = df_merge[['Close', 'Subjectivity', 'Polarity', 'Compound', 'Negative', 'Neutral', 'Positive']]
df_merge1

Unnamed: 0,Close,Subjectivity,Polarity,Compound,Negative,Neutral,Positive
0,41352.171875,0.361641,0.035806,-1.0000,0.195,0.728,0.076
1,41558.570313,0.399387,0.037550,-0.9999,0.159,0.761,0.080
2,41673.921875,0.375940,0.028690,-0.9998,0.129,0.804,0.067
3,41681.539063,0.374114,0.041297,-0.9999,0.168,0.746,0.086
4,41642.660156,0.404214,0.032329,-0.9999,0.153,0.760,0.088
...,...,...,...,...,...,...,...
247,46099.011719,0.383091,0.054693,-0.9999,0.162,0.751,0.088
248,46253.460938,0.386875,0.072025,-0.9996,0.125,0.783,0.092
249,46263.171875,0.376347,0.041861,-0.9999,0.141,0.775,0.083
250,46666.460938,0.429164,0.066554,-0.9997,0.120,0.800,0.081


In [20]:
#Normalizing Data

scaler = MinMaxScaler()
df = pd.DataFrame(scaler.fit_transform(df_merge1))
df.columns = df_merge1.columns
df.index = df_merge1.index
df.head()

Unnamed: 0,Close,Subjectivity,Polarity,Compound,Negative,Neutral,Positive
0,0.735131,0.179412,0.282922,0.0,0.857143,0.088889,0.2875
1,0.745002,0.486673,0.298534,5e-05,0.571429,0.333333,0.3375
2,0.750519,0.295814,0.219221,0.0001,0.333333,0.651852,0.175
3,0.750884,0.280945,0.332077,5e-05,0.642857,0.222222,0.4125
4,0.749024,0.525962,0.251798,5e-05,0.52381,0.325926,0.4375


In [28]:
df.fillna(method = 'bfill')

Unnamed: 0,Close,Subjectivity,Polarity,Compound,Negative,Neutral,Positive
0,0.735131,0.179412,0.282922,0.000000,0.857143,0.088889,0.2875
1,0.745002,0.486673,0.298534,0.000050,0.571429,0.333333,0.3375
2,0.750519,0.295814,0.219221,0.000100,0.333333,0.651852,0.1750
3,0.750884,0.280945,0.332077,0.000050,0.642857,0.222222,0.4125
4,0.749024,0.525962,0.251798,0.000050,0.523810,0.325926,0.4375
...,...,...,...,...,...,...,...
247,0.962154,0.354017,0.452002,0.000050,0.595238,0.259259,0.4375
248,0.969541,0.384819,0.607154,0.000201,0.301587,0.496296,0.4875
249,0.970005,0.299127,0.337124,0.000050,0.428571,0.437037,0.3750
250,0.989293,0.729058,0.558175,0.000151,0.261905,0.622222,0.3500


In [29]:
X = df.drop('Close', axis=1)
X

Unnamed: 0,Subjectivity,Polarity,Compound,Negative,Neutral,Positive
0,0.179412,0.282922,0.000000,0.857143,0.088889,0.2875
1,0.486673,0.298534,0.000050,0.571429,0.333333,0.3375
2,0.295814,0.219221,0.000100,0.333333,0.651852,0.1750
3,0.280945,0.332077,0.000050,0.642857,0.222222,0.4125
4,0.525962,0.251798,0.000050,0.523810,0.325926,0.4375
...,...,...,...,...,...,...
247,0.354017,0.452002,0.000050,0.595238,0.259259,0.4375
248,0.384819,0.607154,0.000201,0.301587,0.496296,0.4875
249,0.299127,0.337124,0.000050,0.428571,0.437037,0.3750
250,0.729058,0.558175,0.000151,0.261905,0.622222,0.3500


In [30]:
X.isnull()

Unnamed: 0,Subjectivity,Polarity,Compound,Negative,Neutral,Positive
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
...,...,...,...,...,...,...
247,False,False,False,False,False,False
248,False,False,False,False,False,False
249,False,False,False,False,False,False
250,False,False,False,False,False,False


In [31]:
y = df['Close']
y

0      0.735131
1      0.745002
2      0.750519
3      0.750884
4      0.749024
         ...   
247    0.962154
248    0.969541
249    0.970005
250    0.989293
251    1.000000
Name: Close, Length: 252, dtype: float64

In [32]:
y.isnull()

0      False
1      False
2      False
3      False
4      False
       ...  
247    False
248    False
249    False
250    False
251    False
Name: Close, Length: 252, dtype: bool

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
X_train.shape

(201, 6)

In [42]:
#Cleaning the data off NaN values

def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return df[indices_to_keep].astype(np.float64)

clean_dataset(df)

Unnamed: 0,Close,Subjectivity,Polarity,Compound,Negative,Neutral,Positive
0,0.735131,0.179412,0.282922,0.000000,0.857143,0.088889,0.2875
1,0.745002,0.486673,0.298534,0.000050,0.571429,0.333333,0.3375
2,0.750519,0.295814,0.219221,0.000100,0.333333,0.651852,0.1750
3,0.750884,0.280945,0.332077,0.000050,0.642857,0.222222,0.4125
4,0.749024,0.525962,0.251798,0.000050,0.523810,0.325926,0.4375
...,...,...,...,...,...,...,...
247,0.962154,0.354017,0.452002,0.000050,0.595238,0.259259,0.4375
248,0.969541,0.384819,0.607154,0.000201,0.301587,0.496296,0.4875
249,0.970005,0.299127,0.337124,0.000050,0.428571,0.437037,0.3750
250,0.989293,0.729058,0.558175,0.000151,0.261905,0.622222,0.3500


In [43]:
X = df.drop('Close', axis=1)
X

Unnamed: 0,Subjectivity,Polarity,Compound,Negative,Neutral,Positive
0,0.179412,0.282922,0.000000,0.857143,0.088889,0.2875
1,0.486673,0.298534,0.000050,0.571429,0.333333,0.3375
2,0.295814,0.219221,0.000100,0.333333,0.651852,0.1750
3,0.280945,0.332077,0.000050,0.642857,0.222222,0.4125
4,0.525962,0.251798,0.000050,0.523810,0.325926,0.4375
...,...,...,...,...,...,...
247,0.354017,0.452002,0.000050,0.595238,0.259259,0.4375
248,0.384819,0.607154,0.000201,0.301587,0.496296,0.4875
249,0.299127,0.337124,0.000050,0.428571,0.437037,0.3750
250,0.729058,0.558175,0.000151,0.261905,0.622222,0.3500


In [44]:
y = df['Close']
y

0      0.735131
1      0.745002
2      0.750519
3      0.750884
4      0.749024
         ...   
247    0.962154
248    0.969541
249    0.970005
250    0.989293
251    1.000000
Name: Close, Length: 250, dtype: float64

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)


In [46]:
# Testing different models for accuracy

rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
rfr_pred = rfr.predict(X_test)

In [48]:
print(rfr_pred[:10])
print(y_test[:10])
print('Mean Squared error: ', mean_squared_error(rfr_pred, y_test))

[0.53238047 0.54173419 0.65229868 0.6640895  0.4110694  0.45096582
 0.55703745 0.47701912 0.42813777 0.53300502]
226    0.842333
123    0.393456
93     0.272824
158    0.576027
155    0.524095
162    0.592488
199    0.621378
84     0.258180
64     0.110334
156    0.559884
Name: Close, dtype: float64
Mean Squared error:  0.03462093817179341


In [51]:
dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)
dtr_pred = dtr.predict(X_test)

In [52]:
print(dtr_pred[:10])
print(y_test[:10])
print('Mean Squared error: ', mean_squared_error(dtr_pred, y_test))

[0.74535291 0.50246262 0.51141749 0.62806284 0.18338186 0.7337747
 0.50563446 0.83102341 0.13813457 0.28130527]
226    0.842333
123    0.393456
93     0.272824
158    0.576027
155    0.524095
162    0.592488
199    0.621378
84     0.258180
64     0.110334
156    0.559884
Name: Close, dtype: float64
Mean Squared error:  0.07674735136541805


In [53]:
xgb = xgboost.XGBRegressor()
xgb.fit(X_train, y_train)

In [54]:
xgb_pred = xgb.predict(X_test)

In [55]:
print(xgb_pred[:10])
print(y_test[:10])
print('Mean Squared error: ', mean_squared_error(xgb_pred, y_test))

[0.6503968  0.41167367 0.60225475 0.7534987  0.42742026 0.32416967
 0.5275839  0.42354864 0.450107   0.41419423]
226    0.842333
123    0.393456
93     0.272824
158    0.576027
155    0.524095
162    0.592488
199    0.621378
84     0.258180
64     0.110334
156    0.559884
Name: Close, dtype: float64
Mean Squared error:  0.03969347995704404
