# Stock Market Prediction using Numerical and Textual Analysis

 

## Submitted By : Gaurav Jain




In [None]:
'''     Objective : Create a hybrid model for stock price/performance prediction using numerical analysis 
        of historical stock prices, and sentimental analysis of news headlines

                                       Grips@ Sparks Foundation'''

#### IMPORTING LIBRARIES

In [2]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import cufflinks as cf
import chart_studio.plotly as py
import plotly.graph_objects as go
import os
%matplotlib inline

# Make Plotly work in your Jupyter Notebook
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
# Use Plotly locally
cf.go_offline()

In [2]:
import flair
# Flair is a powerful library to conduct Sentimental analysis using Pytorch.

#### Cleaning Textual Data

In [3]:
df_news = pd.read_csv('C:\\Users\\gnaha\\Documents\\Jupyter Notebooks\\Datasets\\india-news-headlines.csv')
df_news.head()
# importing news dataset 


Unnamed: 0,publish_date,headline_category,headline_text
0,20010102,unknown,Status quo will not be disturbed at Ayodhya; says Vajpayee
1,20010102,unknown,Fissures in Hurriyat over Pak visit
2,20010102,unknown,America's unwanted heading for India?
3,20010102,unknown,For bigwigs; it is destination Goa
4,20010102,unknown,Extra buses to clear tourist traffic


In [17]:
infosys_news_df = df.loc[df['headline_text'].str.contains('Infosys', case = False)]
infosys_news_df

# Extracting all the news which containg infosys.
# This dataset will be used to conduct Sentimental Analysis. 

Unnamed: 0,publish_date,headline_category,headline_text
3060,20010407,unknown,Infosys likely to meet the target
5594,20010620,city.lucknow,Infosys chief showers funds on IIT; Kanpur
6672,20010711,city.bengaluru,Despite slowdown; Infosys comes out tops
6676,20010711,business.india-business,Infosys sees Rs 800 cr profit this fiscal; Q1 net soars 56%
6691,20010711,business.india-business,US slowdown takes its toll on Infosys
...,...,...,...
3342206,20200720,business.india-business,Infosys' Vanguard deal value pegged at $1.5 billion
3378751,20201002,city.mysuru,Infosys foundation donates Rs 20 lakh to Mysuru Zoo
3385548,20201015,business.india-business,Infosys beats TCS; Wipro in Q2 revenue growth; to give pay hikes from January
3410056,20201203,city.bengaluru,Bengaluru: IISc professor and five other achievers bag Infosys Prize


In [18]:

infosys_news_df['publish_date'] = pd.to_datetime(infosys_news_df['publish_date'], format= '%Y%m%d') 
infosys_news_df

# Converting publish_date into datetime type.

Unnamed: 0,publish_date,headline_category,headline_text
3060,2001-04-07,unknown,Infosys likely to meet the target
5594,2001-06-20,city.lucknow,Infosys chief showers funds on IIT; Kanpur
6672,2001-07-11,city.bengaluru,Despite slowdown; Infosys comes out tops
6676,2001-07-11,business.india-business,Infosys sees Rs 800 cr profit this fiscal; Q1 net soars 56%
6691,2001-07-11,business.india-business,US slowdown takes its toll on Infosys
...,...,...,...
3342206,2020-07-20,business.india-business,Infosys' Vanguard deal value pegged at $1.5 billion
3378751,2020-10-02,city.mysuru,Infosys foundation donates Rs 20 lakh to Mysuru Zoo
3385548,2020-10-15,business.india-business,Infosys beats TCS; Wipro in Q2 revenue growth; to give pay hikes from January
3410056,2020-12-03,city.bengaluru,Bengaluru: IISc professor and five other achievers bag Infosys Prize


In [3]:
infosys_stock_df = pd.read_csv('C:\\Users\\gnaha\\Documents\\Jupyter Notebooks\\Datasets\\Infosys Stock data.csv')
infosys_stock_df
# Importing Infosys stock data 

In [5]:
infosys_stock_df['Date'] = pd.to_datetime(infosys_stock_df['Date']).dt.date
infosys_stock_df['Date'] = pd.to_datetime(infosys_stock_df['Date'])
infosys_stock_df
# Removing the time stamp from the Date coloumn
# Converting Date into Datetime type. 

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2001-04-02,63.47,65.14,58.70,64.26,630207
1,2001-04-03,63.38,65.28,61.97,63.66,450738
2,2001-04-04,61.72,61.72,59.84,61.16,324554
3,2001-04-06,61.72,66.06,61.72,62.95,500757
4,2001-04-09,61.72,62.73,60.19,62.04,248610
...,...,...,...,...,...,...
4878,2020-12-24,1249.90,1249.90,1226.00,1236.05,7313885
4879,2020-12-28,1238.45,1248.00,1236.00,1240.30,4607051
4880,2020-12-29,1235.00,1254.45,1235.00,1250.30,6878105
4881,2020-12-30,1253.00,1253.30,1238.15,1246.80,5194690


In [22]:
sentiment_model = flair.models.TextClassifier.load('en-sentiment')

# Making our Sentiment Model using Flair

2021-06-15 13:32:19,050 loading file C:\Users\gnaha\.flair\models\sentiment-en-mix-distillbert_4.pt


In [23]:
Sentiment = []
confidence = []

for sentence in infosys_news_df['headline_text']:
    sample = flair.data.Sentence(sentence)
    sentiment_model.predict(sample)
    
    Sentiment.append(sample.labels[0].value)
    confidence.append(sample.labels[0].score)
    
print(Sentiment)
print(confidence)

# Running the model on all the news containing infosys
# Sentiment bifurcate the news wether being Positve or Negative
# Confidence contain the score of how strong the sentiment is.

['POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'NEGATIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'NEGATIVE', 'NE

In [52]:
infosys_news_df['Sentiment'] = Sentiment
infosys_news_df['Confidence'] = confidence
infosys_news_df
# Adding two new coloumns Sentiment and Confidence in dataset

Unnamed: 0,Date,headline_category,headline_text,Sentiment,Confidence
3060,2001-04-07,unknown,Infosys likely to meet the target,POSITIVE,0.930539
5594,2001-06-20,city.lucknow,Infosys chief showers funds on IIT; Kanpur,POSITIVE,0.994436
6672,2001-07-11,city.bengaluru,Despite slowdown; Infosys comes out tops,POSITIVE,0.986148
6676,2001-07-11,business.india-business,Infosys sees Rs 800 cr profit this fiscal; Q1 net soars 56%,POSITIVE,0.884255
6691,2001-07-11,business.india-business,US slowdown takes its toll on Infosys,POSITIVE,0.595294
...,...,...,...,...,...
3342206,2020-07-20,business.india-business,Infosys' Vanguard deal value pegged at $1.5 billion,POSITIVE,0.691462
3378751,2020-10-02,city.mysuru,Infosys foundation donates Rs 20 lakh to Mysuru Zoo,POSITIVE,0.666270
3385548,2020-10-15,business.india-business,Infosys beats TCS; Wipro in Q2 revenue growth; to give pay hikes from January,NEGATIVE,0.762694
3410056,2020-12-03,city.bengaluru,Bengaluru: IISc professor and five other achievers bag Infosys Prize,POSITIVE,0.999195


In [46]:
# infosys_news_df.to_csv('infosys_news.csv', index=False)
# infosys_news_df = pd.read_csv('infosys_news.csv')
infosys_news_df['date'] = pd.to_datetime(infosys_news_df['publish_date'])
infosys_news_df.drop(columns='publish_date', inplace=True)
infosys_news_df.set_index('date', inplace=True)
infosys_news_df

# Renaming Pubish_date as Date and setting it as the index of the dataframe.

Unnamed: 0_level_0,headline_category,headline_text,Sentiment,Confidence
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2001-04-07,unknown,Infosys likely to meet the target,POSITIVE,0.930539
2001-06-20,city.lucknow,Infosys chief showers funds on IIT; Kanpur,POSITIVE,0.994436
2001-07-11,city.bengaluru,Despite slowdown; Infosys comes out tops,POSITIVE,0.986148
2001-07-11,business.india-business,Infosys sees Rs 800 cr profit this fiscal; Q1 net soars 56%,POSITIVE,0.884255
2001-07-11,business.india-business,US slowdown takes its toll on Infosys,POSITIVE,0.595294
...,...,...,...,...
2020-07-20,business.india-business,Infosys' Vanguard deal value pegged at $1.5 billion,POSITIVE,0.691462
2020-10-02,city.mysuru,Infosys foundation donates Rs 20 lakh to Mysuru Zoo,POSITIVE,0.666270
2020-10-15,business.india-business,Infosys beats TCS; Wipro in Q2 revenue growth; to give pay hikes from January,NEGATIVE,0.762694
2020-12-03,city.bengaluru,Bengaluru: IISc professor and five other achievers bag Infosys Prize,POSITIVE,0.999195


In [53]:
infosys_stock_df.set_index('Date', inplace=True)
infosys_stock_df

#Setting Date as the index of infosys stock information dataframe


Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2001-04-02,63.47,65.14,58.70,64.26,630207
2001-04-03,63.38,65.28,61.97,63.66,450738
2001-04-04,61.72,61.72,59.84,61.16,324554
2001-04-06,61.72,66.06,61.72,62.95,500757
2001-04-09,61.72,62.73,60.19,62.04,248610
...,...,...,...,...,...
2020-12-24,1249.90,1249.90,1226.00,1236.05,7313885
2020-12-28,1238.45,1248.00,1236.00,1240.30,4607051
2020-12-29,1235.00,1254.45,1235.00,1250.30,6878105
2020-12-30,1253.00,1253.30,1238.15,1246.80,5194690


In [63]:
# final_df1 = pd.merge(infosys_news_df, infosys_stock_df, left_index=True ,right_index=True, indicator='categorical')

In [69]:
final_df = infosys_stock_df.join(infosys_news_df, how='outer')
final_df['Sentiment'].isna().sum()
final_df.to_csv('merged data.csv')

# Merging both the dataset and saving it to csv file. 

In [3]:
merge_df = pd.read_csv('merged data.csv')
merge_df['Date'] = pd.to_datetime(merge_df['Date'], format= '%d-%m-%Y') 


# merge_df['Sentiment'].isna().sum()

# Looking at the NA values in Sentiment and Confidence.

In [4]:
merge_df = merge_df.set_index('Date')


In [5]:
merge_df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,headline_category,headline_text,Sentiment,Confidence
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2001-04-02,63.47,65.14,58.70,64.26,630207.0,,,,
2001-04-03,63.38,65.28,61.97,63.66,450738.0,,,,
2001-04-04,61.72,61.72,59.84,61.16,324554.0,,,,
2001-04-06,61.72,66.06,61.72,62.95,500757.0,,,,
2001-04-07,,,,,,unknown,Infosys likely to meet the target,POSITIVE,0.930539
...,...,...,...,...,...,...,...,...,...
2020-12-24,1249.90,1249.90,1226.00,1236.05,7313885.0,business.india-business,Infosys rallies on $3 billion Daimler deal; mcap crosses $70 billion,NEGATIVE,0.968163
2020-12-28,1238.45,1248.00,1236.00,1240.30,4607051.0,,,,
2020-12-29,1235.00,1254.45,1235.00,1250.30,6878105.0,,,,
2020-12-30,1253.00,1253.30,1238.15,1246.80,5194690.0,,,,


In [6]:
for i in range(0, merge_df.shape[0]-2):
    if np.isnan(merge_df.iloc[i,0]):
        if np.isnan(merge_df.iloc[i+2, 8]):
            merge_df.iloc[i+2,7] = merge_df.iloc[i,7]
            merge_df.iloc[i+2,6] = merge_df.iloc[i,6]
            merge_df.iloc[i+2,5] = merge_df.iloc[i,5]
            merge_df.iloc[i+2,8] = merge_df.iloc[i,8]
            merge_df.iloc[i,0] = 0
merge_df

# The dataset contains the news on dates when there is a holiday on Stock exchange. 
# Since the news cannot affect the share price movements on the days when stock excange is closed.
# These news impact the price movements on next day or next to next day when stock exchane opens
# I have moved those news two days ahead in this loop 



Unnamed: 0_level_0,Open,High,Low,Close,Volume,headline_category,headline_text,Sentiment,Confidence
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2001-04-02,63.47,65.14,58.70,64.26,630207.0,,,,
2001-04-03,63.38,65.28,61.97,63.66,450738.0,,,,
2001-04-04,61.72,61.72,59.84,61.16,324554.0,,,,
2001-04-06,61.72,66.06,61.72,62.95,500757.0,,,,
2001-04-07,0.00,,,,,unknown,Infosys likely to meet the target,POSITIVE,0.930539
...,...,...,...,...,...,...,...,...,...
2020-12-24,1249.90,1249.90,1226.00,1236.05,7313885.0,business.india-business,Infosys rallies on $3 billion Daimler deal; mcap crosses $70 billion,NEGATIVE,0.968163
2020-12-28,1238.45,1248.00,1236.00,1240.30,4607051.0,,,,
2020-12-29,1235.00,1254.45,1235.00,1250.30,6878105.0,,,,
2020-12-30,1253.00,1253.30,1238.15,1246.80,5194690.0,,,,


In [7]:
for i in range(0, merge_df.shape[0]-1):
    if np.isnan(merge_df.iloc[i,0]):
        if np.isnan(merge_df.iloc[i+1, 8]):
            merge_df.iloc[i+1,7] = merge_df.iloc[i,7]
            merge_df.iloc[i+1,6] = merge_df.iloc[i,6]
            merge_df.iloc[i+1,5] = merge_df.iloc[i,5]
            merge_df.iloc[i+1,8] = merge_df.iloc[i,8]
            merge_df.iloc[i,0] = 0
merge_df

# In this loop, I have moved the  remaining news one day ahead.

Unnamed: 0_level_0,Open,High,Low,Close,Volume,headline_category,headline_text,Sentiment,Confidence
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2001-04-02,63.47,65.14,58.70,64.26,630207.0,,,,
2001-04-03,63.38,65.28,61.97,63.66,450738.0,,,,
2001-04-04,61.72,61.72,59.84,61.16,324554.0,,,,
2001-04-06,61.72,66.06,61.72,62.95,500757.0,,,,
2001-04-07,0.00,,,,,unknown,Infosys likely to meet the target,POSITIVE,0.930539
...,...,...,...,...,...,...,...,...,...
2020-12-24,1249.90,1249.90,1226.00,1236.05,7313885.0,business.india-business,Infosys rallies on $3 billion Daimler deal; mcap crosses $70 billion,NEGATIVE,0.968163
2020-12-28,1238.45,1248.00,1236.00,1240.30,4607051.0,,,,
2020-12-29,1235.00,1254.45,1235.00,1250.30,6878105.0,,,,
2020-12-30,1253.00,1253.30,1238.15,1246.80,5194690.0,,,,


In [8]:
merge_df.loc[merge_df['Open'] == 0]
merge_df['Open'].isna().sum()
merge_df['Sentiment'].isna().sum()
data_saved = 3634-3388
data_saved

# Using this exercise, we saved the data for 246 days.

246

In [9]:
final_df = merge_df.fillna(value={'Sentiment' : '0', 'Confidence' : 0 })
df = final_df[final_df['High'].notna()]
df['High'].isna().sum()
df

# Removed all those days where the Stock exchange was closed and the 
# news cannot be moved ahead on those days. 

Unnamed: 0_level_0,Open,High,Low,Close,Volume,headline_category,headline_text,Sentiment,Confidence
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2001-04-02,63.47,65.14,58.70,64.26,630207.0,,,0,0.000000
2001-04-03,63.38,65.28,61.97,63.66,450738.0,,,0,0.000000
2001-04-04,61.72,61.72,59.84,61.16,324554.0,,,0,0.000000
2001-04-06,61.72,66.06,61.72,62.95,500757.0,,,0,0.000000
2001-04-09,61.72,62.73,60.19,62.04,248610.0,,,0,0.000000
...,...,...,...,...,...,...,...,...,...
2020-12-24,1249.90,1249.90,1226.00,1236.05,7313885.0,business.india-business,Infosys rallies on $3 billion Daimler deal; mcap crosses $70 billion,NEGATIVE,0.968163
2020-12-28,1238.45,1248.00,1236.00,1240.30,4607051.0,,,0,0.000000
2020-12-29,1235.00,1254.45,1235.00,1250.30,6878105.0,,,0,0.000000
2020-12-30,1253.00,1253.30,1238.15,1246.80,5194690.0,,,0,0.000000


In [10]:
df['per_change_day'] = ((df['Close'] - df['Open'])/df['Open'])*100


df['per_change_day']= df['per_change_day'].round(decimals=2)
df

# Making a new column of percent change in price of stock on a particular day.

Unnamed: 0_level_0,Open,High,Low,Close,Volume,headline_category,headline_text,Sentiment,Confidence,per_change_day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2001-04-02,63.47,65.14,58.70,64.26,630207.0,,,0,0.000000,1.24
2001-04-03,63.38,65.28,61.97,63.66,450738.0,,,0,0.000000,0.44
2001-04-04,61.72,61.72,59.84,61.16,324554.0,,,0,0.000000,-0.91
2001-04-06,61.72,66.06,61.72,62.95,500757.0,,,0,0.000000,1.99
2001-04-09,61.72,62.73,60.19,62.04,248610.0,,,0,0.000000,0.52
...,...,...,...,...,...,...,...,...,...,...
2020-12-24,1249.90,1249.90,1226.00,1236.05,7313885.0,business.india-business,Infosys rallies on $3 billion Daimler deal; mcap crosses $70 billion,NEGATIVE,0.968163,-1.11
2020-12-28,1238.45,1248.00,1236.00,1240.30,4607051.0,,,0,0.000000,0.15
2020-12-29,1235.00,1254.45,1235.00,1250.30,6878105.0,,,0,0.000000,1.24
2020-12-30,1253.00,1253.30,1238.15,1246.80,5194690.0,,,0,0.000000,-0.49


# Training ML Model using Regression 

#### Importing Libraries

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, GridSearchCV 
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
# from sklearn.discriminant_analysis import LinearClassifierMixin, LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.svm import SVR

#### Preprocessing Data

In [None]:
''' Calculating Simple Moving Average of closing price from last 5 days which will be used to 
    train our Model. 

    Per_change_day is lagged for a one period, It is assumed that the percent change of last day affects the 
    share price movement of next day. ''' 




In [12]:

df['SMA_5_Close'] = df.iloc[:,3].rolling(window = 5).mean()
df['SMA_5_Volume'] = df.iloc[:,4].rolling(window=5).mean()
df['per_change_day'] = df['per_change_day'].shift(1)
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,headline_category,headline_text,Sentiment,Confidence,per_change_day,SMA_5_Close,SMA_5_Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2001-04-02,63.47,65.14,58.70,64.26,630207.0,,,0,0.000000,,,
2001-04-03,63.38,65.28,61.97,63.66,450738.0,,,0,0.000000,1.24,,
2001-04-04,61.72,61.72,59.84,61.16,324554.0,,,0,0.000000,0.44,,
2001-04-06,61.72,66.06,61.72,62.95,500757.0,,,0,0.000000,-0.91,,
2001-04-09,61.72,62.73,60.19,62.04,248610.0,,,0,0.000000,1.99,62.814,430973.2
...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-24,1249.90,1249.90,1226.00,1236.05,7313885.0,business.india-business,Infosys rallies on $3 billion Daimler deal; mcap crosses $70 billion,NEGATIVE,0.968163,1.22,1215.340,12189622.0
2020-12-28,1238.45,1248.00,1236.00,1240.30,4607051.0,,,0,0.000000,-1.11,1225.440,9911897.0
2020-12-29,1235.00,1254.45,1235.00,1250.30,6878105.0,,,0,0.000000,0.15,1240.040,9115847.6
2020-12-30,1253.00,1253.30,1238.15,1246.80,5194690.0,,,0,0.000000,1.24,1245.300,7974415.4


In [13]:
df['SMA_5_Close'] = df['SMA_5_Close'].shift(1)
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,headline_category,headline_text,Sentiment,Confidence,per_change_day,SMA_5_Close,SMA_5_Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2001-04-02,63.47,65.14,58.70,64.26,630207.0,,,0,0.000000,,,
2001-04-03,63.38,65.28,61.97,63.66,450738.0,,,0,0.000000,1.24,,
2001-04-04,61.72,61.72,59.84,61.16,324554.0,,,0,0.000000,0.44,,
2001-04-06,61.72,66.06,61.72,62.95,500757.0,,,0,0.000000,-0.91,,
2001-04-09,61.72,62.73,60.19,62.04,248610.0,,,0,0.000000,1.99,,430973.2
...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-24,1249.90,1249.90,1226.00,1236.05,7313885.0,business.india-business,Infosys rallies on $3 billion Daimler deal; mcap crosses $70 billion,NEGATIVE,0.968163,1.22,1199.97,12189622.0
2020-12-28,1238.45,1248.00,1236.00,1240.30,4607051.0,,,0,0.000000,-1.11,1215.34,9911897.0
2020-12-29,1235.00,1254.45,1235.00,1250.30,6878105.0,,,0,0.000000,0.15,1225.44,9115847.6
2020-12-30,1253.00,1253.30,1238.15,1246.80,5194690.0,,,0,0.000000,1.24,1240.04,7974415.4


In [14]:
df['SMA_5_Volume'] = df['SMA_5_Volume'].shift(1)
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,headline_category,headline_text,Sentiment,Confidence,per_change_day,SMA_5_Close,SMA_5_Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2001-04-02,63.47,65.14,58.70,64.26,630207.0,,,0,0.000000,,,
2001-04-03,63.38,65.28,61.97,63.66,450738.0,,,0,0.000000,1.24,,
2001-04-04,61.72,61.72,59.84,61.16,324554.0,,,0,0.000000,0.44,,
2001-04-06,61.72,66.06,61.72,62.95,500757.0,,,0,0.000000,-0.91,,
2001-04-09,61.72,62.73,60.19,62.04,248610.0,,,0,0.000000,1.99,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-24,1249.90,1249.90,1226.00,1236.05,7313885.0,business.india-business,Infosys rallies on $3 billion Daimler deal; mcap crosses $70 billion,NEGATIVE,0.968163,1.22,1199.97,12058475.6
2020-12-28,1238.45,1248.00,1236.00,1240.30,4607051.0,,,0,0.000000,-1.11,1215.34,12189622.0
2020-12-29,1235.00,1254.45,1235.00,1250.30,6878105.0,,,0,0.000000,0.15,1225.44,9911897.0
2020-12-30,1253.00,1253.30,1238.15,1246.80,5194690.0,,,0,0.000000,1.24,1240.04,9115847.6


In [15]:
df.shape

(6001, 12)

### Visualizing Data

In [16]:
fig = go.Figure()

fig.add_trace(
go.Line( x = list(df.index) , y= list(df.Close)))

# Set title
fig.update_layout(
    title_text=" Infosys Share price Movements with range slider and selectors"
)

# Add range slider
fig.update_layout(
    xaxis=dict(
        rangeselector=dict(
            buttons=list([
                dict(count=1,
                     label="1m",
                     step="month",
                     stepmode="backward"),
                dict(count=6,
                     label="6m",
                     step="month",
                     stepmode="backward"),
                dict(count=1,
                     label="YTD",
                     step="year",
                     stepmode="todate"),
                dict(count=1,
                     label="1y",
                     step="year",
                     stepmode="backward"),
                dict(step="all")
            ])
        ),
        rangeslider=dict(
            visible=True
        ),
        type="date"
    )
)

fig.show()

# px.line(data_frame=df, x= df.index, y=df.Close)


plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




In [None]:
''' Features were selected which we add in the model
    Dropped the NA values which were added due to SMA
    Dependent Variable was chosen as the closing price of the stock

In [17]:
features = ['SMA_5_Close', 'SMA_5_Volume', 'Sentiment', 'Confidence', 'per_change_day']
LE_col = ['Sentiment']

X= df[features]
X = X[X['SMA_5_Close'].notna()]
X
y= df.iloc[5:,3]
y






Date
2001-04-10      59.81
2001-04-11      50.24
2001-04-12      44.71
2001-04-16      47.84
2001-04-17      47.74
               ...   
2020-12-24    1236.05
2020-12-28    1240.30
2020-12-29    1250.30
2020-12-30    1246.80
2020-12-31    1255.80
Name: Close, Length: 5996, dtype: float64

In [18]:
for i in range(0, X.shape[0]):
    if X.Sentiment[i] == 'POSITIVE':
        X.Sentiment[i] = 1
    elif X.Sentiment[i] == 'NEGATIVE':
        X.Sentiment[i] = -1
    elif X.Sentiment[i] == '0':
        X.Sentiment[i] = 0
X

# Converted categorical variables into integers.


Unnamed: 0_level_0,SMA_5_Close,SMA_5_Volume,Sentiment,Confidence,per_change_day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2001-04-10,62.814,430973.2,1,0.930539,0.52
2001-04-11,61.924,415162.8,0,0.000000,-4.30
2001-04-12,59.240,453767.2,0,0.000000,-19.91
2001-04-16,55.950,638970.8,0,0.000000,-3.27
2001-04-17,52.928,675979.6,0,0.000000,8.38
...,...,...,...,...,...
2020-12-24,1199.970,12058475.6,-1,0.968163,1.22
2020-12-28,1215.340,12189622.0,0,0.000000,-1.11
2020-12-29,1225.440,9911897.0,0,0.000000,0.15
2020-12-30,1240.040,9115847.6,0,0.000000,1.24


In [19]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y,test_size=0.2, train_size=0.8, random_state=0)
X_train.isna().sum()
X_valid.isna().sum()
# X_valid

# Splitted the data in Training and Valid data

SMA_5_Close       0
SMA_5_Volume      0
Sentiment         0
Confidence        0
per_change_day    0
dtype: int64

#### Training Data

In [None]:
''' Using pipelines to make use of dfferent algorithms and finding 
    which is the best algorithim which fits our data best. '''



In [20]:
Pipeline_LinearReg = Pipeline(steps=[('scale', StandardScaler()), 
                                            ('model', LinearRegression(copy_X = True))])

Pipeline_RFR = Pipeline(steps=[('scale', StandardScaler()), 
                                            ('model', RandomForestRegressor(random_state=0))])

Pipeline_SGDR = Pipeline(steps=[('scale', StandardScaler()), 
                                            ('model', SGDRegressor(random_state=0) )])

Pipeline_SVR = Pipeline(steps=[('scale', StandardScaler()), 
                                            ('model', SVR())])

Pipeline_Ridge = Pipeline(steps=[('scale', StandardScaler()), 
                                            ('model', Ridge(random_state=0))])







In [None]:
''' Using Grid Params CV, different grid parameters are made to check which are the best parameters 
    which fits our data best. 
    Total 5 Models are used '''

In [21]:
Regularization_Strength = [0.1, 0.3, 0.5, 0.7, 1.0]
max_iter =list(np.linspace(100, 1000, 10).astype(int))


Grid_Params_LinearReg = [{'model__fit_intercept' : [True,False]}]

Grid_Params_RFR = [{'model__n_estimators':[150]}]

Grid_Params_SGDR = [{'model__alpha': [0.0001, 0.0005, 0.001], 'model__max_iter': [1000, 4000, 9000]}]

Grid_Params_SVR = [{'model__kernel': ['linear'], 
                   'model__C' : [0.8, 1.0, 1.2]}]
                   
Grid_Params_Ridge = [{'model__max_iter' : max_iter, 'model__alpha' : Regularization_Strength}]

                   

In [22]:

gs_LineraReg = GridSearchCV(estimator=Pipeline_LinearReg , param_grid=Grid_Params_LinearReg , 
                                scoring='neg_mean_absolute_percentage_error',cv=10)

gs_RFR = GridSearchCV(estimator=Pipeline_RFR, param_grid=Grid_Params_RFR ,
                      scoring='neg_mean_absolute_percentage_error',cv=10)

gs_SGDR = GridSearchCV(estimator=Pipeline_SGDR, param_grid=Grid_Params_SGDR ,
                       scoring='neg_mean_absolute_percentage_error',cv=10)

gs_SVR = GridSearchCV(estimator=Pipeline_SVR, param_grid=Grid_Params_SVR ,
                      scoring='neg_mean_absolute_percentage_error',cv=5)

gs_Ridge = GridSearchCV(estimator=Pipeline_Ridge, param_grid=Grid_Params_Ridge ,
                        scoring='neg_mean_absolute_percentage_error',cv=10)


In [None]:
'''A loop was created which train the models using different models and prints out the best parameters
    and training accuracy. '''

In [23]:
grid_dict = {0: 'Linear Regression', 1: 'Random Forest Regressor', 2: 'Stochastic Gradient Descent Reg', 
             3: 'Support Vector Reg', 4: 'Ridge Reg'}

grids = [gs_LineraReg, gs_RFR, gs_SGDR, gs_SVR, gs_Ridge]


best_acc = 10000000
best_clf = 0
best_gs = ''

for idx, gs in enumerate(grids):
	print('\nEstimator: %s' % grid_dict[idx])	
	gs.fit(X_train, y_train)
	# Best params
	print('Best params: %s' % gs.best_params_)
	# Best training data accuracy
	print('Best training accuracy: %.3f' % ((gs.best_score_)*(-1)))
	# Predict on test data with best params
	pred = gs.predict(X_valid)
	# Test data accuracy of model with best params
	print('Test set MAE score for best params: %.3f ' % mean_absolute_error(pred, y_valid))
	# Track best (highest test accuracy) model
	if mean_absolute_error(pred, y_valid) <= best_acc:
		best_acc = mean_absolute_error(pred, y_valid)
		best_gs = gs
		best_clf = idx
print('\nClassifier with best test set accuracy: %s' % grid_dict[best_clf])


Estimator: Linear Regression
Best params: {'model__fit_intercept': True}
Best training accuracy: 0.020
Test set MAE score for best params: 5.503 

Estimator: Random Forest Regressor
Best params: {'model__n_estimators': 150}
Best training accuracy: 0.018
Test set MAE score for best params: 5.437 

Estimator: Stochastic Gradient Descent Reg
Best params: {'model__alpha': 0.0005, 'model__max_iter': 1000}
Best training accuracy: 0.020
Test set MAE score for best params: 5.488 

Estimator: Support Vector Reg
Best params: {'model__C': 1.2, 'model__kernel': 'linear'}
Best training accuracy: 0.019
Test set MAE score for best params: 5.447 

Estimator: Ridge Reg
Best params: {'model__alpha': 1.0, 'model__max_iter': 100}
Best training accuracy: 0.020
Test set MAE score for best params: 5.502 

Classifier with best test set accuracy: Random Forest Regressor


#### Visualizing Prediction

Unnamed: 0_level_0,SMA_5_Close,SMA_5_Volume,Sentiment,Confidence,per_change_day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2001-01-06,63.770,465065.4,0,0.000000,0.60
2001-01-08,57.392,544196.6,0,0.000000,2.77
2001-01-10,38.070,734434.2,0,0.000000,-3.17
2001-01-11,46.700,729433.0,0,0.000000,-0.46
2001-02-05,57.254,471966.8,0,0.000000,13.07
...,...,...,...,...,...
2020-12-24,1199.970,12058475.6,-1,0.968163,1.22
2020-12-28,1215.340,12189622.0,0,0.000000,-1.11
2020-12-29,1225.440,9911897.0,0,0.000000,0.15
2020-12-30,1240.040,9115847.6,0,0.000000,1.24


In [24]:
z= pd.DataFrame(y)
y

z['pred'] = gs_RFR.predict(X)
mean_absolute_error(z.Close, z.pred)

2.725543419872517

In [25]:
a = z.tail(100)


In [26]:
px.line(a, x= a.index, y=['Close', 'pred'])
