# Eya Loukil

<br>

<table><tr>
<td> <img src="stock_market.png" alt="Drawing" style="width: 250px;"/> </td>
<td> <img src="tsf.png" alt="Drawing" style="width: 250px;"/> </td>
</tr></table>


## TSF Task 7 : Stock Market Prediction using Numerical and Textual Analysis
### (Level - Advanced)


● Objective: Create a hybrid model for stock price/performance prediction using numerical analysis of historical stock prices and sentiment analysis of news headlines <br>
● Stock to analyze and predict - S&P BSE SENSEX <br>
● Download the historical stock prices from finance.yahoo.com <br>
● Download the textual (news) data from https://bit.ly/36fFPI6 <br>
● Using Python for separate analysis and then combine the findings to create a hybrid model <br>


In [None]:
import warnings
warnings.filterwarnings('ignore')

import math
import pandas as pd
import numpy as np

!pip3 install yfinance
import yfinance as yf

import matplotlib.pyplot as plt
import seaborn as sns

from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Dense, Activation

import nltk
nltk.download()
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

from sklearn import preprocessing, metrics
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Download the historic S&P BSE SENSEX quotes (Yahoo ticker ^BSESN) for the
# date window given below, then preview the first rows.

sensex_window = {'start': '2001-01-02', 'end': '2021-01-01'}
df_price = yf.download('^BSESN', **sensex_window)
df_price.head()

In [None]:
# Frequency table (NaN included) for every raw price/volume column.
# One loop replaces six copy-pasted cells that differed only in the column name.
for column in ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']:
    print(df_price[column].value_counts(dropna=False))
    print()

In [None]:
# Show the raw column labels, then move the index into a regular column and
# switch every label to its lower-case alias.
print(df_price.columns)

column_aliases = {
    'Date': 'date',
    'Open': 'open',
    'High': 'high',
    'Low': 'low',
    'Close': 'close',
    'Adj Close': 'adjclose',
    'Volume': 'volume',
}
df_price = df_price.reset_index().rename(columns=column_aliases)
df_price.head()

In [None]:
# Load the news-headline dataset and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to the notebook or a configurable data directory.

news_csv_path = "C:/Stock-Market-Prediction-using-Numerical-and-Textual-Analysis-main/Stock-Market-Prediction-using-Numerical-and-Textual-Analysis-main/stock_data.csv"
df_news = pd.read_csv(news_csv_path)
df_news.head()

In [None]:
# Row counts of the price table and the news table
df_price.shape[0], df_news.shape[0]

In [None]:
# Number of missing values per column in each table.
# (.count() on the boolean mask only returns the row count; .sum() adds up the True flags.)
df_price.isnull().sum(), df_news.isnull().sum()

#### Analysis on SENSEX data

In [None]:
# Clean the SENSEX table: drop duplicate rows, normalise each timestamp to
# midnight, keep only the columns used later, and index the rows by date.

price_columns = ['date', 'close', 'open', 'high', 'low', 'volume']
df_price = (
    df_price
    .drop_duplicates()
    .assign(date=lambda frame: pd.to_datetime(frame['date'].dt.normalize()))
    .filter(price_columns)
    .set_index('date')
)
df_price

#### Analysis on Headlines data

In [None]:
# Clean the headline table: drop duplicate rows, parse publish_date from the
# YYYYMMDD format, keep the date and headline columns, and join all headlines
# published on the same day into one comma-separated string indexed by date.
news_dedup = df_news.drop_duplicates()
news_dedup['publish_date'] = pd.to_datetime(news_dedup['publish_date'], format='%Y%m%d').dt.normalize()

headlines_per_day = (
    news_dedup
    .filter(['publish_date', 'headline_text'])
    .groupby(['publish_date'])['headline_text']
    .apply(','.join)
)
df_news = headlines_per_day.reset_index().set_index('publish_date')
df_news

In [None]:
# Align prices and headlines side by side on their date indexes and keep only
# the rows in which no column is missing.

df_stock = pd.concat([df_price, df_news], axis=1).dropna(axis=0)
df_stock

In [None]:
#Using nltk - vader library to analyse the sentiments based on the headline data

from nltk.sentiment.vader import SentimentIntensityAnalyzer
import unicodedata

sia = SentimentIntensityAnalyzer()

# Score each row's headline text exactly once and reuse the resulting dict for
# all four columns; previously polarity_scores() was evaluated four times per row.
daily_scores = [sia.polarity_scores(text) for text in df_stock['headline_text']]

df_stock['Compound'] = [score['compound'] for score in daily_scores]
df_stock['Negative'] = [score['neg'] for score in daily_scores]
df_stock['Neutral'] = [score['neu'] for score in daily_scores]
df_stock['Positive'] = [score['pos'] for score in daily_scores]
df_stock.head()

In [None]:
# Discard the raw headline text and arrange the remaining columns so that the
# close price comes first, followed by the sentiment scores and the other prices.

ordered_columns = ['close', 'Compound', 'Negative', 'Neutral', 'Positive', 'open', 'high', 'low', 'volume']
df_stock = df_stock.drop(columns=['headline_text'])[ordered_columns]
df_stock.head()

In [None]:
# Persist the compiled dataframe as a CSV file so it can be reloaded later.
# NOTE(review): the raw news file read earlier is also called stock_data.csv;
# confirm the working directory differs from that folder so it is not overwritten.

df_stock.to_csv(path_or_buf='stock_data.csv')

In [None]:
#Reading the stock_data

# Use the first CSV column as the index directly instead of relying on the
# auto-generated 'Unnamed: 0' header, and parse it as dates so that plots and
# any date-based operations get a real DatetimeIndex rather than strings.
stock_data = pd.read_csv('stock_data.csv', index_col=0, parse_dates=True)
stock_data.index.name = 'Date'
stock_data.head()

In [None]:
# Summary statistics for every column of the compiled dataset
stock_data.describe(include='all')

In [None]:
# Per column: True if it contains at least one missing value
stock_data.isna().any()

In [None]:
# Column dtypes, non-null counts and memory usage
stock_data.info()

In [None]:
# Visualising the close price over the period of analysis

fig, ax = plt.subplots(figsize=(15, 7))
stock_data['close'].plot(ax=ax, label='SENSEX')
ax.set_title("SENSEX Close Price")
ax.set_xlabel('Date')
ax.set_ylabel('Close Price')
ax.legend()

In [None]:
# Visualising the close price together with its 50-record moving average
# (window=50 counts rows of stock_data, i.e. days present in the dataset).

plt.figure(figsize=(15,7))
stock_data['close'].plot(label='SENSEX')
# Roll over the close column only; averaging every column and then selecting
# 'close' does the same work many times over.
stock_data['close'].rolling(window=50).mean().plot(label='50-DMA', color='r')
plt.title("SENSEX Close Price and 50-Day Moving Average")
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.legend()

 ### Data Preparation For Modelling


In [None]:
# Total number of rows available for modelling
len(stock_data)

In [None]:
# Work out how many rows feed the model and where the train/test boundary falls.

percentage_of_data = 1.0  # fraction of the available rows to use
# One row is held back here (len - 1).
# NOTE(review): presumably this allows for the one-step target shift applied later - confirm.
data_to_use = int(percentage_of_data*(len(stock_data)-1))

train_end = int(data_to_use*0.8)  # boundary index: 80% of the used rows
total_data = len(stock_data)
# Position of the first row taken by the iloc slices that use `start`; it evaluates
# to 1 when percentage_of_data is 1.0, so the very first row of stock_data is left out.
start = total_data - data_to_use

print("Number of records in Training Data:", train_end)
# NOTE(review): this prints total_data - train_end, but the test arrays are cut later
# from the shifted, NaN-dropped frame, so the real test set is smaller - confirm.
print("Number of records in Test Data:", total_data - train_end)

In [None]:
#Allocating the datapoints for each column

steps_to_predict = 1

# Rows start..total_data-1 are the ones used for modelling.
model_rows = stock_data.iloc[start:total_data]

# Select by column name rather than by position so that a change in the column
# order of stock_data cannot silently swap the series.
close_price = model_rows['close']
compound = model_rows['Compound']
negative = model_rows['Negative']
neutral = model_rows['Neutral']
positive = model_rows['Positive']
open_price = model_rows['open']
high = model_rows['high']
low = model_rows['low']
volume = model_rows['volume']

print("Close Price:")
close_price

In [None]:
# Next-day values: shift(-1) moves each series up by one row, so every row
# carries the following row's close price and compound score.
close_price_shifted = close_price.shift(-1)
compound_shifted = compound.shift(-1)

# Assemble the modelling frame; `keys` supplies the column labels in order.
model_series = [close_price, close_price_shifted, compound, compound_shifted, volume, open_price, high, low]
model_labels = ['close_price', 'close_price_shifted', 'compound', 'compound_shifted','volume', 'open_price', 'high', 'low']
data = pd.concat(model_series, axis=1, keys=model_labels)

# The final row has no "next day", so it (and any other incomplete row) is dropped.
data = data.dropna()
data.head(10)

In [None]:
# Target variable: the close price shifted up by one row (next record's close)

y = data.loc[:, 'close_price_shifted']
y

In [None]:
# Feature columns used as model inputs

cols = ['close_price', 'compound', 'compound_shifted', 'volume', 'open_price', 'high', 'low']
x = data.loc[:, cols]
x

## Scaling

In [None]:
# scaling the feature dataset
scaler_x = preprocessing.MinMaxScaler(feature_range=(-1, 1))
x = np.array(x).reshape((len(x), len(cols)))
# Learn the min/max from the training rows only (the first train_end rows) so
# that no information from the test period leaks into the scaling; the same
# fitted transform is then applied to every row.
scaler_x.fit(x[:train_end])
x = scaler_x.transform(x)

# scaling the target variable (again fitted on the training rows only)
scaler_y = preprocessing.MinMaxScaler(feature_range=(-1, 1))
y = np.array(y).reshape((len(y), 1))
scaler_y.fit(y[:train_end])
y = scaler_y.transform(y)

x, y

## Train and Test data split

In time-series data such as stock prices, each observation depends on the records of the preceding dates, so the rows must be kept in chronological order rather than shuffled. The dataset is therefore divided into training and test sets by position, as below:

In [None]:
# preparing training and test dataset

# Chronological split at train_end. Slices are end-exclusive, so the training
# set covers rows 0..train_end-1 and the test set must start at train_end;
# starting at train_end+1 left row train_end out of both sets.
X_train = x[:train_end]
X_test = x[train_end:]
y_train = y[:train_end]
y_test = y[train_end:]

# printing the shape of the training and the test datasets

print('Number of rows and columns in the Training set X:', X_train.shape, 'and y:', y_train.shape)
print('Number of rows and columns in the Test set X:', X_test.shape, 'and y:', y_test.shape)

In [None]:
# Append a trailing axis of length 1 so the 2-D feature arrays become the
# 3-D input that the LSTM layers are given.

X_train = X_train[..., np.newaxis]
X_test = X_test[..., np.newaxis]

# report the new shapes
for caption, features in (('Shape of Training set X:', X_train), ('Shape of Test set X:', X_test)):
    print(caption, features.shape)

In [None]:
# setting the seed to achieve consistent and less random predictions at each execution.
# np.random.seed only seeds NumPy; the Keras weight initialisers and dropout masks
# draw from the backend's generator, so seed Python, NumPy and the backend together.
from keras.utils import set_random_seed

np.random.seed(2021)
set_random_seed(2021)

# setting the model architecture: three stacked LSTM layers of 100 units, each
# followed by 10% dropout, and a single-unit dense output layer
model=Sequential()
model.add(LSTM(100,return_sequences=True,activation='tanh',input_shape=(len(cols),1)))
model.add(Dropout(0.1))
model.add(LSTM(100,return_sequences=True,activation='tanh'))
model.add(Dropout(0.1))
model.add(LSTM(100,activation='tanh'))
model.add(Dropout(0.1))
model.add(Dense(1))

# printing the model summary
model.summary()

In [None]:
# configure training: mean-squared-error loss with the Adam optimiser
model.compile(optimizer='adam', loss='mse')

# train on the training set, with 20% of it set aside for validation
fit_options = dict(validation_split=0.2, epochs=10, batch_size=8, verbose=1)
model.fit(X_train, y_train, **fit_options)

## Prediction

In [None]:
# Predict on the test features; the model output is still in the scaled range.

scaled_predictions = model.predict(X_test)

# map the predictions back to the original target units as a single column
predictions = scaler_y.inverse_transform(np.asarray(scaled_predictions).reshape(-1, 1))

# show the first five predictions
print('Predictions:')
predictions[:5]

## Model Evaluation

In [None]:
# mean-squared-error on the training data
train_loss = model.evaluate(X_train, y_train, batch_size=1)

# mean-squared-error on the test data
test_loss = model.evaluate(X_test, y_test, batch_size=1)

# report both losses rounded to four decimals
for caption, loss_value in (('Train Loss =', train_loss), ('Test Loss =', test_loss)):
    print(caption, round(loss_value, 4))

In [None]:
# calculating root mean squared error using sklearn.metrics package

# `predictions` has already been inverse-transformed to price units while y_test
# is still scaled at this point, so bring y_test to the same units before
# comparing; otherwise the error mixes two different scales.
y_test_prices = scaler_y.inverse_transform(np.array(y_test).reshape((len(y_test), 1)))

mse = metrics.mean_squared_error(y_test_prices, predictions)
rmse = np.sqrt(mse)  # rmse now holds the root of the MSE, as its name says
print('Root Mean Square Error (sklearn.metrics) =', round(rmse, 4))

In [None]:
# Flatten the 3-D test features back to (rows, features) and undo the feature scaling
feature_count = len(cols)
X_test = scaler_x.inverse_transform(np.asarray(X_test).reshape(len(X_test), feature_count))

# Undo the target scaling for both the training and the test targets
y_train = scaler_y.inverse_transform(np.asarray(y_train).reshape(-1, 1))
y_test = scaler_y.inverse_transform(np.asarray(y_test).reshape(-1, 1))

In [None]:
# Overlay the predicted close prices and the actual test close prices

fig, ax = plt.subplots(figsize=(15, 7))
ax.plot(predictions, label="Predicted Close Price", color='r')
ax.plot(y_test[:, 0], label="Testing Close Price")
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=2)
ax.set_title('SENSEX and Prediction')
plt.show()