In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load the Data

In [2]:
# Read the DJIA price table and the Reddit news headlines from the Kaggle input folder.
stocks_data = pd.read_csv(filepath_or_buffer='/kaggle/input/stocknews/upload_DJIA_table.csv')
news_headlines = pd.read_csv(filepath_or_buffer='/kaggle/input/stocknews/RedditNews.csv')

In [3]:
# Preview the first five rows of the price table.
stocks_data.head(n=5)

In [4]:
# Preview the first five rows of the headlines table.
news_headlines.head(n=5)

In [5]:
# (rows, columns) of the headlines table as loaded, before duplicates are dropped.
news_headlines.shape

In [6]:
# Remove fully duplicated rows, keeping the first occurrence of each.
news_headlines = news_headlines.drop_duplicates(keep='first')

In [7]:
# Collapse the headlines to one row per date by joining that date's
# headlines into a single comma-separated string.
news_headlines = (
    news_headlines
    .groupby(["Date"])['News']
    .apply(','.join)
    .reset_index()
)

In [8]:
# Merge the price table with the per-date headlines on the shared "Date" column
# (default inner join: only dates present in both tables are kept).
data = stocks_data.merge(news_headlines, on="Date")

In [9]:
# Preview the first five rows of the merged table.
data.head(n=5)

In [10]:
# Count the merged rows whose combined headline text is missing.
data['News'].isna().sum()

In [11]:
# Build a Hugging Face zero-shot classification pipeline for sentiment labelling.
# No model name is passed, so the library's default model for this task is used.
import transformers
from transformers import pipeline
classifier = pipeline(task="zero-shot-classification")

In [32]:
# Try the zero-shot classifier on the combined headlines of the row labelled 0.
sentement_labels = ['positive', 'negative', 'neutral']
sent = classifier(data.loc[0, 'News'], candidate_labels=sentement_labels)

In [18]:
# Show the labels and scores returned by the classifier as a pair.
(sent['labels'], sent['scores'])

In [33]:
# Create the four sentiment columns as empty-string placeholders.
for column in ('positive', 'negative', 'neutral', 'compound'):
    data[column] = ""

In [34]:
# Use NLTK's SentimentIntensityAnalyzer to score the headline text.
# NOTE(review): presumably this needs the VADER lexicon resource to be available
# in the environment — confirm before running on a fresh machine.
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

In [35]:
# Score each row's combined headlines once and write the four polarity values
# as whole columns. The earlier `data[col][i] = value` form is chained indexing:
# it raises SettingWithCopyWarning and, under pandas copy-on-write, does not
# update `data` at all. Whole-column assignment is reliable and gives float columns.
sentiment_scores = pd.DataFrame(
    [sia.polarity_scores(text) for text in data['News']],
    index=data.index,
)
data['positive'] = sentiment_scores['pos']
data['negative'] = sentiment_scores['neg']
data['neutral'] = sentiment_scores['neu']
data['compound'] = sentiment_scores['compound']

In [36]:
# Preview the first five rows, now including the sentiment columns.
data.head(n=5)

In [37]:
# The raw headline text is no longer needed once it has been scored, so drop it.
# This modifies `data` in place; re-running the cell raises KeyError because
# the column is already gone.
data.drop(columns=['News'], inplace=True)

In [38]:
# Use the date as the index and order the rows by it, ascending.
# NOTE(review): "Date" is never parsed to datetime, so this is a string sort —
# confirm the date strings sort chronologically (e.g. ISO yyyy-mm-dd).
data = data.set_index('Date').sort_index(axis=0, ascending=True)

In [39]:
# Target series: the "Close" column.
# NOTE(review): `y` is not used by any later cell shown here; the windowing
# cells take column 0 of the scaled array as the target instead.
y = data.loc[:, 'Close']

# Divide the data into train and test sets

In [40]:
# Chronological split: the first 1500 rows are for training, the rest for testing.
train_set = data.head(1500).to_numpy()
test_set = data.iloc[1500:].to_numpy()

In [42]:
# Scale every column into the [0, 1] range.
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler(feature_range=(0,1))
# Fit the scaler on the training rows only, then reuse those min/max values
# for the test rows. Re-fitting on the test set would scale it differently
# from the data the model is trained on and would leak test-set statistics.
# Test values can therefore fall slightly outside [0, 1].
sc_traindata = sc.fit_transform(train_set)
sc_testdata = sc.transform(test_set)

In [51]:
# Inspect the second scaled training row (all columns).
sc_traindata[1, :]

In [78]:
# Build the training samples: each input is the previous 60 scaled values of
# column 0 and the target is the value that follows that window.
# The upper bound comes from the training array's length rather than a
# hard-coded 1500, so this cell stays correct if the split size changes
# (and matches how the test windows are built).
# NOTE(review): column 0 is the first column of `data` after "Date" became the
# index; `y = data['Close']` is not used here — confirm column 0 is the
# intended target.
X_train = []
y_train = []
for i in range(60, len(sc_traindata)):
    X_train.append(sc_traindata[i-60:i, 0])
    y_train.append(sc_traindata[i, 0])
X_train, y_train = np.array(X_train), np.array(y_train)
# Add a trailing feature axis: (samples, 60) -> (samples, 60, 1).
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))

In [79]:
# Shape after the reshape above: (number of windows, 60, 1).
X_train.shape

In [57]:
#import important libraries
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Dense

# Model building and training

In [58]:
# Four stacked 50-unit LSTM layers, each followed by 20% dropout, feeding a
# single-unit dense output. Every LSTM except the last returns the full
# sequence so that the next LSTM receives one input per time step.
model = Sequential([
    LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], 1)),
    Dropout(0.2),
    LSTM(units=50, return_sequences=True),
    Dropout(0.2),
    LSTM(units=50, return_sequences=True),
    Dropout(0.2),
    LSTM(units=50),
    Dropout(0.2),
    Dense(units=1),
])
# Train with mean squared error loss and the Adam optimizer.
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X_train, y_train, batch_size=32, epochs=100)

# summary of the model

In [59]:
# Print the model summary.
model.summary()

In [60]:
# Build the test samples the same way as the training ones: 60 scaled values
# of column 0 as input, the value that follows as the target. The first 60 test
# rows serve only as context, so there are len(sc_testdata) - 60 samples.
X_test = np.array([sc_testdata[end - 60:end, 0] for end in range(60, len(sc_testdata))])
y_test = np.array([sc_testdata[end, 0] for end in range(60, len(sc_testdata))])
# Add a trailing feature axis: (samples, 60) -> (samples, 60, 1).
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

In [61]:
# Inspect the first scaled test row (all columns).
sc_testdata[0, :]

In [62]:
# First test target, i.e. sc_testdata[60, 0] (the value that follows the first window).
y_test[0]

In [65]:
# Run the trained model on the test windows. The outputs stay in the scaled
# space because no inverse transform is applied.
predicted_stock_price = model.predict(x=X_test)


In [66]:
# Print up to the first 40 (prediction, actual) pairs side by side.
# Slicing bounds the loop by the data available, so a test set with fewer
# than 40 samples no longer raises IndexError.
for predicted, actual in zip(predicted_stock_price[:40], y_test[:40]):
    print(predicted, actual)

In [67]:
# Plot predicted against actual values over the test windows.
# Fixes: the two labels were swapped, they named "TATA" although the data is
# the DJIA table, and no legend was drawn, so the labels never appeared.
import matplotlib.pyplot as plt
plt.plot(predicted_stock_price, color = 'black', label = 'Predicted DJIA (scaled)')
plt.plot(y_test, color = 'green', label = 'Actual DJIA (scaled)')
plt.title('DJIA: predicted vs. actual on the test set')
plt.xlabel('Test sample')
plt.ylabel('Min-max scaled value')
plt.legend()
plt.show()