# Sentiment Analysis
Sentiment analysis of stock news headlines using the Eikon API, compared against short-term share price movements.

In [1]:
import secrets # contains my API keys

import datetime
import warnings

import eikon as ek
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup
from textblob import TextBlob # sentiment engine
from datetime import time

# Silence every Python warning for the rest of the session. Note that this
# also hides any pandas warnings raised by the cells below.
warnings.filterwarnings("ignore")
# Pass the app key stored in the local `secrets` module to the eikon package.
ek.set_app_key(secrets.APP_KEY)

## Time Series Data
Daily time series data for Microsoft (`MSFT.O`) over a 10-day period (2016-01-01 to 2016-01-10).

In [2]:
# Request daily bars for MSFT.O between 2016-01-01 and 2016-01-10 and display them.
df=ek.get_timeseries(
    ["MSFT.O"],
    start_date="2016-01-01",
    end_date="2016-01-10",
    interval="daily",
)
df

MSFT.O,HIGH,CLOSE,LOW,OPEN,COUNT,VOLUME
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-01-04,54.8,54.8,53.39,54.32,272781,53777963
2016-01-05,55.39,55.05,54.54,54.93,180637,34079674
2016-01-06,54.4001,54.05,53.64,54.32,225856,39518863
2016-01-07,53.485,52.17,52.07,52.7,303271,56564852
2016-01-08,53.28,52.33,52.15,52.37,261949,48753969


## Sentiment Analysis
Sentiment analysis of news articles about a stock.

In [3]:
# fetch the n most recent headlines about a stock
def get_headlines(stock_code="LLOY.L",n_articles=100):
    """Return the headlines that ``ek.get_news_headlines`` finds for a stock.

    Parameters
    ----------
    stock_code : str
        Instrument code inserted into the query ``R:<stock_code> AND Language:LEN``.
    n_articles : int
        Passed to the API as ``count``.

    Returns
    -------
    Whatever ``ek.get_news_headlines`` returns, unchanged.
    """
    return ek.get_news_headlines(
        "R:{} AND Language:LEN".format(stock_code),
        count=n_articles,
    )

In [4]:
# fetch article contents (returns html)
def get_article_content(story_id):
    """Return the story body that ``ek.get_news_story`` yields for ``story_id``."""
    return ek.get_news_story(story_id)

In [5]:
# Request parameters; later cells reuse both names.
stock_code="LLOY.L"
n_articles=100

# Pull the headlines and preview the first rows.
df=get_headlines(stock_code,n_articles)
df.head()

Unnamed: 0,versionCreated,text,storyId,sourceCode
2020-11-11 13:48:14.678,2020-11-11 13:48:53.675000+00:00,(EN) LLOYDS BANKING GROUP PLC Monthly Presenta...,urn:newsml:reuters.com:20201111:nGLF8342Vx:3,NS:GLFILE
2020-11-11 12:23:19.000,2020-11-11 12:23:19+00:00,COVID-19 'war games': the computer program tha...,urn:newsml:reuters.com:20201111:nL8N2GC54H:4,NS:RTRS
2020-11-10 22:47:12.000,2020-11-10 22:47:12+00:00,UPDATE 1-Short sellers lost billions as travel...,urn:newsml:reuters.com:20201110:nL1N2HW2OD:4,NS:RTRS
2020-11-10 16:55:11.000,2020-11-10 16:55:11+00:00,Short sellers lose $500 mln as European travel...,urn:newsml:reuters.com:20201110:nL8N2HW5YM:2,NS:RTRS
2020-11-10 15:49:52.096,2020-11-10 15:49:53.474000+00:00,REG - Time Out Group plc - Directorate Change,urn:newsml:newsroom:20201110:nRSJ8859Ea:0,NS:LSE


In [6]:
# Preview the raw HTML of the first story. The headline frame is indexed by
# timestamp, so pick the first row by position (.iloc) rather than with the
# integer key 0, which is not a label in that index.
get_article_content(df["storyId"].iloc[0])

'<div class="storyContent" lang="en"><p><a href="reuters://screen/verb=Open/url=cpurl%3A%2F%2Fviews.cp.%2Ffilings%2Ffilings.viewer%2FDownload.aspx%3FDocumentId%3D55893421%26ContentFormat%3Dpdf%26ApplicationId%3DEikonNewsAlertMonitoringView" data-type="cpurl" data-cpurl="cpurl://views.cp./filings/filings.viewer/Download.aspx?DocumentId=55893421&ContentFormat=pdf&ApplicationId=EikonNewsAlertMonitoringView" translate="no">http://filings.ica.int.thomsonreuters.com/filings.viewer/Download.aspx...</a></p></div>'

In [7]:
# add NaN placeholder columns for the sentiment variables
for sentiment_col in ("Polarity","Subjectivity","Score"):
    df[sentiment_col]=np.nan
df.head()

Unnamed: 0,versionCreated,text,storyId,sourceCode,Polarity,Subjectivity,Score
2020-11-11 13:48:14.678,2020-11-11 13:48:53.675000+00:00,(EN) LLOYDS BANKING GROUP PLC Monthly Presenta...,urn:newsml:reuters.com:20201111:nGLF8342Vx:3,NS:GLFILE,,,
2020-11-11 12:23:19.000,2020-11-11 12:23:19+00:00,COVID-19 'war games': the computer program tha...,urn:newsml:reuters.com:20201111:nL8N2GC54H:4,NS:RTRS,,,
2020-11-10 22:47:12.000,2020-11-10 22:47:12+00:00,UPDATE 1-Short sellers lost billions as travel...,urn:newsml:reuters.com:20201110:nL1N2HW2OD:4,NS:RTRS,,,
2020-11-10 16:55:11.000,2020-11-10 16:55:11+00:00,Short sellers lose $500 mln as European travel...,urn:newsml:reuters.com:20201110:nL8N2HW5YM:2,NS:RTRS,,,
2020-11-10 15:49:52.096,2020-11-10 15:49:53.474000+00:00,REG - Time Out Group plc - Directorate Change,urn:newsml:newsroom:20201110:nRSJ8859Ea:0,NS:LSE,,,


In [8]:
# fetches an article, performs sentiment analysis and returns the sentiment variables
def sentiment_analysis(story_id,threshold=.05):
    """Score the sentiment of one news story.

    Parameters
    ----------
    story_id : str
        Identifier handed to ``get_article_content``.
    threshold : float, optional
        Polarity cut-off for the label: ``polarity >= threshold`` is
        "positive", ``polarity <= -threshold`` is "negative", anything in
        between is "neutral". Defaults to 0.05.

    Returns
    -------
    tuple or None
        ``(polarity, subjectivity, score)`` where polarity is in [-1,1],
        subjectivity is in [0,1] and score is the label described above.
        ``None`` when the story has no content.
    """
    content_html=get_article_content(story_id)
    if not content_html:
        return None

    # strip the markup so only the article text is scored
    soup=BeautifulSoup(content_html,"lxml")
    sentiment=TextBlob(soup.get_text()).sentiment

    polarity=sentiment.polarity
    subjectivity=sentiment.subjectivity
    return (polarity,subjectivity,_score_label(polarity,threshold))


def _score_label(polarity,threshold=.05):
    """Map a polarity value to "positive", "negative" or "neutral"."""
    if polarity>=threshold:
        return "positive"
    if polarity<=-threshold:
        return "negative"
    return "neutral"

In [9]:
# Score every story, then write each result column back in one assignment.
# Writing through df["col"].iloc[idx] is chained indexing: pandas does not
# guarantee that it updates df, so the values are collected in lists first.
n_stories=df.shape[0]
polarities,subjectivities,scores=[],[],[]
for done,story_id in enumerate(df["storyId"].values,start=1):
    sentiment_vars=sentiment_analysis(story_id)
    # stories without content keep NaN in all three columns
    polarity,subjectivity,score=sentiment_vars or (np.nan,np.nan,np.nan)
    polarities.append(polarity)
    subjectivities.append(subjectivity)
    scores.append(score)
    print("{}/{}".format(done,n_stories),end="\r")

df["Polarity"]=polarities
df["Subjectivity"]=subjectivities
df["Score"]=scores # string labels, so this column ends up with object dtype

100/100

In [10]:
# Show the first rows again, now with the sentiment columns filled in.
df.head()

Unnamed: 0,versionCreated,text,storyId,sourceCode,Polarity,Subjectivity,Score
2020-11-11 13:48:14.678,2020-11-11 13:48:53.675000+00:00,(EN) LLOYDS BANKING GROUP PLC Monthly Presenta...,urn:newsml:reuters.com:20201111:nGLF8342Vx:3,NS:GLFILE,0.0,0.0,neutral
2020-11-11 12:23:19.000,2020-11-11 12:23:19+00:00,COVID-19 'war games': the computer program tha...,urn:newsml:reuters.com:20201111:nL8N2GC54H:4,NS:RTRS,0.099009,0.423298,positive
2020-11-10 22:47:12.000,2020-11-10 22:47:12+00:00,UPDATE 1-Short sellers lost billions as travel...,urn:newsml:reuters.com:20201110:nL1N2HW2OD:4,NS:RTRS,0.055217,0.386753,positive
2020-11-10 16:55:11.000,2020-11-10 16:55:11+00:00,Short sellers lose $500 mln as European travel...,urn:newsml:reuters.com:20201110:nL8N2HW5YM:2,NS:RTRS,0.107278,0.413399,positive
2020-11-10 15:49:52.096,2020-11-10 15:49:53.474000+00:00,REG - Time Out Group plc - Directorate Change,urn:newsml:newsroom:20201110:nRSJ8859Ea:0,NS:LSE,0.17203,0.29655,positive


## Compare Sentiment Analysis to Share Price

In [11]:
# range of story dates as YYYY/MM/DD strings
# (the format keeps only the date part, so the time of day needs no reset)
created=df['versionCreated']
start=created.min().strftime('%Y/%m/%d')
end=created.max().strftime('%Y/%m/%d') # computed but not passed to the request below

# minute-wise stock price from the first story date onwards
Minute=ek.get_timeseries(
    [stock_code],
    start_date=start,
    interval="minute",
)
Minute.tail()

LLOY.L,HIGH,LOW,OPEN,CLOSE,COUNT,VOLUME
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-11-11 16:33:00,32.595,31.905,31.905,32.595,26,842694381
2020-11-11 16:34:00,33.0,32.595,32.595,32.995,81,3634799338
2020-11-11 16:35:00,33.405,32.995,32.995,33.375,215,12394966496
2020-11-11 16:36:00,33.405,33.37,33.375,33.38,42,2696794958
2020-11-11 16:37:00,33.38,33.38,33.38,33.38,4,368829


In [12]:
# consider stock price at different intervals after the news:
# one NaN placeholder column per interval
for horizon_col in ("two_min","five_min","ten_min","thirty_min"):
    df[horizon_col]=np.nan

In [13]:
# fill in stock price values: percentage move from the bar in which the news
# broke to the bar 2, 5, 10 and 30 minutes later
horizons=[("two_min",2),("five_min",5),("ten_min",10),("thirty_min",30)]

# Select prices by column name. The original positions 2 and 3 are OPEN and
# CLOSE in the minute frame, but the column order returned by get_timeseries
# is not the same in every response, so positions are fragile.
open_px=Minute["OPEN"]
close_px=Minute["CLOSE"]

# Story timestamps carry a UTC offset while the minute index shows none; a
# tz-aware key is not found in a tz-naive index, so drop the offset first.
# NOTE(review): assumes the minute bars are stamped in UTC - confirm.
index_is_naive=getattr(Minute.index,"tz",None) is None

n_rows=df.shape[0]
for idx,published in enumerate(df["versionCreated"]):
    start_time=published.replace(second=0,microsecond=0)
    if index_is_naive and start_time.tzinfo is not None:
        start_time=start_time.tz_convert(None)
    try:
        t0=open_px.loc[start_time] # stock value when news broke
        # stock price at each given interval
        for field,mins in horizons:
            later=close_px.loc[start_time+datetime.timedelta(minutes=mins)]
            # write by position on df itself; df[field][idx]=... is chained
            # indexing and is not guaranteed to modify df
            df.iloc[idx,df.columns.get_loc(field)]=(later/t0-1)*100
    except KeyError:
        # no minute bar at that time (news broke outside trading hours, or the
        # later bar is missing): leave the remaining intervals as NaN
        pass
    print("{}/{}".format(idx+1,n_rows),end="\r")
df.head()

1/1002/1003/1004/1005/1006/1007/1008/1009/10010/10011/10012/10013/10014/10015/10016/10017/10018/10019/10020/10021/10022/10023/10024/10025/10026/10027/10028/10029/10030/10031/10032/10033/10034/10035/10036/10037/10038/10039/10040/10041/10042/10043/10044/10045/10046/10047/10048/10049/10050/10051/10052/10053/10054/10055/10056/10057/10058/10059/10060/10061/10062/10063/10064/10065/10066/10067/10068/10069/10070/10071/10072/10073/10074/10075/10076/10077/10078/10079/10080/10081/10082/10083/10084/10085/10086/10087/10088/10089/10090/10091/10092/10093/10094/10095/10096/10097/10098/10099/100100/100

Unnamed: 0,versionCreated,text,storyId,sourceCode,Polarity,Subjectivity,Score,two_min,five_min,ten_min,thirty_min
2020-11-11 13:48:14.678,2020-11-11 13:48:53.675000+00:00,(EN) LLOYDS BANKING GROUP PLC Monthly Presenta...,urn:newsml:reuters.com:20201111:nGLF8342Vx:3,NS:GLFILE,0.0,0.0,neutral,0.149701,0.10479,0.02994,-0.329341
2020-11-11 12:23:19.000,2020-11-11 12:23:19+00:00,COVID-19 'war games': the computer program tha...,urn:newsml:reuters.com:20201111:nL8N2GC54H:4,NS:RTRS,0.099009,0.423298,positive,0.0,0.054463,0.242057,0.499244
2020-11-10 22:47:12.000,2020-11-10 22:47:12+00:00,UPDATE 1-Short sellers lost billions as travel...,urn:newsml:reuters.com:20201110:nL1N2HW2OD:4,NS:RTRS,0.055217,0.386753,positive,,,,
2020-11-10 16:55:11.000,2020-11-10 16:55:11+00:00,Short sellers lose $500 mln as European travel...,urn:newsml:reuters.com:20201110:nL8N2HW5YM:2,NS:RTRS,0.107278,0.413399,positive,,,,
2020-11-10 15:49:52.096,2020-11-10 15:49:53.474000+00:00,REG - Time Out Group plc - Directorate Change,urn:newsml:newsroom:20201110:nRSJ8859Ea:0,NS:LSE,0.17203,0.29655,positive,-0.305839,-0.461879,-0.04057,-0.274631


## Results

In [14]:
# Average the numeric columns per sentiment label. numeric_only=True keeps the
# text columns (headline, story id, source) out of the mean, which cannot be
# averaged.
grouped=df.groupby("Score").mean(numeric_only=True)
# Report the number of rows actually in df; the API is not guaranteed to
# return exactly the n_articles that were requested.
print("Mean percentage price movement after news article published for stock {} (Total {} articles analysed).".format(stock_code,df.shape[0]))
grouped

Mean percentage price movement after news article published for stock LLOY.L (Total 100 articles analysed).


Unnamed: 0_level_0,Polarity,Subjectivity,two_min,five_min,ten_min,thirty_min
Score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
negative,-0.067475,0.399961,0.140994,0.052873,-0.652097,
neutral,0.008956,0.229273,0.058859,0.049314,0.01941,0.090059
positive,0.117605,0.440865,0.192462,-0.134817,0.120995,0.381661
