# <h1 style='font-size:2.2rem;color:orange;'>Stock Markets Sensitivity to Social vs. News Media Sentiment</h1>

## 1. Install packages

In [None]:
#pip install yfinance
#pip install snscrape
#pip install emoji --upgrade

## 2. Import Libraries 

In [None]:
import pandas as pd
import numpy as np
import bs4 as bs
from selenium import webdriver
import requests
import time
import concurrent.futures
import re
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import yfinance as yf
import os
import json
import sys
import string
import itertools
from segtok.segmenter import split_single

from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.stem import WordNetLemmatizer
from collections import Counter
import fastparquet

from flair.models import TextClassifier
from flair.data import Sentence
from segtok.segmenter import split_single
from afinn import Afinn
from sklearn.preprocessing import MinMaxScaler


import statsmodels.api as sm # statistical models including regression
import scipy as sp # scientific calculation toolkit
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from numpy import mean
from numpy import std
from numpy.random import randn
from numpy.random import seed
from matplotlib import pyplot

## 3. Download datasets | Stock Data | News Data | Twitter Data

### 3.1 Scrape Stock Data

In [None]:
# Universe of tickers whose price history is downloaded
tickers = ['AAPL', 'MSFT', 'TSLA', 'AMZN', 'ATVI', 'NVDA', 'FB', 'UBER', 'V', 'MA', 'AVGO', 'CSCO', 'ADBE', 'CRM', 'AMD', 'INTC', 'NFLX']

# Two passes over the same Sep 2021 - Feb 2022 window: first the default
# (daily) bars written to 'Price_<ticker>.csv', then the 60-minute bars
# written to 'Hourly_Price_<ticker>.csv'.
for file_prefix, download_options in (('Price_', {}), ('Hourly_Price_', {'interval': "60m"})):
    for ticker in tickers:
        data = yf.download(ticker, '2021-09-01', '2022-03-01', **download_options)
        # Tag every row with its ticker so the files can be combined later
        data['Ticker'] = ticker
        # Output to csv
        data.to_csv(file_prefix + str(ticker) + '.csv')

### 3.2 Scrape News

In [None]:
import threading

# linkScraper runs on many worker threads at once. The five result lists are
# only meaningful row-by-row, so each complete row is appended under this lock
# to keep index i of every list pointing at the same article.
_NEWS_ROW_LOCK = threading.Lock()


# Set up function to scrape news content from one news link
def linkScraper(newsLinks):
    """Fetch one article and append its fields to the module-level result lists.

    Args:
        newsLinks: URL of a single article page (despite the plural name).

    Side effects:
        Appends exactly one value to each of the globals ``url``, ``title``,
        ``date``, ``author`` and ``content``. A field that cannot be extracted
        is recorded as ``np.nan``. If the page cannot be downloaded or parsed,
        the link is printed and all five fields are ``np.nan``.
    """
    # Try to get the HTML elements using bs4
    try:
        # The timeout keeps one stalled connection from blocking a worker forever
        linkResp = requests.get(newsLinks, timeout=30)
        linkSoup = bs.BeautifulSoup(linkResp.text, 'lxml')
    except Exception:
        print(newsLinks)
        linkSoup = None

    def _field(extract):
        """Run one extractor on the parsed page, mapping any failure to NaN."""
        if linkSoup is None:
            return np.nan
        try:
            return extract(linkSoup)
        except Exception:
            # A missing element surfaces as AttributeError/IndexError;
            # scraping stays best-effort
            return np.nan

    dateSpan = {"class": "date-line__date__23Ge-"}

    # Get url
    rowUrl = _field(lambda soup: soup.find('meta', {"property": "og:url"}).get('content'))
    # Get title
    rowTitle = _field(lambda soup: soup.find('h1', {"class": "text__text__1FZLe text__dark-grey__3Ml43 text__medium__1kbOh text__heading_2__1K_hh heading__base__2T28j heading__heading_2__3Fcw5"}).text)
    # Get date (the first two date-line spans joined by a space)
    rowDate = _field(lambda soup: soup.find('span', dateSpan).text + ' ' + soup.findAll('span', dateSpan)[1].text)
    # Get author
    rowAuthor = _field(lambda soup: soup.find('a', {"class": "author-name__author__1gx5k"}).text)
    # Get content
    rowContent = _field(lambda soup: soup.find('div', {"class": "article-body__content__3VtU3 paywall-article"}).get_text(separator=' '))

    # Publish the whole row atomically so concurrent workers cannot interleave
    with _NEWS_ROW_LOCK:
        url.append(rowUrl)
        title.append(rowTitle)
        date.append(rowDate)
        author.append(rowAuthor)
        content.append(rowContent)
    

# Set up function to scrape all news links
def newScraper():
    """Scrape Reuters search results for every target company and build ``news``.

    For each company name the Reuters site search is paged through with
    Selenium, the article links are collected and de-duplicated, and every
    link is handed to ``linkScraper`` on a thread pool.

    Side effects:
        Rebinds the globals ``stock``, ``url``, ``title``, ``date``,
        ``author``, ``content`` (result lists) and ``news`` (the final
        DataFrame), prints progress counters, and drives a Chrome browser
        that is closed again before returning.

    Returns:
        pandas.DataFrame with columns Stock, Date, Title, Author, Content, Url.
    """
    # Initialize targeted stocks symbols
    symbols = ['Amazon', 'Apple', 'Microsoft', 'Tesla', 'Blizzard', 'Nvidia', 'Facebook',
               'Uber', 'Mastercard', 'AMD', 'Intel', 'Netflix']

    global news, stock, url, title, date, author, content
    stock = []
    url = []
    title = []
    date = []
    author = []
    content = []

    searchFormat = 'https://www.reuters.com/site-search/?query={}&offset={}&sort=newest&date=past_year'

    # NOTE(review): the positional driver path and find_elements_by_css_selector
    # are the legacy Selenium API; confirm the installed Selenium still has them.
    driver = webdriver.Chrome("chromedriver.exe")
    try:
        # Set up for loop to go through each stock
        for symbol in symbols:
            newsLinks = []

            # Set up for loop to go through each result page for each stock
            for page in range(1, 180):
                try:
                    # Offsets advance in steps of 10: 0, 10, 20, ...
                    driver.get(searchFormat.format(symbol, page*10-10))
                    driver.implicitly_wait(3)
                    time.sleep(1)
                except Exception:
                    continue

                # Read the links on this page, retrying up to three times on errors.
                # pageLinks stays None only if every attempt failed.
                pageLinks = None
                for attempt in range(3):
                    try:
                        elems = driver.find_elements_by_css_selector(".search-results__item__2oqiX [href]")
                        pageLinks = [elem.get_attribute('href') for elem in elems]
                        break
                    except Exception:
                        print ('webpage error, retrying again')

                if pageLinks is None:
                    # Page kept failing: skip it and try the next page
                    continue
                if not pageLinks:
                    # An empty result page means the search is exhausted
                    print('no elements, go to next stock')
                    break
                newsLinks.extend(pageLinks)

            # Delete duplicates (keeping first-seen order)
            newsLinks = list(dict.fromkeys(newsLinks))

            # Use concurrency to get the web elements for each scraped web link.
            # list() drains the iterator so an unexpected worker error is raised
            # here instead of silently leaving the result lists short.
            with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor:
                list(executor.map(linkScraper, newsLinks))
            stock.extend([symbol] * len(newsLinks))

            # Verify data collection completion
            print(str(symbol) + " stock length is : " + str(len(stock)))
            print(str(symbol) + " url length is : " + str(len(url)))
            print(str(symbol) + " title length is : " + str(len(title)))
            print(str(symbol) + " date length is : " + str(len(date)))
            print(str(symbol) + " author length is : " + str(len(author)))
            print(str(symbol) + " content length is : " + str(len(content)))
    finally:
        # Always release the browser process, even if scraping fails midway
        driver.quit()

    # Output dataframe
    news = pd.DataFrame({'Stock': stock, 'Date':date, 'Title': title, 'Author': author, 'Content':content, 'Url':url})

    return (news)



In [6]:
# Preview the first rows of the news DataFrame
news.head()

Unnamed: 0,Ticker,Stock,Date,Title,Author,Content,Url
0,AMD,AMD,02/07/2021 21:31:00,AMD directors dodge shareholder derivative sui...,Jody Godoy,Summary Related documents Shareholder showed n...,https://www.reuters.com/legal/litigation/amd-d...
1,AMD,AMD,27/04/2021 20:27:00,"AMD lifts revenue forecast, CEO says supply ch...",Reuters,April 27 (Reuters) - Advanced Micro Devices In...,https://www.reuters.com/technology/amd-lifts-a...
2,AMD,AMD,25/05/2021 13:00:00,Oracle launches Arm-based cloud computing serv...,Stephen Nellis,May 25 (Reuters) - Oracle Corp (ORCL.N) on T...,https://www.reuters.com/technology/oracle-laun...
3,AMD,AMD,29/11/2021 20:18:00,Wall Street regains some ground after virus pu...,Ambar Warrick,"Summary Companies (Updates prices, adds commen...",https://www.reuters.com/markets/europe/wall-st...
4,AMD,AMD,21/09/2021 06:02:00,Novartis buys gene therapy firm Arctos Medical...,Reuters,"ZURICH, Sept 21 (Reuters) - Swiss drugmaker No...",https://www.reuters.com/business/healthcare-ph...


### 3.3 Scrape Twitter

In [None]:
import subprocess

tickers = ['AMZN', 'AAPL', 'MSFT', 'TSLA', 'ATVI', 'NVDA', 'FB', 'UBER', 'MA', 'AMD', 'INTC', 'NFLX']
for i in tickers:
    # Define search term as #ticker and $ticker
    search_term = '#{ticker} ${ticker}'.format(ticker = i)

    # Set date range to be Sep 2021 to Feb 2022
    read_path = '{ticker}_tweets.json'.format(ticker = i)
    snscrape_args = ['snscrape', '--jsonl', '--since', '2021-09-01', 'twitter-search',
                     '{x} until:2022-03-01'.format(x = search_term)]

    # Run snscrape without a shell: inside a double-quoted shell string a POSIX
    # shell would expand "$AMZN" as a variable and drop the cashtag from the query.
    # stdout is written straight into the JSON-lines file.
    with open(read_path, 'wb') as json_file:
        scrape = subprocess.run(snscrape_args, stdout=json_file)
    if scrape.returncode != 0:
        print('snscrape exited with status {code} for {ticker}'.format(code = scrape.returncode, ticker = i))

    # Read the json file generated by snscrape and create a pandas dataframe
    raw_tweets = pd.read_json(read_path, lines=True)

    # Filter English tweets
    raw_tweets = raw_tweets[raw_tweets.lang == 'en']
    raw_tweets = raw_tweets.rename(columns={'id': 'tweet_id'})

    # Filter useful columns
    raw_tweets = raw_tweets[['date','tweet_id','content','user','replyCount','retweetCount','likeCount','quoteCount','hashtags']]

    # Expand the per-tweet user dictionary into its own columns
    df = pd.concat([raw_tweets.drop(['user'], axis=1), raw_tweets['user'].apply(pd.Series)], axis=1)
    df = df[['date','tweet_id','content','replyCount','retweetCount','likeCount','quoteCount', 'verified','followersCount', 'friendsCount', 'listedCount', 'mediaCount']]
    df['ticker'] = i

    # Remove duplicate tweets to avoid bot tweets
    df = df.drop_duplicates(subset=['content'], keep='first')

    # Output to csv
    df.to_csv('tweet_{ticker}.csv'.format(ticker = i), index=False)
    print('~~ Finished outputting: ' + 'tweet_{ticker}.csv'.format(ticker = i))


In [None]:
# Preview the first rows of the most recently built tweets DataFrame
df.head()

Unnamed: 0_level_0,tweet_id,content,replyCount,retweetCount,likeCount,quoteCount,verified,followersCount,friendsCount,listedCount,mediaCount,ticker
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2022-02-28 07:00:22+00:00,1498191328072675331,$NFLX Weekly. #NFLX 2 hammer candlesticks (one...,0,0,4,0,False,18590,494,520,8381,NFLX
2022-02-24 12:10:07+00:00,1496819727708135428,$NFLX whoops #NFLX,0,0,0,0,False,786,58,25,3007,NFLX
2022-02-23 15:15:28+00:00,1496503985951641605,$nflx #nflx + 48% gain already this AM #tradin...,0,0,1,0,False,20,37,0,39,NFLX
2022-02-23 00:01:18+00:00,1496273928502841344,#Netflix $NFLX #NFLX What a horrendous looking...,0,0,0,0,False,524,1054,6,1328,NFLX
2022-02-22 19:25:44+00:00,1496204581268721664,$NFLX - #NFLX chart on,0,0,0,0,False,1842,0,34,10933,NFLX


## 4. Data Pre-processing | Stock Data | News Data | Twitter Data

### 4.1 News Cleaning

In [None]:
# Reuters page furniture that is stripped before any other cleaning
_REUTERS_BOILERPLATE = (
    'Register now for FREE unlimited access to Reuters.com Register ',
    ' Our Standards:  The Thomson Reuters Trust Principles.',
)

# Applied in order, each match replaced by a single space:
# links, @mentions, punctuation/symbols, then runs of whitespace
_NEWS_SUBSTITUTIONS = (
    r'((www\.[^\s]+)|(https?://[^\s]+))',
    r'@[^\s]+',
    r'''[,.'"!+?\-=)(*><&/;:$%]''',
    r'[\s]+',
)


def _clean_news_text(text):
    """Return *text* without Reuters boilerplate, lower-cased and de-symbolised."""
    for phrase in _REUTERS_BOILERPLATE:
        text = text.replace(phrase, '')
    # Convert to lowercase
    text = text.lower()
    # Remove symbols
    for pattern in _NEWS_SUBSTITUTIONS:
        text = re.sub(pattern, ' ', text)
    return text


# Drop nan
news = news.dropna()
news['Content_Clean'] = news.Content.map(_clean_news_text)


# Insert ticker column for stocks.
# The mapping is spelled out by company name instead of being zipped against
# news.Stock.unique(): a positional zip silently shifts every later ticker
# if one company has no rows or the rows arrive in a different order.
stockDict = {'Amazon': 'AMZN', 'Apple': 'AAPL', 'Microsoft': 'MSFT', 'Tesla': 'TSLA',
             'Blizzard': 'ATVI', 'Nvidia': 'NVDA', 'Facebook': 'FB', 'Uber': 'UBER',
             'Mastercard': 'MA', 'AMD': 'AMD', 'Intel': 'INTC', 'Netflix': 'NFLX'}
ticker = list(stockDict.values())
unmappedStocks = sorted(set(news.Stock.unique()) - set(stockDict))
if unmappedStocks:
    # Surface unexpected names instead of guessing a ticker for them
    print('No ticker known for: ' + ', '.join(str(name) for name in unmappedStocks))
tickerList = list(news.Stock.map(stockDict))
news.insert(loc = 0, column = 'Ticker', value = tickerList)


# Convert date column to datetime, then store it as a dd/mm/YYYY HH:MM:SS string
news.Date = pd.to_datetime(news['Date'])
news.Date = news.Date.dt.strftime('%d/%m/%Y %H:%M:%S')


# Tokenize news
def newsToken(i):
    """Lazily tokenize every document in *i*.

    Args:
        i: iterable of documents (e.g. the ``Content_Clean`` column). It used
           to be ignored in favour of the global ``news.Content_Clean``.

    Yields:
        One list of tokens per document, as produced by
        ``gensim.utils.simple_preprocess(..., deacc=True)``.
    """
    for doc in i:
        yield gensim.utils.simple_preprocess(str(doc), deacc=True)
 
    
# Remove stopwords
def newsStopword(content):
    """Drop stop words from every document in *content*.

    Args:
        content: iterable of documents. Each document is passed through
                 ``str()`` and re-tokenized with ``simple_preprocess``, so a
                 list of tokens is accepted as well as a plain string.

    Returns:
        A list with one list of surviving tokens per document.
    """
    # Customize stop words: NLTK's English list plus Reuters/date/place noise.
    # NOTE(review): 'new york' contains a space and cannot equal any single
    # token, so it never filters anything - confirm whether 'new'/'york' were meant.
    stopWords = set(stopwords.words('english'))
    stopWords.update(['reuters', 'january', 'february', 'march', 'april', 'may', 'june',
                      'july', 'august', 'september', 'october', 'november', 'december',
                      'london', 'summary', 'new york','bengaluru', 'america'])
    # Direction/negation words carry sentiment, so they are kept
    keepWords = ['up','down','against', 'above','below','off','over','further','no','not','only']
    stopWords -= set(keepWords)
    # A set makes each membership test O(1) instead of scanning a list per token
    return [[i for i in simple_preprocess(str(doc)) if i not in stopWords]
            for doc in content]

    
# Lemmatize tokens
def newsLemmatization(content, allowed_postags=['NOUN', 'PROPN', 'VERB', 'ADP', 'ADJ']):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Args:
        content: iterable of token lists, one per news article.
        allowed_postags: spaCy coarse POS tags whose tokens are kept.

    Returns:
        A list with one list of lemmas per article.
    """
    # Use spaCy library and load English package
    pipeline = spacy.load("en_core_web_sm")
    # Each article's tokens are re-joined into one string so spaCy can tag them
    # in context; the lemma of every token with an allowed tag is kept.
    return [
        [word.lemma_ for word in pipeline(" ".join(tokens)) if word.pos_ in allowed_postags]
        for tokens in content
    ]


# Execute step by step: tokenize -> lemmatize -> remove stop words.
# Each intermediate result is kept in its own variable (matrix1..matrix3).
matrix1 = list(newsToken(news.Content_Clean))  
matrix2 = newsLemmatization(matrix1)
matrix3 = newsStopword(matrix2) 

# Insert the final token lists as column 'Content_Clean_Token' at position 7
news.insert(loc = 7 , column = 'Content_Clean_Token', value = matrix3)

In [None]:
# Preview the news DataFrame after cleaning and tokenization
news.head()

Unnamed: 0,Ticker,Stock,Date,Title,Author,Content,Content_Clean,Content_Clean_Token,Url
0,AMD,AMD,02/07/2021 21:31:00,AMD directors dodge shareholder derivative sui...,Jody Godoy,Summary Related documents Shareholder showed n...,related document shareholder show basis skip d...,['related' 'document' 'shareholder' 'show' 'ba...,https://www.reuters.com/legal/litigation/amd-d...
1,AMD,AMD,27/04/2021 20:27:00,"AMD lifts revenue forecast, CEO says supply ch...",Reuters,April 27 (Reuters) - Advanced Micro Devices In...,advance micro devices inc amd raise annual rev...,['advance' 'micro' 'devices' 'inc' 'amd' 'rais...,https://www.reuters.com/technology/amd-lifts-a...
2,AMD,AMD,25/05/2021 13:00:00,Oracle launches Arm-based cloud computing serv...,Stephen Nellis,May 25 (Reuters) - Oracle Corp (ORCL.N) on T...,oracle corp orcl tuesday launch cloud computin...,['oracle' 'corp' 'orcl' 'tuesday' 'launch' 'cl...,https://www.reuters.com/technology/oracle-laun...
3,AMD,AMD,29/11/2021 20:18:00,Wall Street regains some ground after virus pu...,Ambar Warrick,"Summary Companies (Updates prices, adds commen...",company update price add commentary change byl...,['company' 'update' 'price' 'add' 'commentary'...,https://www.reuters.com/markets/europe/wall-st...
4,AMD,AMD,21/09/2021 06:02:00,Novartis buys gene therapy firm Arctos Medical...,Reuters,"ZURICH, Sept 21 (Reuters) - Swiss drugmaker No...",zurich sept swiss drugmaker novartis novn say ...,['zurich' 'sept' 'swiss' 'drugmaker' 'novartis...,https://www.reuters.com/business/healthcare-ph...


### 4.2 Tweets Cleaning

In [None]:
import glob

# Merge all tweet csv files in the working directory.
# Only 'tweet_<ticker>.csv' files (the name used when the tweets were saved) are
# read; a bare '*.csv' would also pull in the 'Price_*.csv' stock files.
tweetFiles = sorted(glob.glob("tweet_*.csv"))
if not tweetFiles:
    raise FileNotFoundError("no tweet_*.csv files found in the working directory")

# Concatenate once instead of growing the frame file by file
df = pd.concat([pd.read_csv(file) for file in tweetFiles])

# Convert date column to datetime, then store it as a dd/mm/YYYY HH:MM:SS string
df.date = pd.to_datetime(df['date'])
df.date = df.date.dt.strftime('%d/%m/%Y %H:%M:%S')


# Patterns are compiled once; they are applied to every tweet
_TWEET_URL_RE = re.compile(r'((www\.[^\s]+)|(https?://[^\s]+))')
_TWEET_MENTION_RE = re.compile(r'@[^\s]+')
_TWEET_SPACE_RE = re.compile(r'[\s]+')
_TWEET_TAG_RE = re.compile(r"(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)|(\$[A-Za-z0-9]+)")


def _1_TextprocessTweet(tweet):
    """Normalize the raw text of one tweet.

    Lower-cases the text, removes links, @mentions, #hashtags and $cashtags,
    replaces emoji with their textual names and collapses whitespace.

    Args:
        tweet: raw tweet text (str).

    Returns:
        The cleaned text (str).
    """
    # Imported here because the notebook's import cell does not import emoji
    import emoji

    # Convert to lowercase
    tweet = tweet.lower()
    # Remove www.* or https?://*
    tweet = _TWEET_URL_RE.sub(' ', tweet)
    # Remove @username
    tweet = _TWEET_MENTION_RE.sub(' ', tweet)
    # Collapse whitespace runs (this also turns newlines into spaces)
    tweet = _TWEET_SPACE_RE.sub(' ', tweet)

    # Remove all username mentions, hashtags and ticker tags.
    # NOTE(review): the cashtag pattern also matches the leading digits of
    # dollar amounts ("$165.12" -> ".12") - confirm whether that is intended.
    tweet = ' '.join(_TWEET_TAG_RE.sub(" ", tweet).split())

    # Replace emoji with text, then turn the ':name_with_underscores:' markup into words
    tweet = emoji.demojize(tweet)
    tweet = tweet.replace(":"," ")
    tweet = ' '.join(tweet.split())
    tweet = tweet.replace('_'," ")
    return tweet


import functools

# Matches any single ASCII punctuation character
_TWEET_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))


@functools.lru_cache(maxsize=1)
def _tweet_stopwords():
    """Build the tweet stop-word set once and reuse it for every tweet."""
    # Direction/negation words carry sentiment, so they are kept
    keepWords = ['up','down','against', 'above','below','off','over','further','no','not','only']
    stopWords = set(stopwords.words('english')) - set(keepWords)
    # The ticker symbols themselves say nothing about sentiment
    stopWords.update(['amzn', 'aapl', 'msft', 'tsla', 'atvi', 'nvda', 'fb', 'uber', 'ma', 'amd', 'intc', 'nflx'])
    return frozenset(stopWords)


# Tokenize
def _2_Tokenize_Stopword(tweet):
    """Tokenize one cleaned tweet and drop stop words.

    Args:
        tweet: cleaned tweet text (str).

    Returns:
        List of purely alphabetic tokens that are not stop words.
    """
    # Remove punctuation (apostrophes are deleted so contractions stay one word)
    tweet = tweet.replace('\'','')
    tweet = _TWEET_PUNCT_RE.sub(' ', tweet)

    # Perform word_tokenize, keeping alphabetic tokens only
    tokens = [w for w in word_tokenize(tweet) if w.isalpha()]

    # Customize and remove stop words
    stopWords = _tweet_stopwords()
    tweet = [t for t in tokens if t not in stopWords]
    return tweet


import threading

# One spaCy pipeline per worker thread: loading the model for every tweet is
# far more expensive than the lemmatization itself, and a per-thread copy
# avoids sharing one pipeline object between concurrent workers.
_LEMMATIZER_STATE = threading.local()


# Lemmatize tokens
def _3_Lemmatization(tweet, allowed_postags=('NOUN', 'PROPN', 'VERB', 'ADP', 'ADJ')):
    """Lemmatize the tokens of one tweet, keeping only selected parts of speech.

    Args:
        tweet: list of tokens for a single tweet.
        allowed_postags: spaCy coarse POS tags whose tokens are kept.

    Returns:
        List of lemmas for the tokens whose POS tag is allowed.
    """
    nlp = getattr(_LEMMATIZER_STATE, 'nlp', None)
    if nlp is None:
        nlp = _LEMMATIZER_STATE.nlp = spacy.load("en_core_web_sm")
    # Tokens are re-joined so spaCy can tag them in context
    doc = nlp(" ".join(tweet))
    tweet = [token.lemma_ for token in doc if token.pos_ in allowed_postags]
    return tweet


# Populate various processed columns:
# raw text -> cleaned text -> token list (stop words removed)
df['content_clean'] = df['content'].apply(_1_TextprocessTweet)
df['content_tokenized'] = df['content_clean'].apply(_2_Tokenize_Stopword)


# Run the lemmatization on a pool of up to 30 threads.
# executor.map yields results in input order, so the list lines up with df's rows.
# NOTE(review): threads only help if the work releases the GIL - confirm the speed-up.
with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor: 
    df['content_lemmatized'] = list(executor.map(_3_Lemmatization, df['content_tokenized']))

In [27]:
# Preview the tweets DataFrame after cleaning, tokenization and lemmatization
df.head()

Unnamed: 0,date,tweet_id,content,replyCount,retweetCount,likeCount,quoteCount,verified,followersCount,friendsCount,listedCount,mediaCount,ticker,content_clean,content_tokenized,content_lemmatized
0,28/02/2022 22:39:16,1498427612665987078,$SPY #SPY #QQQ $QQQ $UVXY #UVXY $AMC #AMC $AAP...,0,1,2,0,False,279,1026,4,833,AAPL,final liquidity pump underway. this is it. the...,"['final', 'liquidity', 'pump', 'underway', 'su...","['final', 'liquidity', 'pump', 'underway', 'su..."
1,28/02/2022 21:42:08,1498413233698443272,$AAPL held Premarket high for a bit in morning...,0,0,0,0,False,60,258,4,32,AAPL,held premarket high for a bit in morning befor...,"['held', 'premarket', 'high', 'bit', 'morning'...","['hold', 'premarket', 'high', 'bit', 'morning'..."
2,28/02/2022 21:30:00,1498410179972399105,$AAPL closed today at $165.12. If you bought 1...,0,0,0,0,False,50,11,0,1,AAPL,closed today at .12. if you bought 1 share of ...,"['closed', 'today', 'bought', 'share', 'closin...","['close', 'today', 'buy', 'share', 'closing', ..."
3,28/02/2022 10:28:15,1498243644431732743,$SPY #SPY #QQQ $QQQ $AMC #AMC $AAPL #AAPL $GME...,0,1,2,0,False,279,1026,4,833,AAPL,levels still valid and all levels will be rete...,"['levels', 'still', 'valid', 'levels', 'retest...","['level', 'valid', 'level', 'reteste', 'up', '..."
4,28/02/2022 04:57:58,1498160526311915520,$AAPL Weekly. #AAPL formed a hammer candlestic...,0,0,12,0,False,18591,494,520,8381,AAPL,"weekly. formed a hammer candlestick last week,...","['weekly', 'formed', 'hammer', 'candlestick', ...","['weekly', 'form', 'hammer', 'candlestick', 'l..."


## 5. Sentiment Analysis | News Data | Twitter Data

### 5.1 Library Flair (Using sentences)

In [None]:
# Tokenize by sentence since Flair is sentence-based
# Load Flair's 'en-sentiment' text classifier once; flair() below reads this global
classifier = TextClassifier.load('en-sentiment')
# Drop rows with any missing value before scoring
news = news.dropna()

# Patterns are compiled once; raw strings avoid invalid-escape warnings
_FLAIR_URL_RE = re.compile(r'((www\.[^\s]+)|(https?://[^\s]+))')
_FLAIR_MENTION_RE = re.compile(r'@[^\s]+')
_FLAIR_SYMBOL_RE = re.compile(r'''[\'"!+?=)(*><&/;:$%\n]''')
_FLAIR_SPACE_RE = re.compile(r'[\s]+')


def flairClean(row):
    """Prepare one raw text for sentence-level sentiment scoring.

    Removes the Reuters registration/standards boilerplate, replaces links,
    @mentions and a fixed set of symbols with spaces, and collapses runs of
    whitespace. Case, commas, periods and hyphens are left untouched. The
    result is not stripped, so it may start or end with a single space.

    Args:
        row: raw text (str).

    Returns:
        The cleaned text (str).
    """
    # Clean some useless sentences first for news
    rows = row.replace('Register now for FREE unlimited access to Reuters.com Register ', '').replace(
        ' Our Standards:  The Thomson Reuters Trust Principles.', '')

    # Clean symbols
    rows = _FLAIR_URL_RE.sub(' ', rows)
    rows = _FLAIR_MENTION_RE.sub(' ', rows)
    rows = _FLAIR_SYMBOL_RE.sub(' ', rows)
    rows = _FLAIR_SPACE_RE.sub(' ', rows)
    return rows

# Split into sentences
def sentence(paragraph):
    """Return the sentences of *paragraph* as a list, split with segtok's ``split_single``."""
    return list(split_single(paragraph))

# Clean empty string in the list, in case the sentence is empty
def cleanEmpty(sentence):
    """Return a list of the truthy items of *sentence*, in their original order."""
    return list(filter(None, sentence))


# Generate sentiment score
def flair(sentence):
    """Score a list of sentences with the global Flair ``classifier``.

    Every sentence is classified on its own. The confidence of a POSITIVE
    label counts as a positive score; the confidence of any other label counts
    as a negative score.

    Args:
        sentence: iterable of sentence strings. Empty strings, and sentences
                  for which the classifier produces no label, are skipped.

    Returns:
        A number in [-1, 1]:
        * 0 if nothing was scored,
        * the mean positive confidence if every sentence was positive,
        * the mean (negated) confidence if every sentence was negative,
        * otherwise (pos + neg) / (pos + |neg|), the net share of positive
          confidence mass.
    """
    # Initialize variables
    poslen = 0
    neglen = 0
    pos = 0
    neg = 0

    # Set up for loop to obtain the sentiment scores of each sentence
    for i in sentence:
        if not i:
            continue
        text = Sentence(i)
        classifier.predict(text)

        # Guard against unlabeled sentences instead of failing on labels[0]
        if not text.labels:
            continue

        label = text.labels[0].to_dict()
        if label['value'] == 'POSITIVE':
            pos += label['confidence']
            poslen += 1
        else:
            neg += -(label['confidence'])
            neglen += 1

    # Standardize the sentiment scores between [1,-1]
    # Calculate the compound sentiment score
    if poslen == 0 and neglen == 0:
        score = 0
    elif neglen == 0:
        score = pos/poslen
    elif poslen == 0:
        score = neg/neglen
    else:
        score = (pos+neg)/(pos+abs(neg))

    return score



# Create new columns
# News: clean -> split into sentences -> drop empty sentences -> score.
# cleanEmpty is applied here as well so both sources go through the same steps.
news['Sentiment_Flair'] = news['Content'].apply(flairClean).apply(sentence).apply(cleanEmpty).apply(flair)

# Tweets: same pipeline on the cleaned text. If it fails, fall back to scoring
# the lemmatized tokens, but report why so the fallback is never silent.
try:
    df['Sentiment_Flair'] = df['content_clean'].apply(flairClean).apply(sentence).apply(cleanEmpty).apply(flair)
except Exception as exc:
    print('Sentence-level Flair scoring of tweets failed ({!r}); scoring lemmatized tokens instead'.format(exc))
    df['Sentiment_Flair'] = df['content_lemmatized'].apply(flair)

### 5.2 Library Afinn

In [None]:
import functools


@functools.lru_cache(maxsize=1)
def _afinn_scorer():
    """Create the English Afinn scorer once and reuse it for every call."""
    return Afinn(language = 'en')


# Set up Afinn function
def afinn(text):
    """Compute a compound Afinn sentiment score for a sequence of tokens.

    Args:
        text: iterable of strings; each item is scored on its own.

    Returns:
        0 if no item has a non-zero score; the mean positive (or negative)
        item score divided by 5 if all scored items share one sign; otherwise
        (pos + neg) / (pos + |neg|).
    """
    scorer = _afinn_scorer()

    # Initialize values
    poslen = 0
    neglen = 0
    pos = 0
    neg = 0

    # Accumulate positive and negative item scores separately.
    # Each item is scored once and the value reused.
    for i in text:
        value = scorer.score(i)
        if value > 0:
            pos += value
            poslen += 1
        elif value < 0:
            neg += value
            neglen += 1

    # Standardize the sentiment scores between [1,-1]
    # Calculate the compound sentiment score
    if poslen == 0 and neglen == 0:
        score = 0
    elif neglen == 0:
        score = (pos/poslen)/5
    elif poslen == 0:
        score = (neg/neglen)/5
    else:
        score = (pos+neg)/(pos+abs(neg))

    return score
    
# Create new columns.
# The scores are stored as 'Sentiment_Afinn', the column name the merging step
# reads; previously they were computed and then thrown away.
df['Sentiment_Afinn'] = df['content_lemmatized'].apply(afinn)
news['Sentiment_Afinn'] = news['Content_Clean_Token'].apply(afinn)

### 5.3 Library Vader

In [None]:
import functools


@functools.lru_cache(maxsize=1)
def _vader_analyzer():
    """Build the VADER analyzer with the custom finance lexicon once and reuse it."""
    # Edit lexicon to assign sentiment score for special terms and emojis
    new_words = {
    'fire': 4.0,
    'decreasing': -4.0,
    'increasing': 4.0,
    'decrease': -4.0,
    'increase': 4.0,
    'rocket':4.0,
    'up':4.0,
    'down':-4.0,
    'bull':4.0,
    'bear':-4.0
    }

    sia = SentimentIntensityAnalyzer()

    # Update lexicon
    sia.lexicon.update(new_words)
    return sia


# Set up Vader function
def vader(text):
    """Compute a compound VADER sentiment score for a sequence of tokens.

    Args:
        text: iterable of strings; each item is scored on its own using
              VADER's 'compound' polarity.

    Returns:
        0 if no item has a non-zero score; the mean positive (or negative)
        compound score if all scored items share one sign; otherwise
        (pos + neg) / (pos + |neg|).
    """
    sia = _vader_analyzer()

    # Initialize values
    poslen = 0
    neglen = 0
    pos = 0
    neg = 0

    # Accumulate positive and negative item scores separately.
    # Each item is scored once and the value reused.
    for i in text:
        compound = sia.polarity_scores(i)['compound']
        if compound > 0:
            pos += compound
            poslen += 1
        elif compound < 0:
            neg += compound
            neglen += 1

    # Standardize the sentiment scores between [1,-1]
    # Calculate the compound sentiment score
    if poslen == 0 and neglen == 0:
        score = 0
    elif neglen == 0:
        score = (pos/poslen)
    elif poslen == 0:
        score = (neg/neglen)
    else:
        score = (pos+neg)/(pos+abs(neg))

    return score

### 5.4 Library TextBlob

In [None]:
# Set up TextBlob function
def textblob(text):
    """Compute a compound TextBlob sentiment score for a sequence of tokens.

    Args:
        text: iterable of strings; each item is scored on its own using
              TextBlob's sentiment polarity.

    Returns:
        0 if no item has a non-zero polarity; the mean positive (or negative)
        polarity if all scored items share one sign; otherwise
        (pos + neg) / (pos + |neg|).
    """
    # Initialize values
    poslen = 0
    neglen = 0
    pos = 0
    neg = 0

    # Accumulate positive and negative item polarities separately.
    # Each item is analysed once and the value reused.
    for i in text:
        polarity = TextBlob(i).sentiment.polarity
        if polarity > 0:
            pos += polarity
            poslen += 1
        elif polarity < 0:
            neg += polarity
            neglen += 1

    # Standardize the sentiment scores between [1,-1]
    # Calculate the compound sentiment score
    if poslen == 0 and neglen == 0:
        score = 0
    elif neglen == 0:
        score = (pos/poslen)
    elif poslen == 0:
        score = (neg/neglen)
    else:
        score = (pos+neg)/(pos+abs(neg))

    return score

In [None]:
# Preview the news DataFrame including the sentiment score columns
news.head()

Unnamed: 0,Ticker,Stock,Date,Title,Author,Content,Content_Clean,Content_Clean_Token,Url,Sentiment_Flair,Sentiment_Afinn,Sentiment_Vader,Sentiment_TextBlob
0,AMD,AMD,02/07/2021 21:31:00,AMD directors dodge shareholder derivative sui...,Jody Godoy,Summary Related documents Shareholder showed n...,related document shareholder show basis skip d...,['related' 'document' 'shareholder' 'show' 'ba...,https://www.reuters.com/legal/litigation/amd-d...,-0.60043,-0.78022,-0.593653,-0.656827
1,AMD,AMD,27/04/2021 20:27:00,"AMD lifts revenue forecast, CEO says supply ch...",Reuters,April 27 (Reuters) - Advanced Micro Devices In...,advance micro devices inc amd raise annual rev...,['advance' 'micro' 'devices' 'inc' 'amd' 'rais...,https://www.reuters.com/technology/amd-lifts-a...,0.370084,0.705882,0.780044,0.589689
2,AMD,AMD,25/05/2021 13:00:00,Oracle launches Arm-based cloud computing serv...,Stephen Nellis,May 25 (Reuters) - Oracle Corp (ORCL.N) on T...,oracle corp orcl tuesday launch cloud computin...,['oracle' 'corp' 'orcl' 'tuesday' 'launch' 'cl...,https://www.reuters.com/technology/oracle-laun...,0.436454,0.882353,0.281721,-0.524692
3,AMD,AMD,29/11/2021 20:18:00,Wall Street regains some ground after virus pu...,Ambar Warrick,"Summary Companies (Updates prices, adds commen...",company update price add commentary change byl...,['company' 'update' 'price' 'add' 'commentary'...,https://www.reuters.com/markets/europe/wall-st...,-0.48253,0.025641,0.200357,0.235957
4,AMD,AMD,21/09/2021 06:02:00,Novartis buys gene therapy firm Arctos Medical...,Reuters,"ZURICH, Sept 21 (Reuters) - Swiss drugmaker No...",zurich sept swiss drugmaker novartis novn say ...,['zurich' 'sept' 'swiss' 'drugmaker' 'novartis...,https://www.reuters.com/business/healthcare-ph...,0.250985,0.047619,0.408526,-0.215686


In [29]:
# Preview the tweets DataFrame including the sentiment score columns
df.head()

Unnamed: 0,date,tweet_id,content,replyCount,retweetCount,likeCount,quoteCount,verified,followersCount,friendsCount,listedCount,mediaCount,ticker,content_clean,content_tokenized,content_lemmatized,Sentiment_Flair,Sentiment_Afinn,Sentiment_Vader,Sentiment_TextBlob
0,28/02/2022 22:39:16,1498427612665987078,$SPY #SPY #QQQ $QQQ $UVXY #UVXY $AMC #AMC $AAP...,0,1,2,0,False,279,1026,4,833,AAPL,final liquidity pump underway. this is it. the...,"['final', 'liquidity', 'pump', 'underway', 'su...","['final', 'liquidity', 'pump', 'underway', 'su...",0.99743,-0.6,-0.4404,0.0
1,28/02/2022 21:42:08,1498413233698443272,$AAPL held Premarket high for a bit in morning...,0,0,0,0,False,60,258,4,32,AAPL,held premarket high for a bit in morning befor...,"['held', 'premarket', 'high', 'bit', 'morning'...","['hold', 'premarket', 'high', 'bit', 'morning'...",0.900152,0.5,0.501733,0.38
2,28/02/2022 21:30:00,1498410179972399105,$AAPL closed today at $165.12. If you bought 1...,0,0,0,0,False,50,11,0,1,AAPL,closed today at .12. if you bought 1 share of ...,"['closed', 'today', 'bought', 'share', 'closin...","['close', 'today', 'buy', 'share', 'closing', ...",-0.986211,0.2,-0.151741,0.0
3,28/02/2022 10:28:15,1498243644431732743,$SPY #SPY #QQQ $QQQ $AMC #AMC $AAPL #AAPL $GME...,0,1,2,0,False,279,1026,4,833,AAPL,levels still valid and all levels will be rete...,"['levels', 'still', 'valid', 'levels', 'retest...","['level', 'valid', 'level', 'reteste', 'up', '...",0.247874,-0.2,-0.18795,0.0
4,28/02/2022 04:57:58,1498160526311915520,$AAPL Weekly. #AAPL formed a hammer candlestic...,0,0,12,0,False,18591,494,520,8381,AAPL,"weekly. formed a hammer candlestick last week,...","['weekly', 'formed', 'hammer', 'candlestick', ...","['weekly', 'form', 'hammer', 'candlestick', 'l...",-0.406989,0.0,-0.322088,-0.183333


## 6. Results

### 6.1 Merging Returns Data and Sentiment Scores

In [None]:
# Merge sentiment scores with stock returns: hourly files first, then daily files
import pandas as pd
from datetime import datetime

stocks = ['AMZN', 'AAPL', 'MSFT', 'TSLA', 'ATVI', 'NVDA', 'FB', 'UBER', 'MA', 'AMD', 'INTC', 'NFLX']

# Read Tweets DataFrame
tweets2 = pd.read_parquet('allTWEETS_Processed.parquet')
# Read News DataFrame
news2 = pd.read_parquet('News.parquet')

# Sentiment models, in the column order of the output files
_SENTIMENT_MODELS = ('Flair', 'Afinn', 'Vader', 'TextBlob')
_PRICE_COLUMNS = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']


def _aggregate_sentiment(frame, prefix, freq, date_col):
    """Average the sentiment scores of *frame* per time bucket.

    Args:
        frame: rows of one ticker with the four 'Sentiment_<model>' columns
               and a 'dd/mm/YYYY HH:MM:SS' timestamp string in *date_col*.
        prefix: 'T' (tweets) or 'N' (news); prepended to the score columns.
        freq: pandas offset alias of the bucket size ('H' or 'D').
        date_col: name of the timestamp column in *frame*.

    Returns:
        A new DataFrame with a 'date' column (bucket start) and one
        '<prefix>_Sentiment_<model>' column per model. Buckets without any
        rows take the values of the previous bucket.
    """
    renamed = {'Sentiment_' + model: prefix + '_Sentiment_' + model
               for model in _SENTIMENT_MODELS}
    # rename() returns a new frame, so the slice passed in is never modified
    out = frame.rename(columns={date_col: 'date', **renamed})
    out = out[['date', *renamed.values()]]
    out = out.assign(date=pd.to_datetime(out['date'], format="%d/%m/%Y %H:%M:%S"))
    out = out.groupby(pd.Grouper(freq=freq, key='date')).mean().reset_index()
    # Forward fill buckets with no tweets / news data
    return out.ffill(axis=0)


def _load_hourly_prices(ticker):
    """Read 'Hourly_Price_<ticker>.csv' and return hourly prices with returns.

    The 'Datetime' strings carry a UTC offset. They are converted to UTC
    before the offset is dropped; slicing the offset off the string would keep
    the exchange-local clock and merge prices with sentiment from other hours.
    NOTE(review): assumes the tweet/news timestamps are UTC - confirm for news.
    """
    stock = pd.read_csv('Hourly_Price_{}.csv'.format(ticker))
    stock['Datetime'] = pd.to_datetime(stock['Datetime'], utc=True).dt.tz_localize(None)
    stock = stock[['Datetime'] + _PRICE_COLUMNS].sort_values(by='Datetime')

    # Standardize stock hour datetime (floor every bar to the start of its hour)
    stock = stock.groupby(pd.Grouper(freq='H', key='Datetime')).mean()

    # Remove non-trading hours, which the hourly grid introduced as empty rows
    stock = stock.dropna(subset=['Adj Close']).reset_index()

    # Calculate hourly stock returns
    stock['Return'] = stock['Adj Close'].pct_change(1)

    stock = stock.rename(columns={'Datetime': 'date'})
    return stock[['date', 'Return'] + _PRICE_COLUMNS]


def _load_daily_prices(ticker):
    """Read 'Price_<ticker>.csv' and return daily prices with returns."""
    stock = pd.read_csv('Price_{}.csv'.format(ticker))

    # Parse the date and keep it at midnight so it matches the daily buckets
    stock['Date'] = pd.to_datetime(stock['Date'], format="%Y-%m-%d").dt.normalize()
    stock = stock[['Date'] + _PRICE_COLUMNS + ['Ticker']].sort_values(by='Date')

    # Calculate daily stock returns
    stock['Return'] = stock['Adj Close'].pct_change(1)

    stock = stock.rename(columns={'Date': 'date'})
    return stock[['date', 'Return'] + _PRICE_COLUMNS + ['Ticker']]


for _freq, _load_prices, _out_prefix in (('H', _load_hourly_prices, 'HourlyReturn'),
                                         ('D', _load_daily_prices, 'DailyReturn')):
    for i in stocks:
        # Average sentiment per bucket for this ticker
        tweets = _aggregate_sentiment(tweets2.loc[tweets2['ticker'] == i], 'T', _freq, 'date')
        news = _aggregate_sentiment(news2.loc[news2['Ticker'] == i], 'N', _freq, 'Date')

        # Prices and returns for this ticker
        stock = _load_prices(i)

        # Merge stock data with tweets and news sentiment (inner joins on the bucket)
        df = pd.merge(stock, tweets, on=['date'])
        df = pd.merge(df, news, on=['date'])

        df.to_parquet('{}_{}.parquet'.format(_out_prefix, i))

In [33]:
# Display dataframe for AAPL as sample:
# load the merged daily file written above and preview its first rows
aapl = pd.read_parquet('DailyReturn_AAPL.parquet')
aapl.head()

Unnamed: 0,date,Return,Open,High,Low,Close,Adj Close,Volume,Ticker,T_Sentiment_Flair,T_Sentiment_Afinn,T_Sentiment_Vader,T_Sentiment_TextBlob,N_Sentiment_Flair,N_Sentiment_Afinn,N_Sentiment_Vader,N_Sentiment_TextBlob
0,2021-09-01,0.004479,152.830002,154.979996,152.339996,152.509995,152.093964,80313700,AAPL,0.439025,0.075238,0.129183,0.261108,-0.451613,0.101139,0.166658,0.323046
1,2021-09-02,0.007475,153.869995,154.720001,152.399994,153.649994,153.23085,71115500,AAPL,0.389312,0.15,0.23985,0.012037,-0.375664,0.022325,0.152422,0.300089
2,2021-09-03,0.00423,153.759995,154.630005,153.089996,154.300003,153.879089,57808700,AAPL,0.321581,0.124074,0.056036,0.042972,-0.465355,-0.021327,0.042717,-0.135096
3,2021-09-07,0.015489,154.970001,157.259995,154.389999,156.690002,156.262573,82278300,AAPL,0.575571,0.002564,0.10747,-0.007832,-0.31051,0.341669,0.350883,0.272359
4,2021-09-08,-0.010084,156.979996,157.039993,153.979996,155.110001,154.686874,74420200,AAPL,0.504929,0.011111,0.053732,-0.115833,-0.263752,0.302215,0.449849,0.297079


### 6.2 Regression Analysis

In [None]:
# Set regression parameters
# Sentiment source prefixes used in the merged column names ('T_...' tweets, 'N_...' news)
data1 = ['T', 'N']
# Frequency prefixes of the merged files ('DailyReturn_*', 'HourlyReturn_*')
time1 = ['Daily', 'Hourly']
# Sentiment models; one regression is run per model
mod1 = ['Flair', 'TextBlob', 'Vader', 'Afinn']


# Set up Regression
def reg(data, time, mod):
    """Regress returns on a sentiment score and its lags; print one table per stock.

    For every selected ticker the function reads
    ``'{time}Return_{ticker}.parquet'``, builds lagged copies of the
    ``'{data}_Sentiment_{mod}'`` column (lags 1, 2, 3 and 6 rows), fits an OLS
    model ``Return ~ score + lags`` and prints the coefficients and p-values.

    Parameters
    ----------
    data : str
        Column prefix of the sentiment source (the callers pass 'T' or 'N').
    time : str
        File-name prefix selecting the data frequency (the callers pass
        'Daily' or 'Hourly').
    mod : str
        Sentiment model suffix, e.g. 'Flair', 'TextBlob', 'Vader', 'Afinn'.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Select stocks
    stocks1 = ['AMZN', 'AAPL', 'MSFT', 'TSLA', 'NVDA', 'FB', 'UBER', 'NFLX']

    score_col = '{}_Sentiment_{}'.format(data, mod)
    lags = (1, 2, 3, 6)

    for i in stocks1:
        frame = pd.read_parquet('{}Return_{}.parquet'.format(time, i))

        # Work on an independent copy: adding columns to a column-slice of
        # `frame` is the pattern that triggers pandas' SettingWithCopyWarning.
        test = frame[['Return', score_col]].copy()

        # Get the lagged sentiment scores (shift by whole rows, not calendar time)
        for lag in lags:
            test['{}_Lag{}'.format(score_col, lag)] = test[score_col].shift(lag)

        # Every column except the dependent variable 'Return' (column 0)
        # becomes a regressor.
        all_columns = "+".join(test.columns[1:])
        my_formula = "Return~" + all_columns

        # Fit once and read both coefficients and p-values from the same result
        fitted = smf.ols(formula=my_formula, data=test).fit()
        table = pd.concat({'Coef': fitted.params, 'P-value': fitted.pvalues}, axis=1)

        # Create a table to present the results.
        # 'Negative/No' covers both significant negative coefficients and
        # coefficients that are not significant at the 5% level.
        table['Significance'] = np.where(table['P-value'] <= 0.05, 'Yes', 'No')
        table['Correlation'] = np.where(
            (table['Coef'] > 0) & (table['Significance'] == 'Yes'),
            'Postive',
            'Negative/No',
        )

        # Output the table
        print(i)
        print(table)

In [None]:
# Regression results, printed in this order:
#   news ('N')   - Hourly, then Daily
#   tweets ('T') - Hourly, then Daily
# with every sentiment model in mod1 run for each combination.
for source in ('N', 'T'):
    for frequency in ('Hourly', 'Daily'):
        for model in mod1:
            reg(source, frequency, model)

## 7. Trading Strategies

### 7.1 Strategy 1: 0.2 buy-sell


In [None]:
# Read DataFrame (sample: AMZN)
# The trading-strategy cells in this section all operate on this one frame.
df = pd.read_parquet('DailyReturn_AMZN.parquet')

# Convert columns to plain Python lists so the strategy functions can index
# them by position (row i and row i+1).
df_close = df['Close'].tolist()
# NOTE(review): df_adj_close and df_ticker are not referenced by the strategy
# code in this section; only df_close is used as the price series.
df_adj_close = df['Adj Close'].tolist()
df_ticker = df['Ticker'].tolist()
# Tweet-based sentiment scores, one list per sentiment model
T_Sentiment_Flair = df['T_Sentiment_Flair'].tolist()
T_Sentiment_Afinn = df['T_Sentiment_Afinn'].tolist()
T_Sentiment_Vader = df['T_Sentiment_Vader'].tolist()
T_Sentiment_TextBlob = df['T_Sentiment_TextBlob'].tolist()


def change_buy_sell(scores=None, prices=None, threshold=0.2, initial_balance=1000000):
    """Back-test the "sentiment change" strategy and return its ROI in percent.

    Walks through consecutive observations. When the sentiment score rises by
    at least ``threshold`` the whole account balance is invested at the price
    of the later observation; when it falls by at least ``threshold`` the open
    position (if any) is sold at the price of the later observation and the
    profit or loss is added to the balance.

    Parameters
    ----------
    scores : sequence of float, optional
        Sentiment scores. Defaults to the module-level ``sscore`` so existing
        zero-argument calls keep working.
    prices : sequence of float, optional
        Prices aligned with ``scores``. Defaults to the module-level
        ``stock_price``.
    threshold : float, optional
        Minimum absolute score change that triggers a trade (default 0.2).
    initial_balance : float, optional
        Starting account balance, also the ROI denominator (default 1,000,000).

    Returns
    -------
    float
        Sum of the realised profits/losses divided by ``initial_balance``,
        times 100. Returns 0 when no round trip was completed.
    """
    if scores is None:
        scores = sscore
    if prices is None:
        prices = stock_price

    # Account balance; grows/shrinks with every closed trade (compounding)
    ac_balance = initial_balance
    # Number of shares currently held
    amount = 0
    # Entry price of the open position; 0 means "no open position"
    buyprice = 0
    # Market value of the position at entry
    securitiesMV = 0
    # Realised profit/loss of every closed trade
    Return = []

    for i in range(len(scores) - 1):
        change = scores[i + 1] - scores[i]

        if change >= threshold:
            # Buy action: invest the full balance at the later observation's price.
            # NOTE(review): a buy signal while a position is already open
            # re-bases the entry to the new price, so the price move since the
            # earlier entry is not recorded. Kept as in the original.
            buyprice = prices[i + 1]
            amount = ac_balance / buyprice
            securitiesMV = amount * buyprice
        elif change <= -threshold:
            # Sell action: only meaningful when a position is open
            if buyprice != 0:
                # Price difference of this round trip
                Ireturn = amount * prices[i + 1] - securitiesMV
                Return.append(Ireturn)
                # Close the position and be ready for the next trade
                buyprice = amount = 0
                # Update account balance
                ac_balance = Ireturn + ac_balance

    # NOTE(review): a position still open at the end of the series is not
    # valued, so only completed round trips count towards the ROI.
    # Calculate the ROI (%)
    return (sum(Return) / initial_balance) * 100

# Find out the best among the sentiment analysis models and the avg. return in %.
# Each model's back-test is run once and its result reused for printing,
# averaging and the winner comparison.
winner = None
winning_score = -1000000000000
change_buy_sell_avg = 0

# (printed label, model name, tweet sentiment scores)
strategy1_models = [
    ('<<Flair>>   G/L', 'Flair', T_Sentiment_Flair),
    ('<<Afinn>>    G/L', 'Afinn', T_Sentiment_Afinn),
    ('<<Vader>>    G/L', 'Vader', T_Sentiment_Vader),
    ('<<TextBlob>>    G/L', 'TextBlob', T_Sentiment_TextBlob),
]

# change_buy_sell() reads its inputs from the globals sscore / stock_price
stock_price = df_close
for model_label, model_name, model_scores in strategy1_models:
    sscore = model_scores
    model_roi = change_buy_sell()
    print(model_label, model_roi, '%')
    change_buy_sell_avg += model_roi
    # '>=' keeps the original tie-break: on equal ROI the later model wins
    if model_roi >= winning_score:
        # Record the model that actually produced the best score
        winner = model_name
        winning_score = model_roi

print('   [[',winner,winning_score,'%',']]')
print('                                          Strategy[0.2 BS] avg',change_buy_sell_avg/len(strategy1_models),'%')

### 7.2 Strategy 2: Cross Trade

In [None]:
def cross_trade(scores=None, prices=None, fast=2, slow=4, start=21,
                initial_balance=1000000):
    """Back-test the moving-average "cross" strategy and return its ROI in percent.

    For each position ``i`` two moving averages of the sentiment score are
    computed over the observations *before* ``i`` (``i`` itself is excluded):
    a fast line over ``fast`` observations and a slow line over ``slow``
    observations. The same two averages are computed one observation earlier.
    A cross is detected when the sign of (fast - slow) differs between the two
    steps, or when either difference is zero. On a cross with the fast line
    at or above the slow line the whole balance is invested at
    ``prices[i + 1]``; on a cross with the fast line below the slow line the
    open position (if any) is sold at ``prices[i + 1]``.

    Parameters
    ----------
    scores : sequence of float, optional
        Sentiment scores. Defaults to the module-level ``sscore`` so existing
        zero-argument calls keep working.
    prices : sequence of float, optional
        Prices aligned with ``scores``. Defaults to the module-level
        ``stock_price``.
    fast, slow : int, optional
        Window lengths of the fast and slow moving averages (defaults 2 and 4).
    start : int, optional
        First position evaluated (default 21, as in the original code, even
        though the default windows only need 5 earlier observations).
    initial_balance : float, optional
        Starting account balance, also the ROI denominator (default 1,000,000).

    Returns
    -------
    float
        Sum of the realised profits/losses divided by ``initial_balance``,
        times 100. Returns 0 when no round trip was completed.
    """
    if scores is None:
        scores = sscore
    if prices is None:
        prices = stock_price

    # Number of shares currently held
    amount = 0
    # Account balance; updated after every closed trade (compounding)
    Ac_balance = initial_balance
    # Entry price of the open position; 0 means "no open position"
    buyprice = 0
    # Market value of the position at entry
    securitiesMV = 0
    # Realised profit/loss of every closed trade
    Return = []

    # The previous-step windows reach back (window + 1) observations; never
    # start earlier than that, otherwise the slice start would go negative
    # and silently wrap around to the end of the list.
    first = max(start, max(fast, slow) + 1)

    for i in range(first, len(scores) - 1):
        # Current moving averages (fl = fast line; sl = slow line)
        fl = sum(scores[i - fast:i]) / fast
        sl = sum(scores[i - slow:i]) / slow
        # Same averages one observation earlier, used to spot a cross
        # (cfl = fast line for comparison; csl = slow line for comparison)
        cfl = sum(scores[i - fast - 1:i - 1]) / fast
        csl = sum(scores[i - slow - 1:i - 1]) / slow

        # Cross: (fast - slow) changed sign between the two steps, or is zero
        crossed = (cfl - csl) * (fl - sl) <= 0

        if sl - fl <= 0 and crossed:
            # Buy action: fast line at/above slow line right after a cross.
            # NOTE(review): a buy signal while a position is already open
            # re-bases the entry to the new price. Kept as in the original.
            buyprice = prices[i + 1]
            amount = Ac_balance / buyprice
            securitiesMV = amount * buyprice
        elif sl - fl >= 0 and crossed:
            # Sell action: fast line below slow line right after a cross.
            # Avoid selling before buying.
            if buyprice != 0:
                # Calculate return (price difference) of this round trip
                Ireturn = amount * prices[i + 1] - securitiesMV
                Return.append(Ireturn)
                # Close the position and be ready for the next trade
                buyprice = amount = 0
                # Update account balance
                Ac_balance = Ireturn + Ac_balance

    # NOTE(review): a position still open at the end of the series is not
    # valued, so only completed round trips count towards the ROI.
    # Calculate Return ROI (%)
    return (sum(Return) / initial_balance) * 100

# Find out the best among the sentiment analysis models and the avg. return in %.
# Each model's back-test is run once and its result reused for printing,
# averaging and the winner comparison.

winner = None
winning_score = -1000000000000000
cross_avg = 0

# (model name, tweet sentiment scores)
strategy2_models = [
    ('Flair', T_Sentiment_Flair),
    ('Afinn', T_Sentiment_Afinn),
    ('Vader', T_Sentiment_Vader),
    ('TextBlob', T_Sentiment_TextBlob),
]

# cross_trade() reads its inputs from the globals sscore / stock_price
stock_price = df_close
for model_name, model_scores in strategy2_models:
    sscore = model_scores
    model_roi = cross_trade()
    print('<<{}>>   G/L:'.format(model_name), model_roi, '%')
    cross_avg += model_roi
    # '>=' keeps the original tie-break: on equal ROI the later model wins
    if model_roi >= winning_score:
        winner = model_name
        winning_score = model_roi

# Report the best model and the average return over the four models
print('   [[',winner,winning_score,'%',']]')
print('                                          Strategy[cross] avg',cross_avg/len(strategy2_models),'%')

### 7.3 Overall Average ROI

In [None]:
# Overall ROI: each strategy's summed ROI is first divided by 4 (one result per
# sentiment model), then the two per-strategy averages are averaged.
print('Master_AVG:',((change_buy_sell_avg/4)+(cross_avg/4))/2,'%')