In [17]:
# All modules that are required to import:
import numpy as np
import pandas as pd
import time

import requests
import bs4
import json
import re

import nltk
nltk.download('punkt')
nltk.download('twitter_samples')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

from nltk.corpus import twitter_samples

pos_tweet = twitter_samples.tokenized('positive_tweets.json')
neg_tweet = twitter_samples.tokenized('negative_tweets.json')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cheol\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\cheol\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cheol\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\cheol\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [18]:
# filter out the stop words
# borrowed list of stop words from https://github.com/kavgan/stop-words/blob/master/terrier-stop.txt 

filepath = open("terrier-stop.txt", "r")
temp = filepath.read().split("\n")
stop_words = { key : 1 for key in temp }

In [19]:
# Goal of this part: Read through all positive / negative tweets, normalize and remove unnecessary words from tweets, then create actual dictionary-like to use for our dataset

# Convert all complex part-of-speech to basic words
# List of part-of-speech is in this link: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
# WordNetLemmatizer has a function lemmatize where you can convert complex part of speech words into basic forms
# Things to consider:
#   Remove all unnecessary words from normalized_neg_tweets / normalized_pos_tweets
#   1. Remove mentions(starts with @)
#   2. Remove links (starts with https:// or http:// )
#   3. Remove punctuation (starts with ! or ?)
#   4. Remove Stop-Words (words that do have little to no meaning and does not affect the context of the sentence) to make our dataset more concise
# Note that we are keeping emoji (i.e. :) or :( . That is because these emojis do actually show sentiment of the text context)
# If words are DETERMINERS (DT), COORDINATING CONJUCTIONS (CC), PREPOSITIONS (IN), PERSONAL / POSSESSIVE PRONOUNS (PRP / PRP$), or WH-PRONOUNS (WP) WH-ADVERB(WRB), we remove it (consider as Stop words)
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
import string

normalizer = WordNetLemmatizer()
punctuation_and_stop_words = {'!': 1, '"': 1, '#': 1, '$': 1 ,'%': 1, '&': 1, "'": 1,'(': 1,')': 1,'*': 1,'+': 1,',': 1,'-': 1,'.': 1,':': 1,';': 1,'<': 1,'=': 1,'>': 1,'?': 1,'@': 1,'[': 1,']': 1,'^': 1,'_': 1,'`': 1,'{': 1,'|': 1,'}': 1,'~': 1,'https://': 1,'http://': 1}
stop_words_final = {**stop_words, **punctuation_and_stop_words}


def determiners(word):
    if word in stop_words_final:
        return False
    else:
        return True

def normalize(tweet_list):
    normalized_tweet = []
    for tweet in tweet_list:
        sentence = []
        for token, tag in pos_tag(tweet):
            # For Complex Noun words:
            if tag.startswith('NN'):
                new_tag = 'n'
            # For Complex Verb words
            elif tag.startswith('VB'):
                new_tag = 'v'
            # For stop-words
            elif tag.startswith('DT') or tag.startswith('CC') or tag.startswith('IN') or tag.startswith('PRP') or tag.startswith('PRP$') or tag.startswith('WP') or tag.startswith('WRB'):
                continue 
            # Every other words, convert them into adjective (pos = 'a')
            else:
                if determiners(token):
                    new_tag = 'a'
                else:
                    continue
            sentence.append(normalizer.lemmatize(token, new_tag))
        normalized_tweet.append(sentence)
    return normalized_tweet

normalized_pos_tweets = normalize(pos_tweet)
normalized_neg_tweets = normalize(neg_tweet)


In [20]:
# Now, store all positive / negative words into dictionary so it can be used as a guide for calculating sentiment for sentences

pos_words_dict = {}
neg_words_dict = {}

# Store all words into dictionary
for tweet in normalized_pos_tweets:
    for word in tweet:
        if word in pos_words_dict:
            temp = pos_words_dict[word.lower()]
            temp += 1
            pos_words_dict[word.lower()] = temp
        else:
            pos_words_dict[word.lower()] = 1

for tweet in normalized_neg_tweets:
    for word in tweet:
        if word in neg_words_dict:
            temp = neg_words_dict[word.lower()]
            temp += 1
            neg_words_dict[word.lower()] = temp
        else:
            neg_words_dict[word.lower()] = 1


In [21]:
# remove all emojis and leave only roman alphabets
pos_df = pd.DataFrame({'word': list(pos_words_dict.keys()), 'frequency': list(pos_words_dict.values())})
cleaned_pos_df = pos_df.loc[pos_df['word'].str.isalpha()]

neg_df = pd.DataFrame({'word': list(neg_words_dict.keys()), 'frequency': list(neg_words_dict.values())})
cleaned_neg_df = neg_df.loc[neg_df['word'].str.isalpha()]

In [22]:
# merge the two dataframes into one
merged = pd.merge(cleaned_pos_df, cleaned_neg_df, on='word', how='outer').fillna(0)
merged['frequency'] = merged['frequency_x'] - merged['frequency_y']
merged


Unnamed: 0,word,frequency_x,frequency_y,frequency
0,be,97.0,57.0,40.0
1,top,10.0,6.0,4.0
2,engage,7.0,0.0,7.0
3,member,16.0,6.0,10.0
4,community,2.0,1.0,1.0
...,...,...,...,...
9454,ahmad,0.0,1.0,-1.0
9455,maslan,0.0,1.0,-1.0
9456,hull,0.0,1.0,-1.0
9457,supporter,0.0,1.0,-1.0


In [23]:
# scale the frequencies of each word between 1 to 5 for positive words, -1 to -5 for negative words
# if there is a duplicate word in both negative and positive dataset, take the difference in frequencies
# and consider it as a positive word if the positive frequency is higher, and vice versa

pos_max = merged['frequency'].max()
neg_min = abs(merged['frequency'].min())

def scaler(freq):

    if freq > 0:
        return freq * (4 / pos_max) + 1
    elif freq < 0:
        return freq * (4 / neg_min) - 1
merged = merged.assign(**{'scale':merged['frequency'].apply(scaler)})
merged

Unnamed: 0,word,frequency_x,frequency_y,frequency,scale
0,be,97.0,57.0,40.0,3.133333
1,top,10.0,6.0,4.0,1.213333
2,engage,7.0,0.0,7.0,1.373333
3,member,16.0,6.0,10.0,1.533333
4,community,2.0,1.0,1.0,1.053333
...,...,...,...,...,...
9454,ahmad,0.0,1.0,-1.0,-1.012270
9455,maslan,0.0,1.0,-1.0,-1.012270
9456,hull,0.0,1.0,-1.0,-1.012270
9457,supporter,0.0,1.0,-1.0,-1.012270


In [24]:
merged = merged.assign(**{'is_null': merged['scale'].isnull().values})
merged.loc[merged['is_null'] == True]

Unnamed: 0,word,frequency_x,frequency_y,frequency,scale,is_null
6,hey,1.0,1.0,0.0,,True
7,james,1.0,1.0,0.0,,True
8,how,1.0,1.0,0.0,,True
13,centre,2.0,2.0,0.0,,True
17,many,1.0,1.0,0.0,,True
...,...,...,...,...,...,...
5888,lagos,1.0,1.0,0.0,,True
5890,kingdom,1.0,1.0,0.0,,True
5891,potato,1.0,1.0,0.0,,True
5892,hundred,1.0,1.0,0.0,,True


In [25]:
# decided to drop words with a total frequency of zero, since they were words that appeared the same number of times as both negative and positive words
merged = merged.dropna()
# Finished creating dataset for pos / neg words with scores (scale) included
merged

Unnamed: 0,word,frequency_x,frequency_y,frequency,scale,is_null
0,be,97.0,57.0,40.0,3.133333,False
1,top,10.0,6.0,4.0,1.213333,False
2,engage,7.0,0.0,7.0,1.373333,False
3,member,16.0,6.0,10.0,1.533333,False
4,community,2.0,1.0,1.0,1.053333,False
...,...,...,...,...,...,...
9454,ahmad,0.0,1.0,-1.0,-1.012270,False
9455,maslan,0.0,1.0,-1.0,-1.012270,False
9456,hull,0.0,1.0,-1.0,-1.012270,False
9457,supporter,0.0,1.0,-1.0,-1.012270,False


In [44]:
# For purpose of calculating execution time:
start = time.time()

# Now, retrieve articles from NYT (using API provided from NYT)
response = requests.get("https://api.nytimes.com/svc/search/v2/articlesearch.json?q=apple&fq=news_desk:Business&page=0&api-key=fO0tDSRQQdU68GkuXbMjt1uA2FYImzVp").json()
docs = response['response']['docs']
url_list = []
for item in docs:
    url_list.append(item['web_url'])
article_list = []
url_list


['https://www.nytimes.com/2020/12/16/technology/facebook-takes-the-gloves-off-in-feud-with-apple.html',
 'https://www.nytimes.com/2020/12/13/business/media/apple-gawker-tim-cook.html',
 'https://www.nytimes.com/2020/12/23/business/dealbook/trump-stimulus-veto.html',
 'https://www.nytimes.com/2020/12/01/technology/amazon-apple-chips-intel-arm.html',
 'https://www.nytimes.com/2020/12/17/technology/google-antitrust-monopoly.html',
 'https://www.nytimes.com/2020/12/17/business/dealbook/tech-apple-facebook-fight.html',
 'https://www.nytimes.com/2020/12/15/technology/big-tech-regulation-europe.html',
 'https://www.nytimes.com/2020/12/14/technology/big-tech-lobbying-europe.html',
 'https://www.nytimes.com/2020/12/09/technology/personaltech/amazon-halo-review.html',
 'https://www.nytimes.com/2020/11/18/technology/apple-app-store-fee.html']

In [1]:
# function to retrieve text from links and tokenize them into sentences.
def tokenize_sentence(url_list):
    text = []
    title = []
    abstract = []
    for url in url_list:
        time.sleep(0.1)
        headers = {'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36'}
        article = requests.get(url, headers = headers)
        soup = bs4.BeautifulSoup(article.content, 'html.parser')
        article_text_p = soup.find_all('p', attrs={'class': 'css-axufdj evys1bk0'})
        abstract_text_p = soup.find('p', attrs={'class': 'css-w6ymp8 e1wiw3jv0'})
        title_text_h1 = soup.find('h1', attrs={'data-test-id': 'headline'})
        temp = []
        title.append(title_text_h1.text)
        abstract.append(abstract_text_p.text)
    
        for item in article_text_p:
            temp.append(item.text)
        space = ' '
        article_text = space.join(temp)
        text.append(article_text)

    # Word Tokenization to sentences 

    tokenized_by_sentence = []
    for num in range(len(text)):
        del_quo = re.sub(",”", " ", text[num])
        del_quo_2  = re.sub("”", " ", del_quo)
        del_quo_3 = re.sub("“", "", del_quo_2)
        text_token = re.split("(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!|;|”)\s", del_quo_3)
        text_token.insert(0, abstract[num])
        text_token.insert(0, title[num])
        tokenized_by_sentence.append(text_token)
        
    return tokenized_by_sentence


In [29]:
# tokenize and lametize the article from new york times

def stop_word_filter(word):
    if (word in stop_words): 
        return False
    else: 
        return True

# For filtering out empty strings
stop_words[''] = 1

def tokenizer_myself(given_articles):
    tokenized_result = []
    for article_iter in given_articles: 
        temp = []
        for sentence in article_iter:
            lowered_sentence = sentence.lower()
            tokenized_sentence = lowered_sentence.split(" ")
            tokenized_sentence = list(filter(stop_word_filter, tokenized_sentence))
            if len(tokenized_sentence) > 1:
                temp.append(tokenized_sentence)
        new_temp = normalize(temp)
        tokenized_result.append(new_temp)
    return tokenized_result

tokenized_by_sentence_new = tokenizer_myself(tokenized_by_sentence)

In [1]:
# calculate positivity or negativity of each sentence
def sentence_calculator(tokenized_by_sentence_new):

    articles_lst = []

    hash_table = { key:1 for key in list(merged['word'])}

    for article in tokenized_by_sentence_new:
        sentence_vals = []
        for sentence in article:
            val = 1.0
            for word in sentence:
                if word in hash_table:
                    val = val * merged.loc[merged['word'] == word]['scale'].values[0]
            sentence_vals.append(val)
        articles_lst.append(sentence_vals)
    return articles_lst

In [3]:
# Calculate the overall percent for the article (50% for Title and subtitle, other 50% for content)
# Note that we are removing score 1 since those scores mean that our system did not find any pos / neg words from that sentence 

def filter_one(variable):
    one_ind = 1.0
    if variable == one_ind:
        return False
    else:
        return True
def calculate_vals(articles_lst):
    avg_score_article = []
    for article in articles_lst:
        new_article = list(filter(filter_one, article))
        avg_score = (sum(new_article[0:2]) / 2) + (sum(new_article[2:]) / len(new_article[2:])) / 2
        avg_score_article.append(avg_score)
    return avg_score_article

In [32]:
# For purpose of calculating execution time
end = time.time()

print("Total execution time: ", end - start)

Total execution time:  7.45346736907959


In [42]:
# Now to examine the correctness of our model, we will retrieve past stock related articles from the web and the daily stock prices of the past on that particular stock, and see how correctly our model can predict the future prices of stocks.
# stock price: https://financialmodelingprep.com/developer/docs/#Stock-Historical-Price : Historical Daily Prices

stock_endpoint = 'https://financialmodelingprep.com/api/v3/historical-price-full/'
response = requests.get(stock_endpoint + 'AAPL' + '?apikey=70407133ea11d7284c70bbca4eee2547').json()
type(response) == dict
stock_df = pd.DataFrame(response['historical'])

In [43]:
stock_df

Unnamed: 0,date,open,high,low,close,adjClose,volume,unadjustedVolume,change,changePercent,vwap,label,changeOverTime
0,2020-12-30,135.580002,135.990005,133.399994,133.720001,133.720001,92882124.0,92882124.0,-1.86000,-1.372,134.37000,"December 30, 20",-0.01372
1,2020-12-29,138.050003,138.789993,134.339996,134.869995,134.869995,120778200.0,120778200.0,-3.18001,-2.304,135.99999,"December 29, 20",-0.02304
2,2020-12-28,133.990005,137.339996,133.509995,136.690002,136.690002,124486200.0,124486200.0,2.70000,2.015,135.84666,"December 28, 20",0.02015
3,2020-12-24,131.320007,133.460007,131.100006,131.970001,131.970001,54930100.0,54930100.0,0.64999,0.495,132.17667,"December 24, 20",0.00495
4,2020-12-23,132.160004,132.429993,130.779999,130.960007,130.960007,88223700.0,88223700.0,-1.20000,-0.908,131.39000,"December 23, 20",-0.00908
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1254,2016-01-07,24.670000,25.032499,24.107500,24.112499,22.158121,324377600.0,324377600.0,-0.55750,-2.260,24.41750,"January 07, 16",-0.02260
1255,2016-01-06,25.139999,25.592501,24.967501,25.174999,23.134508,273829600.0,273829600.0,0.03500,0.139,25.24500,"January 06, 16",0.00139
1256,2016-01-05,26.437500,26.462500,25.602501,25.677500,23.596279,223164000.0,223164000.0,-0.76000,-2.875,25.91417,"January 05, 16",-0.02875
1257,2016-01-04,25.652500,26.342501,25.500000,26.337500,24.202784,270597600.0,270597600.0,0.68500,2.670,26.06000,"January 04, 16",0.02670
