In [1]:
# All modules that are required to import:
import numpy as np
import pandas as pd
import time
# For purpose of calculating execution time:
start = time.time()
import requests
import bs4
import json
import re

import nltk
nltk.download('punkt')
nltk.download('twitter_samples')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

from nltk.corpus import twitter_samples

pos_tweet = twitter_samples.tokenized('positive_tweets.json')
neg_tweet = twitter_samples.tokenized('negative_tweets.json')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yok018\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\yok018\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yok018\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\yok018\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# filter out the stop words
# borrowed list of stop words from https://github.com/kavgan/stop-words/blob/master/terrier-stop.txt 

filepath = open("terrier-stop.txt", "r")
temp = filepath.read().split("\n")
stop_words = { key : 1 for key in temp }

In [3]:
# Goal of this part: Read through all positive / negative tweets, normalize and remove unnecessary words from tweets, then create actual dictionary-like to use for our dataset

# Convert all complex part-of-speech to basic words
# List of part-of-speech is in this link: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
# WordNetLemmatizer has a function lemmatize where you can convert complex part of speech words into basic forms
# Things to consider:
#   Remove all unnecessary words from normalized_neg_tweets / normalized_pos_tweets
#   1. Remove mentions(starts with @)
#   2. Remove links (starts with https:// or http:// )
#   3. Remove punctuation (starts with ! or ?)
#   4. Remove Stop-Words (words that do have little to no meaning and does not affect the context of the sentence) to make our dataset more concise
# Note that we are keeping emoji (i.e. :) or :( . That is because these emojis do actually show sentiment of the text context)
# If words are DETERMINERS (DT), COORDINATING CONJUCTIONS (CC), PREPOSITIONS (IN), PERSONAL / POSSESSIVE PRONOUNS (PRP / PRP$), or WH-PRONOUNS (WP) WH-ADVERB(WRB), we remove it (consider as Stop words)
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
import string

normalizer = WordNetLemmatizer()
punctuation_and_stop_words = {'!': 1, '"': 1, '#': 1, '$': 1 ,'%': 1, '&': 1, "'": 1,'(': 1,')': 1,'*': 1,'+': 1,',': 1,'-': 1,'.': 1,':': 1,';': 1,'<': 1,'=': 1,'>': 1,'?': 1,'@': 1,'[': 1,']': 1,'^': 1,'_': 1,'`': 1,'{': 1,'|': 1,'}': 1,'~': 1,'https://': 1,'http://': 1}
stop_words_final = {**stop_words, **punctuation_and_stop_words}


def determiners(word):
    if word in stop_words_final:
        return False
    else:
        return True

def normalize(tweet_list):
    normalized_tweet = []
    for tweet in tweet_list:
        sentence = []
        for token, tag in pos_tag(tweet):
            # For Complex Noun words:
            if tag.startswith('NN'):
                new_tag = 'n'
            # For Complex Verb words
            elif tag.startswith('VB'):
                new_tag = 'v'
            # For stop-words
            elif tag.startswith('DT') or tag.startswith('CC') or tag.startswith('IN') or tag.startswith('PRP') or tag.startswith('PRP$') or tag.startswith('WP') or tag.startswith('WRB'):
                continue 
            # Every other words, convert them into adjective (pos = 'a')
            else:
                if determiners(token):
                    new_tag = 'a'
                else:
                    continue
            sentence.append(normalizer.lemmatize(token, new_tag))
        normalized_tweet.append(sentence)
    return normalized_tweet

normalized_pos_tweets = normalize(pos_tweet)
normalized_neg_tweets = normalize(neg_tweet)


In [4]:
# Now, store all positive / negative words into dictionary so it can be used as a guide for calculating sentiment for sentences

pos_words_dict = {}
neg_words_dict = {}

# Store all words into dictionary
for tweet in normalized_pos_tweets:
    for word in tweet:
        if word in pos_words_dict:
            temp = pos_words_dict[word.lower()]
            temp += 1
            pos_words_dict[word.lower()] = temp
        else:
            pos_words_dict[word.lower()] = 1

for tweet in normalized_neg_tweets:
    for word in tweet:
        if word in neg_words_dict:
            temp = neg_words_dict[word.lower()]
            temp += 1
            neg_words_dict[word.lower()] = temp
        else:
            neg_words_dict[word.lower()] = 1


In [5]:
# remove all emojis and leave only roman alphabets
pos_df = pd.DataFrame({'word': list(pos_words_dict.keys()), 'frequency': list(pos_words_dict.values())})
cleaned_pos_df = pos_df.loc[pos_df['word'].str.isalpha()]

neg_df = pd.DataFrame({'word': list(neg_words_dict.keys()), 'frequency': list(neg_words_dict.values())})
cleaned_neg_df = neg_df.loc[neg_df['word'].str.isalpha()]

In [6]:
# merge the two dataframes into one
merged = pd.merge(cleaned_pos_df, cleaned_neg_df, on='word', how='outer').fillna(0)
merged['frequency'] = merged['frequency_x'] - merged['frequency_y']
merged


Unnamed: 0,word,frequency_x,frequency_y,frequency
0,be,97.0,57.0,40.0
1,top,10.0,6.0,4.0
2,engage,7.0,0.0,7.0
3,member,16.0,6.0,10.0
4,community,2.0,1.0,1.0
...,...,...,...,...
9446,ahmad,0.0,1.0,-1.0
9447,maslan,0.0,1.0,-1.0
9448,hull,0.0,1.0,-1.0
9449,supporter,0.0,1.0,-1.0


In [7]:
# scale the frequencies of each word between 1 to 5 for positive words, -1 to -5 for negative words
# if there is a duplicate word in both negative and positive dataset, take the difference in frequencies
# and consider it as a positive word if the positive frequency is higher, and vice versa

pos_max = merged['frequency'].max()
neg_min = abs(merged['frequency'].min())

def scaler(freq):

    if freq > 0:
        return freq * (4 / pos_max) + 1
    elif freq < 0:
        return freq * (4 / neg_min) - 1
merged = merged.assign(**{'scale':merged['frequency'].apply(scaler)})
merged

Unnamed: 0,word,frequency_x,frequency_y,frequency,scale
0,be,97.0,57.0,40.0,3.133333
1,top,10.0,6.0,4.0,1.213333
2,engage,7.0,0.0,7.0,1.373333
3,member,16.0,6.0,10.0,1.533333
4,community,2.0,1.0,1.0,1.053333
...,...,...,...,...,...
9446,ahmad,0.0,1.0,-1.0,-1.012270
9447,maslan,0.0,1.0,-1.0,-1.012270
9448,hull,0.0,1.0,-1.0,-1.012270
9449,supporter,0.0,1.0,-1.0,-1.012270


In [8]:
merged = merged.assign(**{'is_null': merged['scale'].isnull().values})
merged.loc[merged['is_null'] == True]

Unnamed: 0,word,frequency_x,frequency_y,frequency,scale,is_null
6,hey,1.0,1.0,0.0,,True
7,james,1.0,1.0,0.0,,True
8,how,1.0,1.0,0.0,,True
13,centre,2.0,2.0,0.0,,True
17,many,1.0,1.0,0.0,,True
...,...,...,...,...,...,...
5884,lagos,1.0,1.0,0.0,,True
5886,kingdom,1.0,1.0,0.0,,True
5887,potato,1.0,1.0,0.0,,True
5888,hundred,1.0,1.0,0.0,,True


In [9]:
# decided to drop words with a total frequency of zero, since they were words that appeared the same number of times as both negative and positive words
merged = merged.dropna()
# Finished creating dataset for pos / neg words with scores (scale) included
merged

Unnamed: 0,word,frequency_x,frequency_y,frequency,scale,is_null
0,be,97.0,57.0,40.0,3.133333,False
1,top,10.0,6.0,4.0,1.213333,False
2,engage,7.0,0.0,7.0,1.373333,False
3,member,16.0,6.0,10.0,1.533333,False
4,community,2.0,1.0,1.0,1.053333,False
...,...,...,...,...,...,...
9446,ahmad,0.0,1.0,-1.0,-1.012270,False
9447,maslan,0.0,1.0,-1.0,-1.012270,False
9448,hull,0.0,1.0,-1.0,-1.012270,False
9449,supporter,0.0,1.0,-1.0,-1.012270,False


In [10]:
# Now, retrieve articles from NYT (using API provided from NYT)
response = requests.get("https://api.nytimes.com/svc/search/v2/articlesearch.json?q=apple&fq=news_desk:Business&api-key=fO0tDSRQQdU68GkuXbMjt1uA2FYImzVp").json()
docs = response['response']['docs']
url_list = []
for item in docs:
    url_list.append(item['web_url'])
article_list = []
url_list


['https://www.nytimes.com/2020/12/16/technology/facebook-takes-the-gloves-off-in-feud-with-apple.html',
 'https://www.nytimes.com/2020/12/13/business/media/apple-gawker-tim-cook.html',
 'https://www.nytimes.com/2020/12/23/business/dealbook/trump-stimulus-veto.html',
 'https://www.nytimes.com/2020/12/01/technology/amazon-apple-chips-intel-arm.html',
 'https://www.nytimes.com/2020/12/17/technology/google-antitrust-monopoly.html',
 'https://www.nytimes.com/2020/12/17/business/dealbook/tech-apple-facebook-fight.html',
 'https://www.nytimes.com/2020/12/15/technology/big-tech-regulation-europe.html',
 'https://www.nytimes.com/2020/12/14/technology/big-tech-lobbying-europe.html',
 'https://www.nytimes.com/2020/12/09/technology/personaltech/amazon-halo-review.html',
 'https://www.nytimes.com/2020/11/18/technology/apple-app-store-fee.html']

In [11]:
text = []
title = []
abstract = []
for url in url_list:
    time.sleep(0.1)
    headers = {'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36'}
    article = requests.get(url, headers = headers)
    soup = bs4.BeautifulSoup(article.content, 'html.parser')
    article_text_p = soup.find_all('p', attrs={'class': 'css-axufdj evys1bk0'})
    abstract_text_p = soup.find('p', attrs={'class': 'css-w6ymp8 e1wiw3jv0'})
    title_text_h1 = soup.find('h1', attrs={'data-test-id': 'headline'})
    temp = []
    title.append(title_text_h1.text)
    abstract.append(abstract_text_p.text)
    
    for item in article_text_p:
        temp.append(item.text)
    space = ' '
    article_text = space.join(temp)
    text.append(article_text)

In [12]:
# Word Tokenization to sentences 

tokenized_by_sentence = []
for num in range(len(text)):
    del_quo = re.sub(",”", " ", text[num])
    del_quo_2  = re.sub("”", " ", del_quo)
    del_quo_3 = re.sub("“", "", del_quo_2)
    text_token = re.split("(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!|;|”)\s", del_quo_3)
    text_token.insert(0, abstract[num])
    text_token.insert(0, title[num])
    tokenized_by_sentence.append(text_token)



In [13]:
# tokenize and lametize the article from new york times

def stop_word_filter(word):
    if (word in stop_words): 
        return False
    else: 
        return True

# For filtering out empty strings
stop_words[''] = 1

def tokenizer_myself(given_articles):
    tokenized_result = []
    for article_iter in given_articles: 
        temp = []
        for sentence in article_iter:
            tokenized_sentence = sentence.split(" ")
            tokenized_sentence = list(filter(stop_word_filter, tokenized_sentence))
            if len(tokenized_sentence) > 1:
                temp.append(tokenized_sentence)
        new_temp = normalize(temp)
        tokenized_result.append(new_temp)
    return tokenized_result

tokenized_by_sentence_new = tokenizer_myself(tokenized_by_sentence)

In [14]:
# calculate positivity or negativity of each sentence

articles_lst = []

hash_table = { key:1 for key in list(merged['word'])}

for article in tokenized_by_sentence_new:
    sentence_vals = []
    for sentence in article:
        val = 1.0
        for word in sentence:
            if word in hash_table:
                val = val * merged.loc[merged['word'] == word]['scale'].values[0]
        sentence_vals.append(val)
    articles_lst.append(sentence_vals)

In [15]:
# Calculate the overall percent for the article (50% for Title and subtitle, other 50% for content)
# Note that we are removing score 1 since those scores mean that our system did not find any pos / neg words from that sentence 

def filter_one(variable):
    one_ind = 1.0
    if variable == one_ind:
        return False
    else:
        return True

avg_score_article = []
for article in articles_lst:
    new_article = list(filter(filter_one, article))
    avg_score = (sum(new_article[0:2]) / 2) + (sum(new_article[2:]) / len(new_article[2:])) / 2
    avg_score_article.append(avg_score)

avg_score_article

[-0.776708063515334,
 -0.19211078597161274,
 -3.1607233883820003,
 1.9378043608143205,
 0.05862496784129545,
 -0.10573732801823434,
 2.5652842585009497,
 1.983412941237357,
 -1.2840119590627617,
 -2.171577842704367]

In [16]:
# For purpose of calculating execution time
end = time.time()

print("Total execution time: ", end - start)

Total execution time:  16.69343590736389
