In [1]:
import pandas
import os
import pandas as pd
import nltk
import json
from datetime import datetime, timezone

In [2]:
from nltk.corpus import stopwords, words
from nltk.stem import LancasterStemmer, PorterStemmer, SnowballStemmer

In [3]:
NEWS_DIRECTORY = '/media/adv/Data/PROJECTS/CSE573-SWM/News'

In [4]:
# Download libraries
nltk.download('punkt')
nltk.download('words')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/adv/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /home/adv/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /home/adv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
def get_news_articles(path):
    news_text = []
    news_publish_time = []
    news_source = []
    domains_to_select = ['ae', 'au', 'bb', 'biz', 'ca', 'in', 'io', 'net', 'uk', 'com']
    
    
    with os.scandir(path) as news_directories:
        for directory in news_directories:
            with os.scandir(os.path.join(path, directory.name)) as folder:
                for article in folder:
                    with open(os.path.join(path, directory.name, article.name), encoding='utf-8') as f:
                        news_data = json.load(f)
                    if 'site' in news_data['thread'] and news_data['thread']['site']:
                        source = news_data['thread']['site'] 
                        domain = source.split('.')[-1]
                        # Skip news from domain not in the list
                        if domain not in domains_to_select:
                            continue
                        news_source.append(source)
                    else:
                        news_source.append(None)
                    if 'published' in news_data and news_data['published']:
                        news_publish_time.append(news_data['published'])
                    else:
                        news_publish_time.append(None)
                    if 'text' in news_data and news_data['text']:
                        news_text.append(news_data['text'])
                    else:
                        news_text.append(None)

    df = pd.DataFrame({
        'timestamp': news_publish_time,
        'text': news_text,
        'source': news_source,
    })

    return df

news_df = get_news_articles(NEWS_DIRECTORY)

In [6]:
news_df

Unnamed: 0,timestamp,text,source
0,2018-06-04T03:00:00.000+03:00,"At its annual WWDC keynote on Monday, Apple In...",marketwatch.com
1,2018-06-04T03:00:00.000+03:00,"Amazon.com Inc.'s stock AMZN, +0.58% rallied 0...",marketwatch.com
2,2018-06-04T15:20:00.000+03:00,New York (Reuters) - Technology stocks led the...,reuters.com
3,2018-06-03T23:44:00.000+03:00,Orleans Capital Management Upped Its Nextera ...,mmahotstuff.com
4,2018-06-04T04:48:00.000+03:00,The bears have a case.\nWith minimal economic ...,thestreet.com
...,...,...,...
73496,2019-02-07T12:30:00.000+02:00,Apple đồng ý trả hơn nửa tỷ USD tiền nợ thuế t...,vietgiaitri.com
73497,2019-02-06T16:56:00.000+02:00,-=Tableau Software (DATA) reported earnings on...,tradewitheva.com
73498,2019-02-07T14:43:00.000+02:00,You follow Analyst Blog - edit You follow Zack...,zacks.com
73499,2019-02-07T12:32:00.000+02:00,Apple đồng ý trả hơn nửa tỷ USD tiền nợ thuế t...,vietgiaitri.com


In [7]:
news_df.isnull().sum()

timestamp    0
text         0
source       0
dtype: int64

In [8]:
# Convert timezone to UTC and drop the timezone
news_df['timestamp'] = news_df['timestamp'].apply(lambda x: datetime.fromisoformat(x).astimezone(tz=timezone.utc))
news_df['timestamp'] = news_df['timestamp'].dt.tz_localize(None)

In [9]:
# news_df['time'] = news_df['publish_timestamp'].apply(lambda x: x.time())
# news_df['date'] = news_df['publish_timestamp'].apply(lambda x: x.date())
news_df['text'] = news_df['text'].apply(lambda x: x.lower())

In [10]:
news_df.to_csv('RawNewsData.csv')

In [11]:
news_df.columns

Index(['timestamp', 'text', 'source'], dtype='object')

In [12]:
columns = ['timestamp', 'source', 'sentences']
processed_amzn_news_df = pd.DataFrame(columns=columns)
processed_aapl_news_df = pd.DataFrame(columns=columns)

In [13]:
# Extract and separate sentences containing AAPL and AMZN
for index, row in news_df.iterrows():
    text = row['text']
    aapl_sentences, amzn_sentences = [], []
    for sentence in nltk.sent_tokenize(text):
        if 'amazon' in sentence or 'amzn' in sentence:
            amzn_sentences.append(sentence)
        if 'apple' in sentence or 'aapl' in sentence:
            aapl_sentences.append(sentence)
    if aapl_sentences:
        processed_aapl_news_df.loc[len(processed_aapl_news_df)] = [row['timestamp'], row['source'], aapl_sentences]
    if amzn_sentences:
        processed_amzn_news_df.loc[len(processed_amzn_news_df)] = [row['timestamp'], row['source'], amzn_sentences]
        
print(len(processed_amzn_news_df), len(processed_aapl_news_df))

20236 73175


In [14]:
# del news_df
processed_amzn_news_df.to_csv('AmznExtractedSentences.csv')
processed_aapl_news_df.to_csv('AaplExtractedSentences.csv')

In [15]:
processed_amzn_news_df

Unnamed: 0,timestamp,source,sentences
0,2018-06-04 00:00:00,marketwatch.com,"[amazon.com inc.'s stock amzn, +0.58% rallied ..."
1,2018-06-04 15:10:00,seekingalpha.com,[aum of $66.4b\n52-week performance vs. the s&...
2,2018-06-04 00:00:00,marketwatch.com,[the technology sector is riding its way into ...
3,2018-06-04 18:46:00,yahoo.com,[elsewhere facebook (nasdaq:fb) dipped 1.03% a...
4,2018-06-04 18:46:00,yahoo.com,[elsewhere facebook (nasdaq:fb) dipped 1.03% a...
...,...,...,...
20231,2019-02-07 15:51:00,nasdaq.com,"[you look at facebook, you look at amazon , ap..."
20232,2019-02-07 15:17:00,marketwatch.com,[• momo crowd money flows are positive in amaz...
20233,2019-02-07 04:10:00,marketwatch.com,[that performance was good enough to land atop...
20234,2019-02-07 03:43:00,barrons.com,[sonos speakers also support amazon.com ’s (am...


In [16]:
import string
import re


def extract_words(input_words):
    from nltk.corpus import words
    
    # Remove all non-ascii words
    processed_words = [w for w in input_words if w.isascii()]
    
    # Remove punctuation words
    tr_dict = str.maketrans(dict.fromkeys(string.punctuation))
    processed_words = [w.translate(tr_dict) for w in processed_words if w]
    
    # Remove links
    final_words = []
    for word in processed_words:
        if not re.match('[www]', word):
            final_words.append(word)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    processed_words = [w for w in final_words if w not in stop_words]
    
    # Stem words and return unique words
    stemmer = SnowballStemmer('english')
    seen = set()
    processed_words = [stemmer.stem(word) for word in processed_words if word]
    processed_words = [x for x in processed_words if not (x in seen or seen.add(x))]
    del seen
    
    # Keep only words from English dictionary
    english_words = set([w.lower() for w in words.words()])
    processed_words = [w for w in processed_words if w in english_words]
    
    return processed_words

In [17]:
tokenized_df_columns = ['timestamp', 'source', 'tokens']
tokenized_amzn_news_df = pd.DataFrame(columns=tokenized_df_columns)
tokenized_aapl_news_df = pd.DataFrame(columns=tokenized_df_columns)

print("\n\nProcessing %d records" % len(processed_amzn_news_df))
for index, row in processed_amzn_news_df.iterrows():
    # This break is only for testing purpose
#     if index >= 1000:
#         break
        
    if index % 500 == 0:
        print("Completed %d rows" % index)
    token_words = []
    for sentence in row['sentences']:
        token_words.extend(nltk.wordpunct_tokenize(sentence))
    token_words = extract_words(token_words)
    tokenized_amzn_news_df.loc[index] = [
        processed_amzn_news_df.loc[index]['timestamp'],
        processed_amzn_news_df.loc[index]['source'], 
        token_words
    ]
tokenized_amzn_news_df.to_csv('AmznExtractedTokens.csv')


print("\n\nProcessing %d records" % len(processed_aapl_news_df))
for index, row in processed_aapl_news_df.iterrows():
    # This break is only for testing purpose
#     if index >= 1000:
#         break
    if index % 500 == 0:
        print("Completed %d rows" % index)
    token_words = []
    for sentence in row['sentences']:
        token_words.extend(nltk.wordpunct_tokenize(sentence))
    token_words = extract_words(token_words)
    tokenized_aapl_news_df.loc[index] = [
        processed_aapl_news_df.loc[index]['timestamp'], 
        processed_aapl_news_df.loc[index]['source'],
        token_words
    ]
tokenized_aapl_news_df.to_csv('AaplExtractedTokens.csv')
        



Processing 20236 records
Completed 0 rows
Completed 500 rows
Completed 1000 rows
Completed 1500 rows
Completed 2000 rows
Completed 2500 rows
Completed 3000 rows
Completed 3500 rows
Completed 4000 rows
Completed 4500 rows
Completed 5000 rows
Completed 5500 rows
Completed 6000 rows
Completed 6500 rows
Completed 7000 rows
Completed 7500 rows
Completed 8000 rows
Completed 8500 rows
Completed 9000 rows
Completed 9500 rows
Completed 10000 rows
Completed 10500 rows
Completed 11000 rows
Completed 11500 rows
Completed 12000 rows
Completed 12500 rows
Completed 13000 rows
Completed 13500 rows
Completed 14000 rows
Completed 14500 rows
Completed 15000 rows
Completed 15500 rows
Completed 16000 rows
Completed 16500 rows
Completed 17000 rows
Completed 17500 rows
Completed 18000 rows
Completed 18500 rows
Completed 19000 rows
Completed 19500 rows
Completed 20000 rows


Processing 73175 records
Completed 0 rows
Completed 500 rows
Completed 1000 rows
Completed 1500 rows
Completed 2000 rows
Completed 250

In [18]:
# Sort dataframes based on timestamps
# tokenized_amzn_news_df.sort_values(['day', 'time'], axis=0, ascending=(True, True), inplace=True)
# tokenized_aapl_news_df.sort_values(['day', 'time'], axis=0, ascending=(True, True), inplace=True)
tokenized_amzn_news_df.sort_values(['timestamp'], axis=0, ascending=True, inplace=True)
tokenized_aapl_news_df.sort_values(['timestamp'], axis=0, ascending=True, inplace=True)

In [19]:
import pickle

with open('AmznExtractedTokens.pkl', 'wb') as f:
    pickle.dump(tokenized_amzn_news_df, f)
with open('AaplExtractedTokens.pkl', 'wb') as f:
    pickle.dump(tokenized_aapl_news_df, f)

In [26]:
tokenized_amzn_news_df['tokens'][0]

['toward',
 'cap',
 'target',
 'e',
 'record',
 'climb',
 'billion',
 'stock',
 'boost',
 'run',
 'report',
 'month',
 'current',
 'mark',
 'quarter',
 'second',
 'outstand',
 'humphrey',
 'amazon',
 'make',
 'monday',
 'million',
 'result',
 'dow',
 'streak',
 'price',
 'giant',
 'past',
 'lift',
 'reach',
 'april',
 'share',
 'base',
 'sixth',
 'straight',
 'market',
 'morn',
 'behind',
 'first',
 'three',
 'u',
 'help',
 'close',
 'enough',
 'trade']