In [9]:
import pandas
import os
import pandas as pd
import nltk
import json
from datetime import datetime, timezone

In [99]:
from nltk.corpus import stopwords, words
from nltk.stem import LancasterStemmer, PorterStemmer, SnowballStemmer

In [11]:
NEWS_DIRECTORY = '/media/adv/Data/PROJECTS/CSE573-SWM/News'

In [70]:
# Download libraries
nltk.download('punkt')
nltk.download('words')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/adv/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /home/adv/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /home/adv/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
def get_news_articles(path):
    news_text = []
    news_publish_time = []
    news_source = []
    
    with os.scandir(path) as news_directories:
        for directory in news_directories:
            with os.scandir(os.path.join(path, directory.name)) as folder:
                for article in folder:
                    with open(os.path.join(path, directory.name, article.name), encoding='utf-8') as f:
                        news_data = json.load(f)
                    if 'site' in news_data['thread'] and news_data['thread']['site']:
                        news_source.append(news_data['thread']['site'])
                    else:
                        news_source.append(None)
                    if 'published' in news_data and news_data['published']:
                        news_publish_time.append(news_data['published'])
                    else:
                        news_publish_time.append(None)
                    if 'text' in news_data and news_data['text']:
                        news_text.append(news_data['text'])
                    else:
                        news_text.append(None)

    df = pd.DataFrame({
        'publish_timestamp': news_publish_time,
        'text': news_text,
        'source': news_source,
    })

    return df

news_df = get_news_articles(NEWS_DIRECTORY)

In [14]:
news_df

Unnamed: 0,publish_timestamp,text,source
0,2018-06-04T03:00:00.000+03:00,"At its annual WWDC keynote on Monday, Apple In...",marketwatch.com
1,2018-06-04T03:00:00.000+03:00,"Amazon.com Inc.'s stock AMZN, +0.58% rallied 0...",marketwatch.com
2,2018-06-04T15:20:00.000+03:00,New York (Reuters) - Technology stocks led the...,reuters.com
3,2018-06-04T21:12:00.000+03:00,"32d62ff48cf84493019ca98f7ced4475cef8f041""> ６月...",yahoo.co.jp
4,2018-06-03T23:44:00.000+03:00,Orleans Capital Management Upped Its Nextera ...,mmahotstuff.com
...,...,...,...
78050,2019-02-06T16:56:00.000+02:00,-=Tableau Software (DATA) reported earnings on...,tradewitheva.com
78051,2019-02-07T14:43:00.000+02:00,You follow Analyst Blog - edit You follow Zack...,zacks.com
78052,2019-02-07T12:32:00.000+02:00,Apple đồng ý trả hơn nửa tỷ USD tiền nợ thuế t...,vietgiaitri.com
78053,2019-02-07T22:25:00.000+02:00,Apple (NASDAQ: AAPL ) has moved its modem chip...,seekingalpha.com


In [15]:
news_df.isnull().sum()

publish_timestamp    0
text                 0
source               0
dtype: int64

In [16]:
# Convert timezone to UTC
news_df['publish_timestamp'] = news_df['publish_timestamp'].apply(lambda x: datetime.fromisoformat(x).astimezone(tz=timezone.utc))

In [17]:
news_df['time'] = news_df['publish_timestamp'].apply(lambda x: x.time())
news_df['date'] = news_df['publish_timestamp'].apply(lambda x: x.date())
news_df['text'] = news_df['text'].apply(lambda x: x.lower())

In [18]:
news_df.to_csv('RawNewsData.csv')

In [19]:
news_df.columns

Index(['publish_timestamp', 'text', 'source', 'time', 'date'], dtype='object')

In [20]:
columns = ['date', 'time', 'source', 'sentences']
processed_amzn_news_df = pd.DataFrame(columns=columns)
processed_aapl_news_df = pd.DataFrame(columns=columns)

In [21]:
# Extract and separate sentences containing 
for index, row in news_df.iterrows():
    text = row['text']
    aapl_sentences, amzn_sentences = [], []
    for sentence in nltk.sent_tokenize(text):
        if 'amazon' in sentence or 'amzn' in sentence:
            amzn_sentences.append(sentence)
        if 'apple' in sentence or 'aapl' in sentence:
            aapl_sentences.append(sentence)
    if aapl_sentences:
        processed_aapl_news_df.loc[len(processed_aapl_news_df)] = [row['date'], row['time'], row['source'], aapl_sentences]
    if amzn_sentences:
        processed_amzn_news_df.loc[len(processed_amzn_news_df)] = [row['date'], row['time'], row['source'], amzn_sentences]
        
print(len(processed_amzn_news_df), len(processed_aapl_news_df))

21395 77687


In [78]:
del news_df
processed_amzn_news_df.to_csv('AmznExtractedSentences.csv')
processed_aapl_news_df.to_csv('AaplExtractedSentences.csv')

In [56]:
# import ast 
# processed_amzn_news_df1 = pd.read_csv('AmznExtractedSentences.csv', names=columns, skiprows=[0])
# processed_amzn_news_df1['sentences'] = processed_amzn_news_df1['sentences'].apply(lambda x: ast.literal_eval(x))


In [57]:
processed_amzn_news_df

Unnamed: 0,date,time,source,sentences
0,2018-06-04,00:00:00,marketwatch.com,"[amazon.com inc.'s stock amzn, +0.58% rallied ..."
1,2018-06-04,18:12:00,yahoo.co.jp,"[ 32d62ff48cf84493019ca98f7ced4475cef8f041""> ６..."
2,2018-06-04,15:10:00,seekingalpha.com,[aum of $66.4b\n52-week performance vs. the s&...
3,2018-06-04,00:00:00,marketwatch.com,[the technology sector is riding its way into ...
4,2018-06-04,00:00:00,w4t.cz,[foto: butz.2013\njak už to v posledních letec...
...,...,...,...,...
21390,2019-02-07,15:17:00,marketwatch.com,[• momo crowd money flows are positive in amaz...
21391,2019-02-07,04:10:00,marketwatch.com,[that performance was good enough to land atop...
21392,2019-02-07,03:43:00,barrons.com,[sonos speakers also support amazon.com ’s (am...
21393,2019-02-06,14:56:00,tradewitheva.com,[posted by eva s at 5:00 pm\nhere are some of ...


In [97]:
import string
import re


def extract_words(input_words):
    from nltk.corpus import words
    
    # Remove all non-ascii words
    processed_words = [w for w in input_words if w.isascii()]
    
    # Remove punctuation words
    tr_dict = str.maketrans(dict.fromkeys(string.punctuation))
    processed_words = [w.translate(tr_dict) for w in processed_words if w]
    
    # Remove links
    final_words = []
    for word in processed_words:
        if not re.match('[www]', word):
            final_words.append(word)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    processed_words = [w for w in final_words if w not in stop_words]
    
    # Stem words and return unique words
    stemmer = SnowballStemmer('english')
    processed_words = list(set([stemmer.stem(word) for word in processed_words if word]))
    
    # Keep only words from English dictionary
    english_words = set([w.lower() for w in words.words()])
    processed_words = [w for w in processed_words if w in english_words]
    
    return processed_words

In [101]:
tokenized_df_columns = ['date', 'time', 'source', 'tokens']
tokenized_amzn_news_df = pd.DataFrame(columns=tokenized_df_columns)
tokenized_aapl_news_df = pd.DataFrame(columns=tokenized_df_columns)

print("\n\nProcessing %d records" % len(processed_amzn_news_df))
for index, row in processed_amzn_news_df.iterrows():
    # This break is only for testing purpose
    if index >= 1000:
        break
        
    if index % 500 == 0:
        print("Completed %d rows" % index)
    token_words = []
    for sentence in row['sentences']:
        token_words.extend(nltk.wordpunct_tokenize(sentence))
    token_words = extract_words(token_words)
    tokenized_amzn_news_df.loc[index] = [
        processed_amzn_news_df.loc[index]['date'], 
        processed_amzn_news_df.loc[index]['time'], 
        processed_amzn_news_df.loc[index]['source'], 
        token_words
    ]
tokenized_amzn_news_df.to_csv('AmznExtractedTokens.csv')


print("\n\nProcessing %d records" % len(processed_aapl_news_df))
for index, row in processed_aapl_news_df.iterrows():
    # This break is only for testing purpose
    if index >= 1000:
        break
    if index % 500 == 0:
        print("Completed %d rows" % index)
    token_words = []
    for sentence in row['sentences']:
        token_words.extend(nltk.wordpunct_tokenize(sentence))
    token_words = extract_words(token_words)
    tokenized_aapl_news_df.loc[index] = [
        processed_aapl_news_df.loc[index]['date'], 
        processed_aapl_news_df.loc[index]['time'], 
        processed_aapl_news_df.loc[index]['source'],
        token_words
    ]
tokenized_aapl_news_df.to_csv('AaplExtractedTokens.csv')
        



Processing 21395 records
Completed 0 rows
Completed 500 rows


Processing 77687 records
Completed 0 rows
Completed 500 rows


In [75]:
import pickle

with open('AmznExtractedTokens.pkl', 'wb') as f:
    pickle.dump(tokenized_amzn_news_df, f)
with open('AaplExtractedTokens.pkl', 'wb') as f:
    pickle.dump(tokenized_aapl_news_df, f)

In [90]:
tokenized_amzn_news_df['tokens'][0]

['edg',
 'u',
 'first',
 '2',
 'dow',
 '8',
 'averag',
 'morn',
 'close',
 'enough',
 'mark',
 'amazon',
 'monday',
 'straight',
 'toward',
 'sinc',
 '01',
 'industri',
 '26',
 'streak',
 '6',
 '0',
 'record',
 '7',
 'capit',
 'billion',
 'base',
 'humphrey',
 'sixth',
 '83',
 'increas',
 'three',
 'appl',
 'run',
 'reach',
 'price',
 '58',
 'e',
 'help',
 '1',
 'trade',
 '800',
 'april',
 'cap',
 'boost',
 'lift',
 '9',
 '485',
 'market',
 'target',
 'compani',
 'behind',
 'million',
 '64',
 '000',
 '10',
 '3',
 '18',
 '801',
 'quarter',
 'giant',
 '943',
 'jone',
 'commerc',
 'second',
 'outstand',
 '34',
 'stock',
 'past',
 'current',
 '23']