In [1]:
import pickle
import nltk
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfTransformer

# Fetch the NLTK "popular" collection, which provides the stopwords corpus
# read on the next line. quiet=True suppresses the per-package download log
# that otherwise floods the cell output and records the local nltk_data
# directory in the saved notebook.
nltk.download("popular", quiet=True)
# English stop words as a set; they are excluded from the n-gram vocabularies
# built further down.
stop_words = set(stopwords.words('english'))
# TF-IDF re-weighting transformer reused by the feature-extraction cells.
# Each fit_transform call refits it, so it only reflects the latest matrix.
tfidf = TfidfTransformer(smooth_idf=True,use_idf=True)

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\Naveen\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\Naveen\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\Naveen\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\Naveen\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\Naveen\AppData\Roaming\nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_dat

In [2]:
# Read the labelled, tokenized news frames (one pickle per ticker).
# NOTE: pickle.load can execute arbitrary code, so only open files you trust.
aapl_pickle_path = 'AaplLabelledNewsData-OrderPreserved.pkl'
amzn_pickle_path = 'AmznLabelledNewsData-OrderPreserved.pkl'

with open(aapl_pickle_path, 'rb') as aapl_file:
    labelled_aapl_news_df = pickle.load(aapl_file)

with open(amzn_pickle_path, 'rb') as amzn_file:
    labelled_amzn_news_df = pickle.load(amzn_file)

In [3]:
# Preview the first rows of the Apple frame to check the loaded columns
# (timestamps, source, token list, label).
labelled_aapl_news_df.head()

Unnamed: 0,news_timestamp,stock_timestamp,source,tokens,label
0,2017-12-07 20:00:00,2017-12-08,businesswire.com,"[nape, summit, also, annual, luncheon, tom, sp...",-1
1,2017-12-08 21:37:00,2017-12-11,yahoo.com,"[huge, x, comeback, tour, follow, note, big, y...",1
2,2017-12-12 01:55:00,2017-12-13,yahoo.com,"[browser, os, ad, server, make, first, move, c...",1
3,2017-12-12 22:10:00,2017-12-13,investingnews.com,"[humanitarian, place, conflict, free, region]",1
4,2017-12-14 11:42:00,2017-12-15,seekingalpha.com,"[prefer, invest]",1


In [4]:
# Preview the last rows of the Amazon frame; its index runs independently of
# the Apple frame's index.
labelled_amzn_news_df.tail()

Unnamed: 0,news_timestamp,stock_timestamp,source,tokens,label
19911,2019-01-31 21:42:00,2019-02-01,nasdaq.com,"[also, prove, reach, mani, entertain, age, soo...",-1
19912,2019-01-31 22:03:00,2019-02-01,makemefeed.com,"[read, amazon, general, avail, publish, cost, ...",-1
19913,2019-01-31 22:05:00,2019-02-01,yahoo.com,"[direct, cloud, amazon, unit, driven, match, l...",-1
19914,2019-01-31 23:24:00,2019-02-01,marketwatch.com,"[earn, up, say, spend, strong, amazon, real, t...",-1
19915,2019-01-31 23:31:00,2019-02-01,marketwatch.com,"[earn, up, say, spend, strong, amazon, real, t...",-1


In [5]:
# Stack the two per-ticker frames vertically (Apple rows first, then Amazon)
# and renumber the rows from 0 so index labels are unique in the result.
frames = [labelled_aapl_news_df, labelled_amzn_news_df]
aapl_amzn_mixed_df = pd.concat(frames, ignore_index=True, axis=0)

# Report the row counts of both inputs and of the combined frame.
size_report = (
    ('apple dataframe size is: %d', labelled_aapl_news_df),
    ('amazon dataframe size is: %d', labelled_amzn_news_df),
    ('mixed dataframe size is: %d', aapl_amzn_mixed_df),
)
for message, frame in size_report:
    print(message % len(frame))

apple dataframe size is: 71941
amazon dataframe size is: 19916
mixed dataframe size is: 91857


In [6]:
# Preview the first rows of the combined frame (these come from the Apple
# frame, which was placed first in the concat).
aapl_amzn_mixed_df.head()

Unnamed: 0,news_timestamp,stock_timestamp,source,tokens,label
0,2017-12-07 20:00:00,2017-12-08,businesswire.com,"[nape, summit, also, annual, luncheon, tom, sp...",-1
1,2017-12-08 21:37:00,2017-12-11,yahoo.com,"[huge, x, comeback, tour, follow, note, big, y...",1
2,2017-12-12 01:55:00,2017-12-13,yahoo.com,"[browser, os, ad, server, make, first, move, c...",1
3,2017-12-12 22:10:00,2017-12-13,investingnews.com,"[humanitarian, place, conflict, free, region]",1
4,2017-12-14 11:42:00,2017-12-15,seekingalpha.com,"[prefer, invest]",1


In [7]:
# Preview the last rows of the combined frame (these come from the Amazon
# frame, re-indexed to continue after the Apple rows).
aapl_amzn_mixed_df.tail()

Unnamed: 0,news_timestamp,stock_timestamp,source,tokens,label
91852,2019-01-31 21:42:00,2019-02-01,nasdaq.com,"[also, prove, reach, mani, entertain, age, soo...",-1
91853,2019-01-31 22:03:00,2019-02-01,makemefeed.com,"[read, amazon, general, avail, publish, cost, ...",-1
91854,2019-01-31 22:05:00,2019-02-01,yahoo.com,"[direct, cloud, amazon, unit, driven, match, l...",-1
91855,2019-01-31 23:24:00,2019-02-01,marketwatch.com,"[earn, up, say, spend, strong, amazon, real, t...",-1
91856,2019-01-31 23:31:00,2019-02-01,marketwatch.com,"[earn, up, say, spend, strong, amazon, real, t...",-1


In [8]:
# Turn every row's token list into one space-separated document string, the
# input format the vectorizers below are fitted on.
corpus = []

# Walk the 'tokens' column directly; each item is (index label, token list).
for row_label, row_tokens in aapl_amzn_mixed_df['tokens'].items():
    corpus.append(' '.join(row_tokens))
    # Progress marker every 10,000 index labels.
    if row_label % 10000 == 0:
        print("Completed %d rows for mixed corpus" % row_label)

Completed 0 rows for mixed corpus
Completed 10000 rows for mixed corpus
Completed 20000 rows for mixed corpus
Completed 30000 rows for mixed corpus
Completed 40000 rows for mixed corpus
Completed 50000 rows for mixed corpus
Completed 60000 rows for mixed corpus
Completed 70000 rows for mixed corpus
Completed 80000 rows for mixed corpus
Completed 90000 rows for mixed corpus


In [9]:
# Show the first five documents to confirm the token lists were joined into
# plain strings.
corpus[:5]

['nape summit also annual luncheon tom speaker new job fair upstream profession seminar present organ seg intern theater run two day prospect preview near exhibit space',
 'huge x comeback tour follow note big year best phone money one buy premium start cool per month depend plan put reach mani spot face id facial fantast dual len camera help make kind need time analyst question sure loss home button disappear could full screen handset turn user slick touch control even tire go high power second plus lover impress display got great two mean much fan',
 'browser os ad server make first move competitor like follow lead context ecosystem rife fraud publish ought much kind low rent unit reader rob hacker might target plan kill net neutral affect still use x face id even though say beat big media serf land tech giant sold buy twitter',
 'humanitarian place conflict free region',
 'prefer invest']

In [10]:
# Show the last five documents of the corpus.
corpus[-5:]

['also prove reach mani entertain age soon non ad support stream platform like amazon enough dis remain digit juggernaut along year come',
 'read amazon general avail publish cost product use candid sent letter fund mogul nascent ad bigger ever start make inroad big brand made billion sale earn fourth quarter said call grab market critic link embed develop kit n',
 'direct cloud amazon unit driven match last quarter growth aw',
 'earn up say spend strong amazon real test analyst maria forecast like bake lot given',
 'earn up say spend strong amazon real test analyst maria forecast like bake lot given']

In [16]:
# One-gram (single-token) TF-IDF features for the mixed Apple+Amazon corpus.
#
# max_df / min_df are document-frequency proportions: terms found in more than
# 90% or in fewer than 0.5% of the documents are dropped from the vocabulary.
# scikit-learn >= 1.2 validates `stop_words` and accepts only 'english', a
# list or None, so the NLTK stop-word set is passed as a sorted list.
one_gram_vectorizer = CountVectorizer(max_df=0.9, min_df=0.005, stop_words=sorted(stop_words), ngram_range=(1,1))
one_gram_features = one_gram_vectorizer.fit_transform(corpus)

# Print the first 30 vocabulary terms in column order as a sanity check.
# `vocabulary_` (term -> column index) exists on every scikit-learn release,
# whereas get_feature_names() was removed in 1.2.
one_gram_vocabulary = one_gram_vectorizer.vocabulary_
print(sorted(one_gram_vocabulary, key=one_gram_vocabulary.get)[:30])

# The token lists are about to be replaced by numeric vectors, so the column
# is renamed to match. On a re-run the column is already called 'features'
# and rename() leaves the frame as it is.
aapl_amzn_mixed_df = aapl_amzn_mixed_df.rename(columns={'tokens': 'features'})

# Re-weight the raw counts with TF-IDF and densify. The dense matrix holds
# rows x vocabulary floats, so it is by far the largest object in the notebook.
one_gram_features_array = tfidf.fit_transform(one_gram_features).toarray()
# .shape also works for an empty matrix, unlike indexing row 0.
print("one-gram feature vector size: %d X %d" % one_gram_features_array.shape)

# Store one 1-D vector per row in a single column assignment. This is
# positional, so it does not depend on the index labels being 0..n-1 the way
# the previous row-by-row `.at[i, ...]` loop did.
aapl_amzn_mixed_df['features'] = list(one_gram_features_array)

['access', 'accord', 'account', 'acronym', 'across', 'act', 'action', 'actual', 'ad', 'add', 'address', 'adjust', 'admit', 'adopt', 'advisor', 'affect', 'afford', 'afternoon', 'aggress', 'ago', 'agre', 'agreement', 'ahead', 'ai', 'aim', 'air', 'alert', 'allow', 'almost', 'along']
one-gram feature vector size: 91857 X 1039


In [17]:
# Preview the frame after the 'tokens' column was renamed to 'features' and
# filled with the one-gram TF-IDF vectors.
aapl_amzn_mixed_df.head()

Unnamed: 0,news_timestamp,stock_timestamp,source,features,label
0,2017-12-07 20:00:00,2017-12-08,businesswire.com,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",-1
1,2017-12-08 21:37:00,2017-12-11,yahoo.com,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
2,2017-12-12 01:55:00,2017-12-13,yahoo.com,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.156...",1
3,2017-12-12 22:10:00,2017-12-13,investingnews.com,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
4,2017-12-14 11:42:00,2017-12-15,seekingalpha.com,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1


In [18]:
# Persist the mixed (Apple + Amazon) frame whose 'features' column currently
# holds the one-gram vectors.
one_gram_pickle_path = 'mixed_one_gram_features_labelled_df.pkl'

with open(one_gram_pickle_path, 'wb') as one_gram_file:
    pickle.dump(aapl_amzn_mixed_df, one_gram_file)

In [24]:
# Two-gram (adjacent token pair) TF-IDF features for the mixed corpus.
#
# max_df / min_df are document-frequency proportions: pairs found in more than
# 90% or in fewer than 0.3% of the documents are dropped from the vocabulary.
# scikit-learn >= 1.2 validates `stop_words` and accepts only 'english', a
# list or None, so the NLTK stop-word set is passed as a sorted list.
two_gram_vectorizer = CountVectorizer(max_df=0.9, min_df=0.003, stop_words=sorted(stop_words), ngram_range=(2,2))
two_gram_features = two_gram_vectorizer.fit_transform(corpus)

# Print the first and last 30 vocabulary entries in column order as a sanity
# check. `vocabulary_` (term -> column index) exists on every scikit-learn
# release, whereas get_feature_names() was removed in 1.2.
two_gram_vocabulary = two_gram_vectorizer.vocabulary_
two_gram_names = sorted(two_gram_vocabulary, key=two_gram_vocabulary.get)
print(two_gram_names[:30])
print(two_gram_names[-30:])

# Re-weight the raw counts with TF-IDF and densify. This refits the shared
# `tfidf` transformer, so any earlier fit held by it is discarded.
two_gram_features_array=tfidf.fit_transform(two_gram_features).toarray()

# .shape also works for an empty matrix, unlike indexing row 0.
print("two-gram feature vector size: %d X %d" % two_gram_features_array.shape)

# Overwrite the 'features' column with one 1-D two-gram vector per row in a
# single positional assignment (no dependence on index labels being 0..n-1).
# Whatever the column held before is replaced, so save earlier features first.
aapl_amzn_mixed_df['features'] = list(two_gram_features_array)

['access design', 'accord data', 'accord file', 'accord maintain', 'accord recent', 'accord report', 'accord stock', 'account invest', 'account portfolio', 'action alert', 'actual previous', 'ad stake', 'address free', 'address latest', 'advisor final', 'advisor new', 'advisor return', 'alert news', 'alert number', 'alert open', 'alert plus', 'alert sever', 'alert trade', 'allow user', 'alphabet amazon', 'also ad', 'also bought', 'also hold', 'also made', 'also recent']
['two day', 'two low', 'two year', 'unit sale', 'unit state', 'unveil new', 'uptrend among', 'uptrend analyst', 'uptrend investor', 'uptrend news', 'uptrend recent', 'us long', 'version access', 'version content', 'version read', 'version view', 'view design', 'visit latest', 'vital data', 'year ago', 'year analyst', 'year board', 'year date', 'year dividend', 'year low', 'year new', 'year news', 'year per', 'year profit', 'york time']
two-gram feature vector size: 91857 X 817


In [25]:
# Preview the frame after the 'features' column was overwritten with the
# two-gram TF-IDF vectors.
aapl_amzn_mixed_df.head()

Unnamed: 0,news_timestamp,stock_timestamp,source,features,label
0,2017-12-07 20:00:00,2017-12-08,businesswire.com,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",-1
1,2017-12-08 21:37:00,2017-12-11,yahoo.com,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
2,2017-12-12 01:55:00,2017-12-13,yahoo.com,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
3,2017-12-12 22:10:00,2017-12-13,investingnews.com,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
4,2017-12-14 11:42:00,2017-12-15,seekingalpha.com,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1


In [26]:
# Persist the mixed (Apple + Amazon) frame whose 'features' column now holds
# the two-gram vectors.
two_gram_pickle_path = 'mixed_two_gram_features_labelled_df.pkl'

with open(two_gram_pickle_path, 'wb') as two_gram_file:
    pickle.dump(aapl_amzn_mixed_df, two_gram_file)

In [16]:
# Save two-gram features labelled data for Amazon
# NOTE(review): in this notebook labelled_amzn_news_df is only assigned from
# the input pickle; the feature cells write to aapl_amzn_mixed_df instead. As
# written, this stores the original token data under a two-gram filename.
# The execution count (In [16]) is also out of sequence, so this looks like a
# leftover from an earlier version -- confirm whether the Amazon rows of
# aapl_amzn_mixed_df were intended, or remove the cell.

with open('amzn_two_gram_features_labelled_df.pkl', 'wb') as f:
    pickle.dump(labelled_amzn_news_df, f)