In [1]:
import pickle
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load tokenized news data

with open('AaplLabelledNewsData-OrderPreserved.pkl', 'rb') as f:
    labelled_aapl_news_df = pickle.load(f)
    
with open('AmznLabelledNewsData-OrderPreserved.pkl', 'rb') as f:
    labelled_amzn_news_df = pickle.load(f)

In [3]:
#print sample frames
print(labelled_aapl_news_df.head(10))
print(labelled_amzn_news_df.head(10))

       news_timestamp stock_timestamp             source  \
0 2017-12-07 20:00:00      2017-12-08   businesswire.com   
1 2017-12-08 21:37:00      2017-12-11          yahoo.com   
2 2017-12-12 01:55:00      2017-12-13          yahoo.com   
3 2017-12-12 22:10:00      2017-12-13  investingnews.com   
4 2017-12-14 11:42:00      2017-12-15   seekingalpha.com   
5 2017-12-14 12:31:00      2017-12-15   seekingalpha.com   
6 2017-12-15 02:32:00      2017-12-18   businesswire.com   
7 2017-12-15 12:04:00      2017-12-18   seekingalpha.com   
8 2017-12-18 20:05:00      2017-12-19   seekingalpha.com   
9 2017-12-21 00:00:00      2017-12-22    morningstar.com   

                                              tokens label  
0  [nape, summit, also, annual, luncheon, tom, sp...    -1  
1  [huge, x, comeback, tour, follow, note, big, y...     1  
2  [browser, os, ad, server, make, first, move, c...     1  
3      [humanitarian, place, conflict, free, region]     1  
4                                 

In [4]:
columns = ['merged_tokens']
corpus_aapl = []
corpus_amzn = []

#can try to vectorize this
for index, row in labelled_aapl_news_df.iterrows():
    corpus_aapl.append(' '.join(row['tokens']))
    if index % 10000 == 0:
        print("Completed %d rows for apple" % index)
    
for index, row in labelled_amzn_news_df.iterrows():
    corpus_amzn.append(' '.join(row['tokens']))
    if index % 5000 == 0:
        print("Completed %d rows for amazon" % index)

Completed 0 rows for apple
Completed 10000 rows for apple
Completed 20000 rows for apple
Completed 30000 rows for apple
Completed 40000 rows for apple
Completed 50000 rows for apple
Completed 60000 rows for apple
Completed 70000 rows for apple
Completed 0 rows for amazon
Completed 5000 rows for amazon
Completed 10000 rows for amazon
Completed 15000 rows for amazon


In [5]:
#print sample items to ensure update
corpus_aapl[:5]

['nape summit also annual luncheon tom speaker new job fair upstream profession seminar present organ seg intern theater run two day prospect preview near exhibit space',
 'huge x comeback tour follow note big year best phone money one buy premium start cool per month depend plan put reach mani spot face id facial fantast dual len camera help make kind need time analyst question sure loss home button disappear could full screen handset turn user slick touch control even tire go high power second plus lover impress display got great two mean much fan',
 'browser os ad server make first move competitor like follow lead context ecosystem rife fraud publish ought much kind low rent unit reader rob hacker might target plan kill net neutral affect still use x face id even though say beat big media serf land tech giant sold buy twitter',
 'humanitarian place conflict free region',
 'prefer invest']

In [6]:
corpus_amzn[:5]

['amazon soon sell pull e platform two year ago support prime video',
 'line cashier similar fashion futurist amazon go store year ago yet open public',
 'fit fang stock amazon alphabet drug intern',
 'amazon cloud china',
 'stream stick plus offer play amazon content get access prime video fire interact expect exist go latest iter cut diamond shape port also come assist built current usual price product look like soon away said pull jan stop playground control smart home final bring box might hope ultra spec show thank back forth cast bummer']

In [7]:
aapl_one_gram_vectorizer = CountVectorizer()
aapl_one_gram_features = aapl_one_gram_vectorizer.fit_transform(corpus_aapl)

#printing first 30 feature words for visualization
print(aapl_one_gram_vectorizer.get_feature_names()[:30])

labelled_aapl_news_df = labelled_aapl_news_df.rename(columns={'tokens': 'features'})

aapl_one_gram_features_array = aapl_one_gram_features.toarray()
print("Aapl one-gram feature vector size: %d X %d" % (len(aapl_one_gram_features_array), len(aapl_one_gram_features_array[0])))

for i in range(len(aapl_one_gram_features_array)):
    labelled_aapl_news_df.at[i,'features'] = aapl_one_gram_features_array[i]

['aa', 'aal', 'aam', 'aaron', 'ab', 'aba', 'aback', 'abacus', 'abandon', 'abattoir', 'abb', 'abbasi', 'abbey', 'abbot', 'abe', 'abel', 'aberdeen', 'abhor', 'abidi', 'abitibi', 'abner', 'aboard', 'abolish', 'abord', 'abort', 'abound', 'abraham', 'abram', 'abreast', 'abroad']
Aapl one-gram feature vector size: 71941 X 9486


In [8]:
#print sample frames
print(labelled_aapl_news_df.head(10))

       news_timestamp stock_timestamp             source  \
0 2017-12-07 20:00:00      2017-12-08   businesswire.com   
1 2017-12-08 21:37:00      2017-12-11          yahoo.com   
2 2017-12-12 01:55:00      2017-12-13          yahoo.com   
3 2017-12-12 22:10:00      2017-12-13  investingnews.com   
4 2017-12-14 11:42:00      2017-12-15   seekingalpha.com   
5 2017-12-14 12:31:00      2017-12-15   seekingalpha.com   
6 2017-12-15 02:32:00      2017-12-18   businesswire.com   
7 2017-12-15 12:04:00      2017-12-18   seekingalpha.com   
8 2017-12-18 20:05:00      2017-12-19   seekingalpha.com   
9 2017-12-21 00:00:00      2017-12-22    morningstar.com   

                                            features label  
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...    -1  
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...     1  
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...     1  
3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...     1  
4  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [9]:
amzn_one_gram_vectorizer = CountVectorizer()
amzn_one_gram_features = amzn_one_gram_vectorizer.fit_transform(corpus_amzn)

#printing first 30 feature words for visualization
print(amzn_one_gram_vectorizer.get_feature_names()[:30])

labelled_amzn_news_df = labelled_amzn_news_df.rename(columns={'tokens': 'features'})

amzn_one_gram_features_array = amzn_one_gram_features.toarray()
print("Amzn one-gram feature vector size: %d X %d" % (len(amzn_one_gram_features_array), len(amzn_one_gram_features_array[0])))

for i in range(len(amzn_one_gram_features_array)):
    labelled_amzn_news_df.at[i,'features'] = amzn_one_gram_features_array[i]

['aa', 'aal', 'aaron', 'ab', 'abacus', 'abandon', 'abb', 'abbasi', 'abbey', 'abduct', 'abel', 'aberdeen', 'abner', 'aboard', 'abord', 'abound', 'abraham', 'abram', 'abroad', 'abrupt', 'absent', 'absorb', 'abstract', 'absurd', 'abu', 'abuzz', 'abyss', 'acacia', 'acapulco', 'accent']
Amzn one-gram feature vector size: 19916 X 6420


In [10]:
#print sample frames
print(labelled_amzn_news_df.head(10))

       news_timestamp stock_timestamp            source  \
0 2017-12-15 12:04:00      2017-12-18  seekingalpha.com   
1 2017-12-21 12:04:00      2017-12-22  seekingalpha.com   
2 2017-12-21 12:27:00      2017-12-22     thestreet.com   
3 2017-12-22 00:00:00      2017-12-26     investors.com   
4 2017-12-22 20:57:00      2017-12-26         yahoo.com   
5 2017-12-27 00:00:00      2017-12-28     investors.com   
6 2017-12-27 11:43:00      2017-12-28  seekingalpha.com   
7 2017-12-28 00:00:00      2017-12-29     investors.com   
8 2017-12-28 11:39:00      2017-12-29  seekingalpha.com   
9 2017-12-28 11:39:00      2017-12-29  seekingalpha.com   

                                            features label  
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...     1  
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...    -1  
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...    -1  
3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...     1  
4  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [11]:
# Save one-gram features labelled data

with open('aapl_one_gram_features_labelled_df.pkl', 'wb') as f:
    pickle.dump(labelled_aapl_news_df, f)
with open('amzn_one_gram_features_labelled_df.pkl', 'wb') as f:
    pickle.dump(labelled_amzn_news_df, f)

In [13]:
#extracting two-gram features now

aapl_two_gram_vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2))
aapl_two_gram_features = aapl_two_gram_vectorizer.fit_transform(corpus_aapl)

#printing first 30 feature words for visualization
print(aapl_two_gram_vectorizer.get_feature_names()[:30])

aapl_two_gram_features_array = aapl_two_gram_features.toarray()
print("Aapl two-gram feature vector size: %d X %d" % (len(aapl_two_gram_features_array), len(aapl_two_gram_features_array[0])))

for i in range(len(aapl_two_gram_features_array)):
    labelled_aapl_news_df.at[i,'features'] = aapl_two_gram_features_array[i]

['aa aal', 'aa ade', 'aa auto', 'aa automat', 'aa azo', 'aa bac', 'aa biggest', 'aa ce', 'aa champion', 'aa col', 'aa credit', 'aa flight', 'aa flo', 'aa global', 'aa group', 'aa hum', 'aa josh', 'aa like', 'aa load', 'aa loop', 'aa meet', 'aa morgan', 'aa na', 'aa narrow', 'aa nation', 'aa need', 'aa non', 'aa one', 'aa partnership', 'aa percent']


MemoryError: Unable to allocate 253. GiB for an array with shape (71941, 472850) and data type int64

In [None]:
amzn_two_gram_vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2))
amzn_two_gram_features = amzn_two_gram_vectorizer.fit_transform(corpus_amzn)

#printing first 30 feature words for visualization
print(amzn_two_gram_vectorizer.get_feature_names()[:30])

amzn_two_gram_features_array = amzn_two_gram_features.toarray()
print("Amzn two-gram feature vector size: %d X %d" % (len(amzn_two_gram_features_array), len(amzn_two_gram_features_array[0])))

for i in range(len(amzn_two_gram_features_array)):
    labelled_amzn_news_df.at[i,'features'] = amzn_two_gram_features_array[i]

In [None]:
# Save two-gram features labelled data

with open('aapl_two_gram_features_labelled_df.pkl', 'wb') as f:
    pickle.dump(labelled_aapl_news_df, f)
with open('amzn_two_gram_features_labelled_df.pkl', 'wb') as f:
    pickle.dump(labelled_amzn_news_df, f)