# Imports

# Helper Functions

In [1]:
import operator
import sys

import matplotlib.patches as mpatches
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.test.utils import common_texts, get_tmpfile
from matplotlib import pyplot
from sklearn import svm, tree
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [8]:
def sort_dict(mydict, reversed=False):
  """Return the (key, value) pairs of `mydict` as a list ordered by value.

  The order is ascending by default; pass reversed=True for descending.
  Pairs with equal values keep the order they have in the dict.
  """
  ordered_items = list(mydict.items())
  ordered_items.sort(key=lambda pair: pair[1], reverse=reversed)
  return ordered_items

# Read Data

In [9]:
# Company metadata (ticker symbol, company name, sector)
desc_df = pd.read_csv('stocks_data/constituents.csv')
print('\nCompanies Details')
print(desc_df.head())

# Price history: the full five-year file plus the separate train and test files.
# All three are read with default options, so the 'date' column is not parsed here.
stocks_df, stocks_df_train, stocks_df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'stocks_data/all_stocks_5yr.csv',
        'stocks_data/train_all_stocks_5yr.csv',
        'stocks_data/test_all_stocks_5yr.csv',
    )
)
print('\nCompanies Stocks')
print(stocks_df.head())


Companies Details
  Symbol                 Name                  Sector
0    MMM           3M Company             Industrials
1    AOS      A.O. Smith Corp             Industrials
2    ABT  Abbott Laboratories             Health Care
3   ABBV          AbbVie Inc.             Health Care
4    ACN        Accenture plc  Information Technology

Companies Stocks
         date   open   high    low  close    volume Name
0  2013-02-08  15.07  15.12  14.63  14.75   8407500  AAL
1  2013-02-11  14.89  15.01  14.26  14.46   8882000  AAL
2  2013-02-12  14.45  14.51  14.10  14.27   8126000  AAL
3  2013-02-13  14.30  14.94  14.25  14.66  10259500  AAL
4  2013-02-14  14.94  14.96  13.16  13.99  31879900  AAL


In [10]:
# Give the train/test frames the column names used by the rest of the notebook:
# 'datetime' -> 'date' and 'symbol' -> 'Name' (renamed in place).
for split_df in (stocks_df_test, stocks_df_train):
    split_df.rename(columns={'datetime': 'date', 'symbol': 'Name'}, inplace=True)

# From here on `stocks_df` is the training frame (the same object, not a copy).
stocks_df = stocks_df_train

# Preprocess

In [11]:
# Preview the first rows of stocks_df.
stocks_df.head()

Unnamed: 0,Name,date,close,high,low,open,volume,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent,r_percent,MACD,MACD_EMA,Price_Rate_Of_Change,On Balance Volume,Prediction
0,A,2013-02-28,41.48,42.06,41.45,41.78,3464202,-0.25,0.25,0.0,31.526668,40.19,45.35,25.0,-75.0,-0.377195,-0.296696,-0.069538,-26498313,-1.0
1,MTB,2013-02-28,102.09,102.77,101.55,101.77,741665,-0.56,0.56,0.0,36.68062,100.88,105.895,24.127617,-75.872383,-0.348706,-0.083759,-0.023996,-382289,-1.0
2,UNH,2013-02-28,53.45,53.96,53.39,53.82,5872088,-0.42,0.42,0.0,23.295602,52.51,58.255,16.362054,-83.637946,-0.50209,-0.351509,-0.063103,-41813055,-1.0
3,SWKS,2013-02-28,21.3,21.64,21.2,21.5,3672172,0.11,0.0,0.11,34.464025,20.28,25.1,21.161826,-78.838174,-0.449634,-0.31217,-0.146976,-13398219,1.0
4,XEL,2013-02-28,28.7,28.82,28.53,28.57,3616494,0.15,0.0,0.15,75.602587,27.65,28.82,89.74359,-10.25641,0.074566,-0.037752,0.029043,5357027,1.0


In [7]:
# Lookup tables keyed by ticker symbol: company name and company sector
companies_names = dict(zip(desc_df['Symbol'], desc_df['Name']))
companies_sector = dict(zip(desc_df['Symbol'], desc_df['Sector']))

# Symbol of every row in the stocks frame (one entry per row, not de-duplicated)
symbols = stocks_df['Name'].values
# Distinct dates, sorted
dates = sorted(set(stocks_df['date'].values))

# Map each date to its own, initially empty, dict
dates_dictionary = dict((day, {}) for day in dates)

# Data for Word Embeddings

For each date in our dataset, we sort the companies in ascending order of their **change in price**.

Formula for **change in price** [source](https://pocketsense.com/calculate-market-price-change-common-stock-4829.html):
* (closing_price - opening_price) / opening_price

We can change the formula to use highest price and lowest price. This is something we will test out.

In [12]:
# Compute each stock's change in price and rank the stocks within each day.
# The raw changes go into a dict built fresh in this cell. `dates_dictionary` is
# rebound below from {date: {}} to {date: [(symbol, change), ...]}, so filling it
# in place would make a second run of this cell fail with a TypeError.
price_changes = {date: {} for date in dates}
for date, symbol, op, cl in stocks_df[['date', 'Name', 'open', 'close']].values:
  # CHANGE IN PRICE: (closing_price - opening_price) / opening_price
  price_changes[date][symbol] = (cl - op)/op

# rank each day's stocks by change in price, ascending (sort_dict defaults to reversed=False)
dates_dictionary = {date: sort_dict(price_changes[date]) for date in dates}

# one "sentence" per date: that day's symbols in ranked order
stocks_w2v_data = [[symbol for symbol, _change in dates_dictionary[date]] for date in dates]

# Train Word Embeddings

In [13]:
def hash(astring):
    """Return the Unicode code point of the first character of `astring`.

    NOTE(review): this definition shadows the built-in hash() in the notebook's
    namespace.
    """
    first_char = astring[0]
    return ord(first_char)

In [14]:
# Train the stock embeddings; this is the model used for the rest of the code.
# The embedding dimensionality is `j`.
j = 4 # I'm assuming this from the paper
model = Word2Vec(sentences=stocks_w2v_data, vector_size=j, min_count=1)

# Features: one embedding row per symbol in the model's vocabulary
words = list(model.wv.key_to_index)
X = model.wv[words]
# Labels: the sector of each symbol, in the same order as the rows of X
Y = [companies_sector[word] for word in words]

# hold out 30% of the symbols as a test set (fixed random_state)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# Application

In [17]:
# Print one stock's details, followed by the stocks the model ranks as most similar to it
target_symb = 'ALXN'

target_info = (target_symb, companies_names[target_symb], companies_sector[target_symb])
print('Symbol:%s\tName:%s\tSector: %s' % target_info)

top_similar = model.wv.most_similar(target_symb)
print('Most Similar')
# each entry is a (symbol, similarity) pair; only the symbol is printed
for symb, _similarity in top_similar:
  name, sect = companies_names[symb], companies_sector[symb]
  print('Symbol: %s\tName: %s\t\t\tSector: %s'%(symb, name, sect))

Symbol:ALXN	Name:Alexion Pharmaceuticals	Sector: Health Care
Most Similar
Symbol: REGN	Name: Regeneron			Sector: Health Care
Symbol: ILMN	Name: Illumina Inc			Sector: Health Care
Symbol: BIIB	Name: Biogen Inc.			Sector: Health Care
Symbol: VRTX	Name: Vertex Pharmaceuticals Inc			Sector: Health Care
Symbol: AAL	Name: American Airlines Group			Sector: Industrials
Symbol: DAL	Name: Delta Air Lines Inc.			Sector: Industrials
Symbol: MYL	Name: Mylan N.V.			Sector: Health Care
Symbol: ALK	Name: Alaska Air Group Inc			Sector: Industrials
Symbol: INCY	Name: Incyte			Sector: Health Care
Symbol: UAA	Name: Under Armour Class A			Sector: Consumer Discretionary


In [19]:
# Look up and print the embedding vector of a single symbol
print(model.wv['AAL'])

[-2.660959   0.8983903  2.5342708  0.6659524]


In [21]:
# Save the stock2vec vectors in word2vec format, once as binary and once as plain text.
binary_vectors = 'stock2vec_Keyed_Binary.bin'
text_vectors = 'stock2vec_Keyed_Text.vec'
# save_word2vec_format writes the file and returns None, so its result is not kept.
model.wv.save_word2vec_format(fname=binary_vectors, binary=True)
model.wv.save_word2vec_format(fname=text_vectors, binary=False)

# Read in the vectors

In [22]:
# Reload the saved vectors from the binary word2vec file.
# NOTE: this rebinds `model` from the trained Word2Vec object to a KeyedVectors
# object, so vectors are now indexed directly (model['AAL']) instead of via model.wv.
filename = 'stock2vec_Keyed_Binary.bin'
model = KeyedVectors.load_word2vec_format(filename, binary=True)

In [24]:
# Fetch the 'AAL' vector from the reloaded model and display it
vector = model['AAL']
vector

array([-2.660959 ,  0.8983903,  2.5342708,  0.6659524], dtype=float32)