### Project Objectives

Given 27 different companies in the S&P 500, we will extract their stock price data directly from Yahoo Finance. The objective is to use the KMeans algorithm to group these companies into 5 clusters, based on their daily price movements (closing price minus opening price) between Jan 1st, 2015 and Jan 1st, 2020.

### Import Libraries

In [1]:
from pandas_datareader import data
import matplotlib.pyplot as plt
import pandas as pd
import datetime
import numpy as np
import seaborn as sns
import warnings
warnings.simplefilter("ignore")

### Data Extraction

In [2]:
# (company name, ticker symbol) pairs for every company in the study.
# The order here is the order in which tickers are requested and plotted.
# NOTE(review): some of these symbols may have been renamed or delisted
# since the 2015-2020 study window -- verify before re-downloading.
_COMPANY_TICKERS = (
    ("Amazon", "AMZN"),
    ("Apple", "AAPL"),
    ("Walgreen", "WBA"),
    ("Northrop Grumman", "NOC"),
    ("Boeing", "BA"),
    ("Lockheed Martin", "LMT"),
    ("McDonalds", "MCD"),
    ("Intel", "INTC"),
    ("Navistar", "NAV"),
    ("IBM", "IBM"),
    ("Texas Instruments", "TXN"),
    ("MasterCard", "MA"),
    ("Microsoft", "MSFT"),
    ("General Electrics", "GE"),
    ("American Express", "AXP"),
    ("Pepsi", "PEP"),
    ("Coca Cola", "KO"),
    ("Johnson & Johnson", "JNJ"),
    ("Toyota", "TM"),
    ("Honda", "HMC"),
    ("Mitsubishi", "MSBHF"),
    ("Sony", "SNE"),
    ("Exxon", "XOM"),
    ("Chevron", "CVX"),
    ("Valero Energy", "VLO"),
    ("Ford", "F"),
    ("Bank of America", "BAC"),
)

# Mapping of company name -> ticker symbol, in the order listed above.
companies_dict = dict(_COMPANY_TICKERS)

In [None]:
# Download the price history for every ticker in `companies_dict`.
#
# The reader module is imported under its own alias here: the original cell
# assigned the downloaded table to `data`, which is also the name the module
# was imported under, so running the cell a second time failed because `data`
# was no longer the module. With the alias the cell can be re-run safely and
# `data` still ends up holding the downloaded table used by the cells below.
from pandas_datareader import data as pdr_data

data_source = "yahoo"
start_date = "2015-01-01"
end_date = "2020-01-01"
data = pdr_data.DataReader(
    list(companies_dict.values()), data_source, start_date, end_date
)

### Exploratory Data Analysis

In [None]:
# Preview the first rows of the downloaded price table.
data.head()

In [None]:
# Count missing values per column; NaNs would propagate into the sums and
# the normalization performed further down.
data.isna().sum()

In [None]:
# Turn the opening and closing price tables into plain arrays and transpose
# them so that each row holds one ticker's series.
# Row order follows the column order of `data`, not `companies_dict`.
open_frame = data["Open"]
close_frame = data["Close"]
stock_open = open_frame.to_numpy().T
stock_close = close_frame.to_numpy().T

In [None]:
# Per-row, per-column movement: closing price minus opening price.
close_open_diff = np.subtract(stock_close, stock_open)
# Net movement per row (sum along axis 1).
sum_of_diff = close_open_diff.sum(axis=1)

In [None]:
# Report the summed close-minus-open movement for each ticker.
# Iterate the ticker labels of `data` together with the sums instead of
# indexing by len(companies_dict): the rows of `sum_of_diff` come from `data`,
# so pairing them with `data`'s own column labels keeps each value next to
# the ticker it belongs to and cannot run past the end of either sequence.
for ticker, total_change in zip(data["High"].columns, sum_of_diff):
    print(f"Company:{ticker}, Change:{total_change}")

In [None]:
# One small line chart of opening prices per ticker, laid out on a 3 x 9 grid.
fig, axes = plt.subplots(nrows=3, ncols=9, figsize=(50, 25))
fig.subplots_adjust(hspace=0.5)
fig.suptitle("Daily Stock Opening Prices", fontsize=45)

opening_prices = data["Open"]
for panel, ticker in zip(axes.flatten(), companies_dict.values()):
    # sns.lineplot draws onto (and returns) the axes passed via `ax`.
    sns.lineplot(data=opening_prices[ticker], ax=panel)
    panel.set_title(ticker, fontsize=50)
    panel.set_xlabel("Date", fontsize=30)
    panel.set_ylabel("Opening Price($)", fontsize=30)
    panel.tick_params(labelsize=5)

plt.show()

These plots are just here to provide an idea of the general trend of each stock. It must be noted that the y-axes are DIFFERENT for each stock and should not be compared. Rather, we want to have a sense of the general stock price movement.

### Feature Engineering

In [None]:
# Compare the raw daily close-minus-open changes of Apple and Amazon side by
# side on a shared y-axis.
#
# The rows of `close_open_diff` follow the column order of `data`, which is
# not guaranteed to match the insertion order of `companies_dict`, so each
# company's row is looked up by its ticker rather than by a fixed position.
row_tickers = list(data["Close"].columns)
change_label = "Change in Stock Price between Open and Close ($)"

plt.figure(figsize=(20, 8))

ax1 = None
for position, company in enumerate(("Apple", "Amazon"), start=1):
    panel = plt.subplot(1, 2, position, sharey=ax1)
    if ax1 is None:
        ax1 = panel  # the second panel shares the first panel's y-axis

    row = row_tickers.index(companies_dict[company])
    plt.title(company, fontsize=20)
    plt.xticks(fontsize=10)
    plt.yticks(fontsize=20)
    plt.xlabel("Date", fontsize=20)
    plt.ylabel(change_label, fontsize=20)
    # Plot against the table's index so the x-axis matches its "Date" label.
    plt.plot(data.index, close_open_diff[row])

By inspection of the plots above, the daily open-to-close price changes for Amazon and Apple are on different scales. The same holds across all 27 stocks. Therefore, normalization or standardization is called for if we want to model stock price movements using machine learning techniques.

In [None]:
from sklearn.preprocessing import Normalizer

# Rescale the movement matrix with scikit-learn's Normalizer (default
# settings) so that rows with very different price levels become comparable.
normalizer = Normalizer()
norm_movements = normalizer.fit(close_open_diff).transform(close_open_diff)

In [None]:
# Sanity-check the normalized matrix: print its minimum, maximum and mean,
# one value per line.
for summary_value in (
    norm_movements.min(),
    norm_movements.max(),
    norm_movements.mean(),
):
    print(summary_value)

In [None]:
# Same side-by-side comparison as for the raw movements, but using the
# normalized movement matrix.
#
# Rows of `norm_movements` follow the column order of `data`, which is not
# guaranteed to match the insertion order of `companies_dict`, so each
# company's row is looked up by its ticker rather than by a fixed position.
# The y-label no longer claims dollars: the values have been rescaled by the
# Normalizer and are unitless.
row_tickers = list(data["Close"].columns)
change_label = "Normalized Change between Open and Close"

plt.figure(figsize=(20, 8))

ax1 = None
for position, company in enumerate(("Apple", "Amazon"), start=1):
    panel = plt.subplot(1, 2, position, sharey=ax1)
    if ax1 is None:
        ax1 = panel  # the second panel shares the first panel's y-axis

    row = row_tickers.index(companies_dict[company])
    plt.title(company, fontsize=20)
    plt.xticks(fontsize=10)
    plt.yticks(fontsize=20)
    plt.xlabel("Date", fontsize=20)
    plt.ylabel(change_label, fontsize=20)
    # Plot against the table's index so the x-axis matches its "Date" label.
    plt.plot(data.index, norm_movements[row])

Our stock price changes are now on the same scale and meaningful comparisons can be made.

### Data Pipeline

In [None]:
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

# Chain normalization and clustering so the raw movement matrix can be fed in
# directly; random_state pins the KMeans initialization for repeatable runs.
normalizer = Normalizer()
kmeans = KMeans(n_clusters=5, max_iter=1000, random_state=1)
pipeline = make_pipeline(normalizer, kmeans)

# Fit on the raw movements, then assign every row to its cluster.
predictions = pipeline.fit(close_open_diff).predict(close_open_diff)

In [None]:
# Display the cluster label assigned to each row of `close_open_diff`.
predictions

In [None]:
# Tabulate the cluster assigned to each company.
#
# `predictions` has one entry per row of `close_open_diff`, and those rows
# follow the column order of `data` -- not the insertion order of
# `companies_dict`. Translate each data column (ticker) back to its company
# name so every label is paired with the company it was computed for.
ticker_to_company = {ticker: name for name, ticker in companies_dict.items()}
row_companies = [ticker_to_company[ticker] for ticker in data["Close"].columns]

predict_df = pd.DataFrame({"Cluster": predictions, "companies": row_companies})
predict_df

### Dimensionality Reduction (PCA)

In [None]:
# Show the dimensions of the normalized matrix before reducing it with PCA.
norm_movements.shape

In [None]:
from sklearn.decomposition import PCA

# Normalize -> project onto 2 principal components -> cluster in that 2-D space.
normalizer = Normalizer()
pca_data = PCA(n_components=2)
# random_state matches the first KMeans model so runs are repeatable.
kmeans = KMeans(n_clusters=5, max_iter=1000, random_state=1)

pipeline = make_pipeline(normalizer, pca_data, kmeans)
pipeline.fit(close_open_diff)

# The original stored this result under the misspelled name `preditions`, so
# the table below silently reused the labels from the earlier, non-PCA model.
predictions = pipeline.predict(close_open_diff)

# Rows of `close_open_diff` follow the column order of `data`, so map each
# ticker column back to its company name before pairing it with a label.
ticker_to_company = {ticker: name for name, ticker in companies_dict.items()}
row_companies = [ticker_to_company[ticker] for ticker in data["Close"].columns]

predict2_df = pd.DataFrame({"Cluster": predictions, "companies": row_companies})
predict2_df


### Visualizing Results

In [None]:
# Reduce data dimensionality with the PCA step that was fitted inside the
# pipeline. `kmeans` was trained in that exact 2-D space; fitting a second,
# independent PCA (as the original did) is not guaranteed to reproduce the
# same projection, which would misplace the points relative to the regions
# and centroids drawn below.
pca_data = pipeline.named_steps["pca"].transform(norm_movements)

# Define the mesh step size
h = 0.002

# Bounds of the plotting area: the projected data padded by 1 on every side
x_min, x_max = pca_data[:, 0].min() - 1, pca_data[:, 0].max() + 1
y_min, y_max = pca_data[:, 1].min() - 1, pca_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Label each point in the mesh using our model
kpredictions = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

# Plot results by color
kpredictions = kpredictions.reshape(xx.shape)
cmap = plt.cm.Paired
# No plt.clf() here: calling it right before plt.figure() only left an extra,
# empty figure behind.
plt.figure(figsize=(10, 10))
plt.imshow(
    kpredictions,
    interpolation="nearest",
    extent=(xx.min(), xx.max(), yy.min(), yy.max()),
    cmap=cmap,
    aspect="auto",
    origin="lower",
)
plt.plot(pca_data[:, 0], pca_data[:, 1], "k.", markersize=5)

# Plot the centroid of each cluster (white X)
centroids = kmeans.cluster_centers_
plt.scatter(
    centroids[:, 0],
    centroids[:, 1],
    marker="x",
    s=169,
    linewidths=3,
    color="w",
    zorder=10,
)
plt.title("K-Means clustering results on stock market price movements (PCA-Reduced Data)")
plt.show()
    

In [None]:
# Final summary: companies grouped by their assigned cluster.
#
# `predictions` is ordered like the rows of `close_open_diff`, i.e. like the
# columns of `data`, which need not match the insertion order of
# `companies_dict`. Map each ticker column back to its company name so the
# labels line up with the right companies.
ticker_to_company = {ticker: name for name, ticker in companies_dict.items()}
row_companies = [ticker_to_company[ticker] for ticker in data["Close"].columns]

results = pd.DataFrame(
    {"Clusters": predictions, "Companies": row_companies}
).sort_values(by=["Clusters"], axis=0)
results