<a id='1.1'></a>
## 2.1. Loading the Python packages

In [2]:
# Load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import read_csv, set_option
from pandas.plotting import scatter_matrix
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import pandas_datareader as dr
import pickle

# Import Model Packages 
from sklearn.cluster import KMeans, AgglomerativeClustering, AffinityPropagation, DBSCAN
from scipy.cluster.hierarchy import fcluster
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet
from scipy.spatial.distance import pdist
from sklearn.metrics import adjusted_mutual_info_score
from sklearn import cluster, covariance, manifold
from statsmodels.tsa.stattools import coint

# Other Helper Packages and functions
import matplotlib.ticker as ticker
from itertools import cycle

import yahoo_fin.stock_info as si
import yfinance as yf

from datetime import datetime
import warnings
# Install a blanket "ignore" filter: no category or module is given, so every
# warning raised after this point is suppressed.
# NOTE(review): this also hides deprecation warnings from the libraries imported
# above; consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

%matplotlib inline
%load_ext autotime

time: 0 ns (started: 2022-10-16 14:24:00 +08:00)


<a id='1.2'></a>
## 2.2. Loading the Data

In [3]:
# Load the two inputs from the local ./data directory:
#   dataset - pickled price table (per the original note, obtained from Kaggle)
#   ratios  - CSV of per-symbol ratios
# The pickle is opened in a `with` block so the file handle is closed
# deterministically instead of being left to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code; only unpickle files
# that come from a trusted source.
with open('./data/01_pricedate_2015_2019.pkl', 'rb') as price_file:
    dataset = pickle.load(price_file)
ratios = pd.read_csv('./data/ratios.csv')

time: 16 ms (started: 2022-10-16 14:24:00 +08:00)


In [47]:
# Shape of dataset as a (rows, columns) tuple
dataset.shape

(1256, 487)

time: 0 ns (started: 2022-10-16 13:39:38 +08:00)


In [48]:
# Preview the first five rows (the default for head), with the display width
# set to 100 characters.
pd.set_option("display.width", 100)
dataset.head()

Symbol,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADBE,...,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-05,37.42,51.051,151.337,24.12,46.711,79.662,37.07,39.125,76.427,71.98,...,127.431,29.178,64.219,49.26,32.53,44.445,107.126,76.34,24.042,41.036
2015-01-06,36.837,50.256,151.23,24.122,46.48,80.097,36.13,38.681,75.875,70.53,...,124.712,29.334,63.878,48.649,32.339,43.9,106.218,75.79,23.124,40.636
2015-01-07,37.326,50.227,154.48,24.461,48.358,81.696,37.28,38.994,77.468,71.11,...,128.375,29.587,64.525,49.909,32.594,45.354,108.859,77.72,23.345,41.474
2015-01-08,38.445,50.843,155.834,25.4,48.864,81.882,38.96,39.796,78.649,72.92,...,129.919,29.93,65.599,51.142,32.84,46.144,110.015,79.38,23.671,42.113
2015-01-09,38.163,49.289,155.041,25.428,47.527,82.602,38.41,39.378,78.562,71.84,...,131.543,29.562,65.507,50.397,32.22,46.063,108.428,80.54,23.009,42.18


time: 16 ms (started: 2022-10-16 13:39:39 +08:00)


In [49]:
# Summary statistics for every column, displayed with 3 decimal places.
# The fully qualified option key is used on purpose: the bare 'precision'
# pattern relies on partial-key matching and becomes ambiguous on pandas
# versions that also define 'styler.format.precision', where it raises
# OptionError ("Pattern matched multiple keys").
set_option('display.precision', 3)
dataset.describe()

Symbol,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADBE,...,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
count,1256.0,1256.0,1256.0,1256.0,1256.0,1256.0,1256.0,1256.0,1256.0,1256.0,...,1256.0,1256.0,1256.0,1256.0,1256.0,1256.0,1256.0,1256.0,1256.0,1256.0
mean,55.006,39.638,143.622,36.906,59.805,82.091,182.99,52.047,127.33,167.241,...,111.76,40.805,62.954,53.025,54.49,70.015,109.944,122.043,36.427,68.409
std,14.512,7.182,23.422,11.335,14.624,8.115,107.157,15.55,33.608,79.427,...,30.902,9.438,3.668,7.393,16.691,18.61,11.452,53.268,9.894,25.69
min,31.622,24.281,77.127,21.099,36.022,62.801,36.13,32.535,73.53,69.99,...,46.518,26.588,50.11,33.568,27.965,42.096,84.815,46.93,17.704,37.846
25%,39.922,33.877,136.224,26.777,47.497,76.607,97.265,39.348,100.986,93.993,...,89.535,34.143,60.485,49.051,38.141,55.215,103.562,80.683,25.753,46.105
50%,57.711,39.597,149.835,36.048,56.181,81.444,142.98,45.068,117.823,143.605,...,111.691,39.919,63.094,54.76,51.807,68.095,109.34,105.91,38.932,60.53
75%,66.881,45.37,158.723,45.07,73.447,85.989,265.538,64.727,152.798,249.12,...,131.382,45.372,65.577,58.498,71.192,82.795,116.975,163.58,45.054,86.562
max,84.129,56.989,194.023,72.026,99.695,102.871,449.75,85.388,205.783,331.2,...,190.356,62.595,71.204,66.581,82.366,114.009,144.435,258.28,53.566,131.554


time: 453 ms (started: 2022-10-16 13:39:40 +08:00)


In [5]:
# Print the column names, non-null counts and dtypes of the ratios table
ratios.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 505 entries, 0 to 504
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Symbol    505 non-null    object 
 1   Name      505 non-null    object 
 2   Sector    505 non-null    object 
 3   Price     505 non-null    float64
 4   PE        503 non-null    float64
 5   DivdYld   505 non-null    float64
 6   EPS       505 non-null    float64
 7   52WLow    505 non-null    float64
 8   52WkHigh  505 non-null    float64
 9   MktCap    505 non-null    float64
 10  EBITDA    505 non-null    float64
 11  PS        505 non-null    float64
 12  PB        497 non-null    float64
dtypes: float64(10), object(3)
memory usage: 51.4+ KB
time: 15 ms (started: 2022-10-16 14:24:25 +08:00)
