In [1]:
# Load libraries
import numpy as np
import pandas as pd
import pandas_datareader as dr
from pandas import read_csv, set_option
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler

# Import Model Packages 
from sklearn.cluster import KMeans, AgglomerativeClustering,AffinityPropagation, DBSCAN
from scipy.cluster.hierarchy import fcluster
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet
from scipy.spatial.distance import pdist
from sklearn.metrics import adjusted_mutual_info_score
from sklearn import cluster, covariance, manifold

# Other Helper Packages and functions
import matplotlib.ticker as ticker
from itertools import cycle

import yahoo_fin.stock_info as si
import yfinance as yf

import datetime
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
%load_ext autotime

time: 0 ns (started: 2022-10-15 22:29:43 +08:00)


In [2]:
# The dataset was obtained from Kaggle and imported
df = read_csv('./data/stockprices.csv')

time: 1.26 s (started: 2022-10-15 22:29:43 +08:00)


In [3]:
df.head()

Unnamed: 0,Date,Symbol,Adj Close,Close,High,Low,Open,Volume
0,2010-01-04 00:00:00-05:00,MMM,59.318886,83.019997,83.449997,82.669998,83.089996,3043700.0
1,2010-01-05 00:00:00-05:00,MMM,58.947342,82.5,83.230003,81.699997,82.800003,2847000.0
2,2010-01-06 00:00:00-05:00,MMM,59.783295,83.669998,84.599998,83.510002,83.879997,5268500.0
3,2010-01-07 00:00:00-05:00,MMM,59.826176,83.730003,83.760002,82.120003,83.32,4470100.0
4,2010-01-08 00:00:00-05:00,MMM,60.247749,84.32,84.32,83.300003,83.690002,3405800.0


time: 16 ms (started: 2022-10-15 22:29:45 +08:00)


In [4]:
len(df)

1619660

time: 0 ns (started: 2022-10-15 22:29:45 +08:00)


In [5]:
# Transform table
pd.pivot_table(df, index='Date', columns='Symbol', values='Adj Close')

Symbol,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADBE,...,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-04 00:00:00-05:00,20.434929,4.496876,38.432693,6.572423,,22.101795,8.740000,19.778837,32.970871,37.090000,...,41.963718,13.932555,43.185623,32.782581,,19.900911,53.282867,28.670000,11.580979,
2010-01-05 00:00:00-05:00,20.212959,5.005957,38.204258,6.583786,,21.944103,8.530000,19.619041,33.174648,37.700001,...,44.515926,13.767320,43.354244,32.392876,,19.832848,54.969589,28.620001,11.989312,
2010-01-06 00:00:00-05:00,20.141132,4.798555,38.537392,6.479064,,21.736616,8.400000,19.727999,33.527321,37.619999,...,43.932011,13.793758,43.728970,32.606281,,19.691063,54.951843,28.400000,13.031859,
2010-01-07 00:00:00-05:00,20.115025,4.939965,38.527885,6.467087,,21.388039,8.400000,19.891426,33.495968,36.889999,...,44.870213,13.734278,43.591564,33.033108,,19.685392,56.212440,27.690001,14.491426,
2010-01-08 00:00:00-05:00,20.108498,4.845690,38.680153,6.510081,,21.620419,8.230000,19.993120,33.362751,36.689999,...,44.548744,13.740885,43.416687,33.033108,,19.691063,55.031742,27.600000,14.256856,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-10-04 00:00:00-04:00,131.410004,12.950000,167.830002,146.100006,141.990005,142.729996,264.010010,103.080002,274.309998,294.970001,...,70.739998,66.470001,95.269997,29.930000,93.139999,111.410004,111.690002,281.649994,54.520000,154.750000
2022-10-05 00:00:00-04:00,132.639999,12.870000,167.639999,146.399994,143.330002,142.449997,264.260010,103.379997,274.339996,297.380005,...,72.180000,64.330002,99.120003,30.240000,92.660004,112.330002,112.489998,281.880005,53.520000,154.589996
2022-10-06 00:00:00-04:00,132.179993,12.730000,165.139999,145.429993,140.289993,139.910004,267.209991,102.449997,269.470001,298.410004,...,74.169998,61.820000,102.059998,29.420000,91.660004,110.120003,110.599998,280.350006,52.410000,152.589996
2022-10-07 00:00:00-04:00,127.440002,12.180000,161.320007,140.089996,138.759995,137.350006,257.089996,101.790001,259.709991,288.769989,...,73.089996,59.860001,101.029999,28.410000,89.269997,108.930000,108.260002,266.679993,50.369999,147.369995


time: 969 ms (started: 2022-10-15 22:29:45 +08:00)


## Define Start and End Date

In [6]:
# Define start date and end date for the data
start_date = '2016-01-03'
end_date = '2019-12-31'

df_sp500 = df[(df.Date>=start_date) & (df.Date<=end_date)]

time: 109 ms (started: 2022-10-15 22:29:46 +08:00)


In [7]:
df_sp500.describe()

Unnamed: 0,Adj Close,Close,High,Low,Open,Volume
count,495242.0,495242.0,495242.0,495242.0,495242.0,495242.0
mean,96.619521,102.483144,103.461797,101.440198,102.463467,4898221.0
std,159.954705,160.412801,162.142631,158.604534,160.343233,12490210.0
min,0.7,0.7,0.71,0.65,0.7,0.0
25%,40.326423,44.3208,44.779999,43.860001,44.32,972600.0
50%,65.812489,72.839996,73.489998,72.133331,72.824356,1990700.0
75%,110.88184,118.400002,119.419998,117.300003,118.379997,4376375.0
max,3892.889893,3892.889893,3946.5,3815.5,3900.0,533478800.0


time: 78 ms (started: 2022-10-15 22:29:46 +08:00)


In [8]:
#Checking for any null values and removing the null values'''
print('Null Values =', df_sp500.isnull().any())

Null Values = Date         False
Symbol       False
Adj Close     True
Close         True
High          True
Low           True
Open          True
Volume        True
dtype: bool
time: 32 ms (started: 2022-10-15 22:29:46 +08:00)


In [9]:
df_sp500.isnull().sum()

Date             0
Symbol           0
Adj Close    12285
Close        12285
High         12285
Low          12285
Open         12285
Volume       12285
dtype: int64

time: 31 ms (started: 2022-10-15 22:29:46 +08:00)


In [10]:
ddf

NameError: name 'ddf' is not defined

time: 281 ms (started: 2022-10-15 22:29:46 +08:00)
