-
Notifications
You must be signed in to change notification settings - Fork 0
/
AlphaVantage Students (2.0).py
255 lines (161 loc) · 7.89 KB
/
AlphaVantage Students (2.0).py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
# Enter your cwd here if needed
# ### Install Packages
# The first segment is about installing all necessary packages to run Alpha Vantage,
# as well as loading important files that you will need throughout this script.
# FIX: `pip install alpha_vantage` is a shell command, not Python — as executable
# source it is a SyntaxError. Run it in your terminal first (or `!pip install
# alpha_vantage` in a notebook cell):
#   pip install alpha_vantage
from alpha_vantage.timeseries import TimeSeries
from alpha_vantage.techindicators import TechIndicators
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
import pandas as pd
import pandas
import numpy as np
import sys
import random
import time
from time import sleep
from scipy import interpolate
import math

# Remember: the more data you want to download, the more keys you will need —
# one additional free key for every 500 stocks.
key_list = ['Enter your api keys here']

# You need to use a specific column header to concat the Alpha Vantage output with
# what it requested. The format should be 'YYYY-MM-DD HH:MM:SS AM/PM'. This can be
# a bit tedious but should only take about 5 minutes a week. It seems like an
# unnecessary step, but not every stock is traded every minute, resulting in gaps
# in the data that will be filled later on.
columns = pd.read_csv('Columns.csv')
cc = columns.columns
cc = cc.drop('date')  # for the Columns.csv file we used; change if your format differs
cc = pd.to_datetime(cc)  # parse the header strings into datetimes, used when retrieving data

tickers_list = pd.read_csv("companylist.csv")  # the ticker list you want to check
comp_list = tickers_list.iloc[:, 0]

# ### Check Ticker List
# This tool lets you submit a CSV of tickers. Alpha Vantage does not publish a list
# of all supported tickers, so this filters all 'bad' tickers out, leaving a clean
# list of functioning ones. It is slow (5 calls/minute on the free tier). Repeat it
# whenever you pull large data sets, as tickers change or are discontinued.
# Reset to an empty list: it is rebuilt below with only the tickers that respond.
# (was `list(np.zeros(0))`, which is just a roundabout empty list)
comp_list = []
def data_gath(i, tickers_list, key_list, pause=12):
    """Fetch the full 1-minute intraday series for the i-th ticker.

    Sleeps `pause` seconds before each call to respect the free-tier rate
    limit (5 calls/minute), and rotates to the next API key every 500
    tickers (one free key covers ~500 calls/day).

    Parameters:
        i: row index into tickers_list (ticker symbol in column 0).
        tickers_list: DataFrame whose first column holds ticker symbols.
        key_list: list of Alpha Vantage API keys.
        pause: seconds to sleep before the request (default 12, as before).

    Returns the (data, metadata) tuple from TimeSeries.get_intraday.
    Raises whatever the Alpha Vantage client raises for unknown tickers.
    """
    sleep(pause)
    ticker = tickers_list.iloc[i, 0]
    key = key_list[i // 500]  # one key per block of 500 tickers
    # Renamed local from `time` to `ts`: the original shadowed the imported
    # `time` module inside this function.
    ts = TimeSeries(key=key, output_format='pandas')
    return ts.get_intraday(symbol=ticker, interval='1min', outputsize='full')
# Probe every ticker once; keep the ones Alpha Vantage can serve and print the
# row index of any that fail.
for i in range(len(tickers_list)):
    try:
        data_gath(i, tickers_list, key_list)
        comp_list.append(tickers_list.iloc[i, 0])
    # Narrowed from a bare `except:` — a bare except also swallows
    # KeyboardInterrupt/SystemExit, making this long loop impossible to abort.
    except Exception:
        print(str(i))

# Persist the cleaned ticker list for future runs.
comp_list_pd = pd.DataFrame(comp_list)
comp_list_pd.to_csv("Full_list.csv")
# ### Retrieving Data
# Now that we have a list of tickers we can download up to 5 days of intraday data and other historical data. The following section retrieves the stock data and puts it into a df that is easily readable. At the end we merge the individual ticker blocks into one df that you should download as a 'raw' data file.
# Weekly data pull
def Stock_pull(tickers_list, key_list, columns_1):
    """Download intraday data for every ticker and stack it into one frame.

    Parameters:
        tickers_list: DataFrame whose first column holds ticker symbols.
        key_list: list of Alpha Vantage API keys (rotated by data_gath).
        columns_1: timestamp column headers the transposed data is aligned to.

    Returns a DataFrame with one block of rows per ticker (open/high/low/
    HML/volume across timestamps) plus a 'Ticker' column. Tickers that fail
    to download are skipped and their row index printed.
    """
    df_new = pd.DataFrame(np.zeros([0, len(columns_1)]))
    df_new.columns = columns_1
    df_new['Ticker'] = 0
    for i in range(len(tickers_list)):
        try:
            data = data_gath(i, tickers_list, key_list)
            df_int = data[0].T  # transpose: rows become the 5 series, cols the timestamps
            # Overwrite the close row with high minus low, then rename it HML.
            df_int.iloc[3, :] = df_int.iloc[1, :] - df_int.iloc[2, :]
            df_int = df_int.rename(index={'4. close': '4. HML'})
            df_int['Ticker'] = tickers_list.iloc[i, 0]
            df_new = pd.concat([df_new, df_int], axis=0)
        # Narrowed from a bare `except:` so Ctrl-C still aborts the loop.
        except Exception:
            print(str(i))
    return df_new
# Pull this week's raw data and persist it. Save one such raw file per week;
# four of them are merged into a single big file below.
WK = Stock_pull(tickers_list, key_list, cc)
WK.to_csv("WK1_df.csv")
# ### Data Merging + Cleaning
# The raw data is imperfect and sometimes has gaps. Load the four weekly files
# we downloaded so they can be merged, cleaned, interpolated, and have missing
# volumes adjusted.
WK1_df = pd.read_csv("WK1_df.csv")
WK2_df = pd.read_csv("WK2_df.csv")
WK3_df = pd.read_csv("WK3_df.csv")
WK4_df = pd.read_csv("WK4_df.csv")
df_list = [WK1_df, WK2_df, WK3_df, WK4_df]
def Merge(df_list):
    """Concatenate weekly raw-data frames side by side into one wide frame.

    Each weekly frame, as re-read from CSV, carries an unnamed index column
    at position 0 and a 'Ticker' column at the end, so:
      * the first frame drops only its last column,
      * middle frames drop both their first and last columns,
      * the last frame drops only its first column (keeping its 'Ticker').

    Fixes over the original: removed the dead statement
    `df_list[len(df_list)-1]`; no longer mutates the caller's frames in
    place; made the branches exclusive (the original concatenated a
    single-element list's frame with itself).
    """
    merged = pd.DataFrame()
    last = len(df_list) - 1
    for i, frame in enumerate(df_list):
        if i == 0:
            piece = frame.iloc[:, :-1]
        elif i < last:
            piece = frame.iloc[:, 1:-1]
        else:
            piece = frame.iloc[:, 1:]
        merged = piece if i == 0 else pd.concat([merged, piece], axis=1)
    return merged
WK1_4 = Merge(df_list) # Merged file that consists of all 4 weeks
def cleaning_arr(pd_df):
    """Fill gaps in the merged raw frame, row by row, based on each row's label.

    Rows are identified by their first cell ('1. open', '2. high', ...):
    open gaps are forward-filled from the previous column; high/low gaps copy
    from the open row one/two rows above; HML gaps are recomputed as
    high - low from the two rows above; volume gaps become 0.

    Returns a new DataFrame re-indexed by the first column (which is then
    dropped). Relies on rows appearing in the fixed order
    open/high/low/HML/volume per ticker — the i-1/i-2 offsets assume it.
    """
    col = pd_df.columns  # NOTE(review): unused local, kept to preserve code byte-for-byte
    df = np.array(pd_df)  # work on a raw object array for cell-wise assignment
    for i in range(len(df)):
        print('Reihe ' + str(i))  # progress output ("Reihe" is German for "row")
        n=1
        if df[i,0] == '1. open':
            for j in range(len(df[i,:].T)):
                if pd.isna(df[i,j]) == True:
                    # Forward-fill from the previous column when it holds a value.
                    if pd.isna(df[i,j-1]) == False:
                        df[i,j] = df[i,j-1]
                    # Previous cell is the string row label (i.e. the gap is at the
                    # very start of the row): scan forward for the next non-NaN.
                    if type(df[i,j-1]) == str:
                        while True:
                            n += 1
                            # NOTE(review): writes column 1, not column j — this looks
                            # like a bug (df[i,j] was probably intended). Confirm
                            # before relying on leading-gap handling.
                            df[i,1] = df[i,j+n-1]
                            if pd.isna(df[i,j+n-1]) == False:
                                break
        if df[i,0] == '2. high':
            for j in range(len(df[i,:].T)):
                if pd.isna(df[i,j]) == True:
                    df[i,j] = df[i-1,j]  # copy from the open row directly above
        if df[i,0] == '3. low':
            for j in range(len(df[i,:].T)):
                if pd.isna(df[i,j]) == True:
                    df[i,j] = df[i-2,j]  # copy from the open row two above
        if df[i,0] == '4. HML':
            for j in range(len(df[i,:].T)):
                if pd.isna(df[i,j]) == True:
                    df[i,j] = df[i-2,j]-df[i-1,j]  # recompute high - low
        if df[i,0] == '5. volume':
            for j in range(len(df[i,:].T)):
                if pd.isna(df[i,j]) == True:
                    df[i,j] = 0  # treat a missing volume as no trades that minute
    # Rebuild a DataFrame with the original headers and move the label column
    # into the index.
    df = pd.DataFrame(df)
    df.columns = pd_df.columns
    df.index = df.iloc[:,0]
    df = df.drop(df.columns[0], axis=1)
    return df
WK1_4_clean = cleaning_arr(WK1_4)  # fill gaps in the merged 4-week frame (see cleaning_arr)
WK1_4_clean.to_csv('WK1_4_clean.csv')  # persist the cleaned data for the analysis step
# ### Data Analysis
def Analysis(pd_df):
    """Append per-row summary statistics to the cleaned frame.

    Adds seven columns (Max, Min, Mean, Median, StD, Max/Min Minute Change)
    and fills them row by row from the numeric cells. Minute change is the
    per-column ratio (value / previous value) - 1, forced to 0 when either
    value is 0. Returns a new DataFrame indexed by the first column.

    NOTE(review): the slice 1:-8 skips the first column and the last 8
    columns — presumably the label column plus the 7 new stats and a
    trailing 'Ticker' column; confirm the column layout. Mutates pd_df by
    adding the seven columns (side effect visible to the caller).
    """
    pd_df['Max'] = 0
    pd_df['Min'] = 0
    pd_df['Mean'] = 0
    pd_df['Median'] = 0
    pd_df['StD'] = 0
    pd_df['Max Minute Change'] = 0
    pd_df['Min Minute Change'] = 0
    col = pd_df.columns  # NOTE(review): unused local, kept to preserve code byte-for-byte
    df = np.array(pd_df)
    # Reusable buffer for per-column minute changes.
    # NOTE(review): sized from row 1 (assumes >= 2 rows), and entries 0-1 are
    # never written inside the loop (the `j > 1` guard), so they stay 0 and can
    # dominate max()/min() below — confirm this is intended.
    values = [0]*len(df[1,1:-8])
    for i in range(len(df)):
        print('Reihe ' + str(i))  # progress output ("Reihe" is German for "row")
        # Negative offsets target the seven freshly appended stat columns.
        df[i,-7] = df[i,1:-8].max()
        df[i,-6] = df[i,1:-8].min()
        df[i,-5] = df[i,1:-8].mean()
        df[i,-4] = np.median(df[i,1:-8])
        df[i,-3] = df[i,1:-8].std()
        for j in range(len(df[1,1:-8])):
            if j > 1:
                if df[i,j] != 0 and df[i,j-1] != 0:
                    values[j] = (df[i,j] / df[i,j-1])-1  # minute-over-minute return
                if df[i,j] == 0 or df[i,j-1] == 0:
                    values[j] = 0  # avoid division by zero / meaningless ratios
        df[i,-2] = max(values)
        df[i,-1] = min(values)
    # Rebuild a DataFrame with the original headers and move the first column
    # into the index.
    df = pd.DataFrame(df)
    df.columns = pd_df.columns
    df.index = df.iloc[:,0]
    df = df.drop(df.columns[0], axis=1)
    return df