Working with stock market data downloaded from Yahoo Finance using the yahoo_finance Python package.

About data:
    Dates: 2007-01-01 -> 2017-04-17
    Hundreds of symbols on the NASDAQ
    Each symbol is stored in a csv file in the "prices" folder.
    CSV columns:
        date — the data's date
        close — the date's closing price
        open — the date's opening price
        high — the date's highest stock price during trading
        low — the date's lowest stock price during trading
        volume — the date's number of shares traded
The data was downloaded with the existing download_data.py script provided by DataQuest.

In [1]:
import pandas as pd
import os

# Load every per-symbol CSV from the "prices" folder into one long DataFrame.
# The symbol is taken from the file name (text before the first '.') and stored
# in a "name" column so rows can be told apart after concatenation.
symbol_frames = []
for fn in sorted(os.listdir('prices')):  # listdir order is arbitrary; sort for reproducibility
    if not fn.lower().endswith('.csv'):
        # Skip stray entries (hidden files, checkpoint folders) that read_csv cannot parse.
        continue
    df = pd.read_csv(os.path.join('prices', fn))
    df['name'] = fn.split('.')[0]
    symbol_frames.append(df)
# Each file's own row index is kept (no ignore_index), so index labels repeat
# across symbols in the combined frame.
stock_dfs = pd.concat(symbol_frames)
# test to see if it works
display(stock_dfs[stock_dfs.name == 'aapl'])

Unnamed: 0,date,close,open,high,low,volume,name
0,2007-01-03,83.800002,86.289999,86.579999,81.899999,309579900,aapl
1,2007-01-04,85.659998,84.050001,85.949998,83.820003,211815100,aapl
2,2007-01-05,85.049997,85.770000,86.199997,84.400002,208685400,aapl
3,2007-01-08,85.470000,85.959998,86.529998,85.280003,199276700,aapl
4,2007-01-09,92.570003,86.450003,92.979999,85.150000,837324600,aapl
...,...,...,...,...,...,...,...
2585,2017-04-10,143.169998,143.600006,143.880005,142.899994,18473000,aapl
2586,2017-04-11,141.630005,142.940002,143.350006,140.059998,30275300,aapl
2587,2017-04-12,141.800003,141.600006,142.149994,141.009995,20238900,aapl
2588,2017-04-13,141.050003,141.910004,142.380005,141.050003,17652900,aapl


In [2]:
import math
import functools
from multiprocessing import Pool

def make_chunks(data, num_chunks):
    """Split ``data`` into at most ``num_chunks`` consecutive pieces.

    Every piece holds ``ceil(len(data) / num_chunks)`` items except possibly
    the last one, which holds the remainder.

    Args:
        data: A dict (including dict subclasses) or any sized object that
            supports slicing, such as a list.
        num_chunks: Desired number of pieces.

    Returns:
        A list of dicts when ``data`` is a dict (keys kept in iteration
        order), otherwise a list of slices of ``data``. Empty input yields
        an empty list.
    """
    if len(data) == 0:
        # A chunk size of 0 would make range() raise; there is nothing to split.
        return []
    chunk_size = math.ceil(len(data) / num_chunks)
    if isinstance(data, dict):
        keys = list(data)
        # Build a real dict per chunk; appending a generator expression here
        # would hand callers lazy iterables of one-key dicts instead.
        return [
            {key: data[key] for key in keys[i:i + chunk_size]}
            for i in range(0, len(keys), chunk_size)
        ]
    return [data[i:i + chunk_size] for i in range(0, len(data), chunk_size)]

def make_chunks_by(data: pd.DataFrame, field: str) -> list:
    """Split a DataFrame into one sub-frame per distinct value of ``field``.

    Args:
        data: Frame to partition.
        field: Name of the column whose values define the groups.

    Returns:
        The group frames produced by ``data.groupby(field)``, in the order
        the groupby yields them.

    Raises:
        KeyError: If ``field`` is not one of ``data``'s columns.
    """
    if field in data.columns:
        return [chunk for _key, chunk in data.groupby(field)]
    raise KeyError(f"No field {field} found in column names of data")
    

def map_reduce(data, num_processes, mapper, reducer):
    """Run ``mapper`` over chunks of ``data`` in parallel and fold the results.

    Args:
        data: Input accepted by ``make_chunks``.
        num_processes: Used both as the number of chunks requested and as the
            size of the worker pool.
        mapper: Callable applied to each chunk in a worker process.
        reducer: Two-argument callable used to combine the per-chunk results,
            left to right, via ``functools.reduce``.

    Returns:
        The single value produced by reducing all per-chunk results.
    """
    pieces = make_chunks(data, num_processes)
    # The pool is closed when the with-block exits; map() has already
    # collected every result by then.
    with Pool(num_processes) as workers:
        partial_results = workers.map(mapper, pieces)
    return functools.reduce(reducer, partial_results)

# Per-symbol averages of the numeric columns. numeric_only=True keeps the
# string 'date' column out of the aggregation; without it pandas >= 2.0 raises
# TypeError instead of silently dropping non-numeric columns.
display(stock_dfs.groupby('name').mean(numeric_only=True))

Unnamed: 0_level_0,close,open,high,low,volume
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
aal,22.074954,22.102324,22.520093,21.650618,8.469081e+06
aame,2.779680,2.773317,2.824629,2.712714,6.318919e+03
aaon,23.617386,23.602819,23.995599,23.205729,2.112639e+05
aapl,257.176540,257.294170,259.864224,254.358062,1.301124e+08
aaww,44.331602,44.331305,45.089641,43.531606,2.874148e+05
...,...,...,...,...,...
flir,30.795726,30.774846,31.174459,30.367552,1.321257e+06
fll,2.471575,2.473483,2.523089,2.412764,4.306687e+04
flws,5.781958,5.782938,5.912247,5.648923,2.317831e+05
flxs,22.836367,22.825351,23.190236,22.472776,1.491429e+04


In [3]:
# Average of every numeric column per symbol. numeric_only=True is required on
# pandas >= 2.0, where averaging the string 'date' column raises TypeError.
means = stock_dfs.groupby('name').mean(numeric_only=True)
# Symbols with the highest / lowest average closing price.
max_close_id = means['close'].idxmax()  # amzn
min_close_id = means['close'].idxmin()  # blfs
# {symbol: average close}, keyed by the group index.
stock_avg_closes = means['close'].to_dict()
display(stock_avg_closes)

{'aal': 22.074953666795338,
 'aame': 2.7796795366795344,
 'aaon': 23.61738606177606,
 'aapl': 257.17654040231656,
 'aaww': 44.331602290347405,
 'aaxn': 11.863907341698843,
 'abax': 34.57868337992275,
 'abcb': 17.990475994208477,
 'abco': 47.647057967567655,
 'abeo': 2.5932200772200797,
 'abio': 2.2518008000000007,
 'abmd': 33.222420861003854,
 'abtl': 6.233108108494209,
 'acad': 13.82358687490347,
 'acet': 12.655212363320476,
 'acfc': 5.596733538610015,
 'acgl': 63.325907376833804,
 'achc': 24.047795338223956,
 'achn': 5.941177606949804,
 'aciw': 28.27269496023162,
 'acls': 3.343806946718146,
 'acnb': 17.343528900386115,
 'acor': 27.47286873938217,
 'acta': 11.32055983706564,
 'actg': 15.997490346718152,
 'acxm': 18.26306178378379,
 'adbe': 51.19943628416986,
 'adi': 42.24018144826256,
 'admp': 1.7122164397683428,
 'adp': 61.03234735559848,
 'adra': 27.3514517397683,
 'adrd': 22.51748262046331,
 'adre': 39.14505407104248,
 'adru': 22.371667961776062,
 'adsk': 42.247594632818625,
 'adtn

In [4]:
# Cross-check the pandas idxmax() answer against a plain-Python max over the
# {symbol: average close} dict; the last two printed lines should agree.
max_close = max(stock_avg_closes, key=lambda symbol: stock_avg_closes[symbol])
print(min_close_id, max_close, max_close_id, sep='\n')

blfs
amzn
amzn


In [5]:
# One entry per date: a list of {'volume': ..., 'name': ...} records, one per row
# of that date. Only the two needed columns are handed to the per-group function.
by_day_trades = (
    stock_dfs
    .groupby('date')[['volume', 'name']]
    .apply(lambda day_rows: day_rows.to_dict('records'))
)

In [6]:
def sum_day_trades(trades: list) -> dict:
    """Total the traded volume per symbol for one day's trade records.

    Args:
        trades: Records that each carry a ``'name'`` (symbol) and a
            ``'volume'`` entry.

    Returns:
        Mapping of symbol to the sum of its volumes, with symbols in order
        of first appearance.
    """
    totals = {}
    for record in trades:
        symbol = record['name']
        totals[symbol] = totals.get(symbol, 0) + record['volume']
    return totals

# Per-date {symbol: total volume} lookup built from the grouped trade records.
by_day_trade_totals = {
    day: sum_day_trades(records) for day, records in by_day_trades.items()
}
display(by_day_trade_totals['2007-01-03'])

# For every date keep the (volume, symbol) pair of the most heavily traded symbol.
most_traded_stock_by_day = {}
for date, trades in by_day_trade_totals.items():
    symbol, volume = max(trades.items(), key=lambda item: item[1])
    most_traded_stock_by_day[date] = (volume, symbol)
display(most_traded_stock_by_day)

{'dgica': 30100,
 'bdge': 100,
 'cvco': 36500,
 'blkb': 365800,
 'bbox': 108200,
 'ffbc': 192600,
 'fbiz': 400,
 'ffic': 46300,
 'bdsi': 29500,
 'amgn': 12908400,
 'expe': 1703300,
 'expd': 2343300,
 'clct': 25100,
 'alny': 269000,
 'evol': 72900,
 'ahgp': 127900,
 'dfbg': 4700,
 'afsi': 1312400,
 'chy': 203200,
 'bmrn': 1946200,
 'agys': 993300,
 'adrd': 19700,
 'drrx': 380300,
 'crus': 1216400,
 'brew': 15500,
 'fbms': 0,
 'emcf': 1600,
 'bsqr': 63000,
 'csfl': 4300,
 'car': 1146900,
 'cmcsa': 39543600,
 'cmtl': 241800,
 'elos': 820400,
 'eltk': 11600,
 'agii': 151000,
 'coke': 23300,
 'egan': 4000,
 'cpss': 114000,
 'adtn': 2596900,
 'ffiv': 1740400,
 'cspi': 3500,
 'bwen': 2700,
 'cgnx': 434200,
 'cdns': 5240400,
 'egt': 10600,
 'cray': 485600,
 'arcw': 500,
 'bncn': 27400,
 'admp': 1100,
 'cnsl': 205100,
 'abax': 193000,
 'aris': 2000,
 'cyrn': 257600,
 'asys': 8500,
 'bosc': 700,
 'achn': 54900,
 'ahpi': 4200,
 'ebay': 45536900,
 'apog': 375000,
 'camp': 235100,
 'acor': 266300,


{'2007-01-03': (309579900, 'aapl'),
 '2007-01-04': (211815100, 'aapl'),
 '2007-01-05': (208685400, 'aapl'),
 '2007-01-08': (199276700, 'aapl'),
 '2007-01-09': (837324600, 'aapl'),
 '2007-01-10': (738220000, 'aapl'),
 '2007-01-11': (360063200, 'aapl'),
 '2007-01-12': (328172600, 'aapl'),
 '2007-01-16': (311019100, 'aapl'),
 '2007-01-17': (411565000, 'aapl'),
 '2007-01-18': (591151400, 'aapl'),
 '2007-01-19': (341118400, 'aapl'),
 '2007-01-22': (363506500, 'aapl'),
 '2007-01-23': (301856100, 'aapl'),
 '2007-01-24': (231953400, 'aapl'),
 '2007-01-25': (226493400, 'aapl'),
 '2007-01-26': (246718500, 'aapl'),
 '2007-01-29': (225416100, 'aapl'),
 '2007-01-30': (144492600, 'aapl'),
 '2007-01-31': (214017300, 'aapl'),
 '2007-02-01': (166085500, 'aapl'),
 '2007-02-02': (155382500, 'aapl'),
 '2007-02-05': (144713100, 'aapl'),
 '2007-02-06': (216098400, 'aapl'),
 '2007-02-07': (266706300, 'aapl'),
 '2007-02-08': (169757700, 'aapl'),
 '2007-02-09': (215135200, 'aapl'),
 '2007-02-12': (181017900, '

In [7]:
# Show the {date: (volume, symbol)} mapping of each day's most-traded stock.
most_traded_stock_by_day

{'2007-01-03': (309579900, 'aapl'),
 '2007-01-04': (211815100, 'aapl'),
 '2007-01-05': (208685400, 'aapl'),
 '2007-01-08': (199276700, 'aapl'),
 '2007-01-09': (837324600, 'aapl'),
 '2007-01-10': (738220000, 'aapl'),
 '2007-01-11': (360063200, 'aapl'),
 '2007-01-12': (328172600, 'aapl'),
 '2007-01-16': (311019100, 'aapl'),
 '2007-01-17': (411565000, 'aapl'),
 '2007-01-18': (591151400, 'aapl'),
 '2007-01-19': (341118400, 'aapl'),
 '2007-01-22': (363506500, 'aapl'),
 '2007-01-23': (301856100, 'aapl'),
 '2007-01-24': (231953400, 'aapl'),
 '2007-01-25': (226493400, 'aapl'),
 '2007-01-26': (246718500, 'aapl'),
 '2007-01-29': (225416100, 'aapl'),
 '2007-01-30': (144492600, 'aapl'),
 '2007-01-31': (214017300, 'aapl'),
 '2007-02-01': (166085500, 'aapl'),
 '2007-02-02': (155382500, 'aapl'),
 '2007-02-05': (144713100, 'aapl'),
 '2007-02-06': (216098400, 'aapl'),
 '2007-02-07': (266706300, 'aapl'),
 '2007-02-08': (169757700, 'aapl'),
 '2007-02-09': (215135200, 'aapl'),
 '2007-02-12': (181017900, '

In [11]:
# Collapse each day's per-symbol totals in by_day_trade_totals into a single
# overall volume figure for that day.
by_day_total_trades = {
    day: sum(per_symbol.values())
    for day, per_symbol in by_day_trade_totals.items()
}
display(by_day_total_trades)

{'2007-01-03': 996359900,
 '2007-01-04': 830112500,
 '2007-01-05': 738620900,
 '2007-01-08': 743314800,
 '2007-01-09': 1416558400,
 '2007-01-10': 1315041800,
 '2007-01-11': 969483700,
 '2007-01-12': 985230800,
 '2007-01-16': 940514100,
 '2007-01-17': 1036131100,
 '2007-01-18': 1187800900,
 '2007-01-19': 884970000,
 '2007-01-22': 884708600,
 '2007-01-23': 877569300,
 '2007-01-24': 948420100,
 '2007-01-25': 969960200,
 '2007-01-26': 828665600,
 '2007-01-29': 760918200,
 '2007-01-30': 594663900,
 '2007-01-31': 783022500,
 '2007-02-01': 771135300,
 '2007-02-02': 713955300,
 '2007-02-05': 641509800,
 '2007-02-06': 770228800,
 '2007-02-07': 860303200,
 '2007-02-08': 724329300,
 '2007-02-09': 780471300,
 '2007-02-12': 644847200,
 '2007-02-13': 684019300,
 '2007-02-14': 755958900,
 '2007-02-15': 658319400,
 '2007-02-16': 575412800,
 '2007-02-20': 682362600,
 '2007-02-21': 774421900,
 '2007-02-22': 730870000,
 '2007-02-23': 594946800,
 '2007-02-26': 684389100,
 '2007-02-27': 1120232900,
 '2007-

In [16]:
# Find the top 10 trading days: order the dates by total volume (ascending),
# so the ten busiest days are the last ten entries, busiest last.
sorted_days = sorted(
    by_day_total_trades,
    key=lambda day: by_day_total_trades[day],
)
display(sorted_days[-10:])

['2008-01-24',
 '2008-01-16',
 '2007-11-08',
 '2008-09-29',
 '2008-02-07',
 '2008-01-22',
 '2008-10-08',
 '2007-07-26',
 '2008-10-10',
 '2008-01-23']

In [18]:
# Print the ten busiest dates with their total volume, busiest last.
for date in sorted_days[-10:]:
    print(f"{date} {by_day_total_trades[date]}")

2008-01-24 1533363200
2008-01-16 1536176400
2007-11-08 1553880500
2008-09-29 1555072400
2008-02-07 1559032100
2008-01-22 1578877700
2008-10-08 1599183500
2007-07-26 1611272800
2008-10-10 1770266900
2008-01-23 1964583900


In [21]:
# most profitable stocks
# Collect one DataFrame per symbol, in the order groupby yields the groups.
symbol_data = []
for _symbol, symbol_frame in stock_dfs.groupby('name'):
    symbol_data.append(symbol_frame)
print(symbol_data[1])

            date  close  open  high   low  volume  name
0     2007-01-03   3.05  3.10  3.18  3.05    5500  aame
1     2007-01-04   3.11  3.12  3.20  3.10   20000  aame
2     2007-01-05   3.42  3.18  3.43  3.17   18600  aame
3     2007-01-08   3.59  3.45  3.62  3.45    9000  aame
4     2007-01-09   3.63  3.75  3.75  3.59   16800  aame
...          ...    ...   ...   ...   ...     ...   ...
2585  2017-04-10   3.98  3.95  4.00  3.95    1400  aame
2586  2017-04-11   3.80  4.00  4.03  3.80    5000  aame
2587  2017-04-12   3.80  3.80  4.00  3.80    1600  aame
2588  2017-04-13   3.80  3.80  3.80  3.80     600  aame
2589  2017-04-17   3.85  3.70  3.95  3.70    2700  aame

[2590 rows x 7 columns]


In [24]:
# Relative growth of each symbol's closing price over its recorded history:
# (last close - first close) / first close.
stock_growths = {}
for syb_data in symbol_data:
    # Address values by column name instead of position or index label 0, so
    # this keeps working if the CSV column order changes or the frame is
    # re-indexed.
    name = syb_data['name'].iloc[0]
    # Dates are ISO-formatted strings, so sorting them is chronological; this
    # removes the dependency on the row order inside each CSV.
    closes = syb_data.sort_values('date')['close']
    start_price = closes.iloc[0]
    end_price = closes.iloc[-1]
    if not start_price > 0:
        # A zero or missing first close would give inf/NaN and corrupt the ranking.
        continue
    growth = (end_price - start_price) / start_price
    stock_growths[name] = growth
# Ascending by growth, so the ten best performers are the last ten, best last.
sorted_growths = sorted(stock_growths, key=stock_growths.get)
for stock in sorted_growths[-10:]:
    print(stock, stock_growths[stock])

achc 13.300000666666667
bcli 13.392137535980346
cui 15.251625162516252
apdn 15.496700659868026
anip 17.073554472785034
amzn 22.307234281466815
blfs 24.374365640858976
arcw 38.9860048982856
adxs 40.050000000000004
admp 74.83838922594839


ADDITIONAL QUESTIONS TO ANSWER (SUGGESTED BY DATAQUEST):

- Which stocks would have been the best to short-sell at the start of the period?
- Which stocks have the most after-hours trading, and show the biggest changes between the closing price and the next day's open?
- Can technical indicators like Bollinger Bands help us forecast the market?
- Which time periods have resulted in steady increases in prices, and which periods have resulted in steady declines?
- Based on price, what was the optimal day to buy each stock if we wanted to hold them until now?
- On days with high trading volume, do stocks move in one direction (up or down) more than the other one?