In [None]:
%%writefile nasdaq.csv
2017-01-03,7329,8520,6000,8777,6787,2725
2017-01-04,6604,6774,4204,5534,6082,4417
...
2019-09-28,8996,8620,9972,9564,7215,9455
2019-09-29,6021,7479,1173,5655,8865,4851

## Import and parse data

In [2]:
from pyspark import SparkConf, SparkContext

# Create a Spark context. getOrCreate reuses the context that is already
# running in this kernel, so re-executing the cell does not fail with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=SparkConf().setAppName("MyApp").setMaster("local"))

In [3]:
from collections import namedtuple
# One row of the price file: the date string followed by six numeric columns.
Record = namedtuple(
    'Record',
    ['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume'],
)

def parse_record(s):
    """Turn one comma-separated line of the price file into a Record.

    Column 0 is kept as the date string, columns 1-5 are converted with
    float() and column 6 (volume) with int(). Any columns after the seventh
    are ignored.
    """
    columns = s.split(',')
    date = columns[0]
    # Index-based access keeps the failure mode for short lines (IndexError).
    open_, high, low, close, adj_close = [float(columns[i]) for i in range(1, 6)]
    volume = int(columns[6])
    return Record(date, open_, high, low, close, adj_close, volume)

In [4]:
# Read the CSV line by line, parse every line into a Record, and cache the
# result because several later cells derive RDDs from it.
parsed_data = (
    sc.textFile('nasdaq.csv')
    .map(parse_record)
    .cache()
)
parsed_data.take(2)

[Record(date=u'2017-01-03', open=7329.0, high=8520.0, low=6000.0, close=8777.0, adj_close=6787.0, volume=2725),
 Record(date=u'2017-01-04', open=6604.0, high=6774.0, low=4204.0, close=5534.0, adj_close=6082.0, volume=4417)]

## Get next day

In [5]:
from datetime import datetime, timedelta

def get_next_date(s):
    """Return the calendar day after `s`.

    Both `s` and the returned value are strings in 'YYYY-MM-DD' form.
    """
    day_format = '%Y-%m-%d'
    current_day = datetime.strptime(s, day_format)
    following_day = current_day + timedelta(days=1)
    return following_day.strftime(day_format)

In [6]:
# Sanity check: the day after January 31st must roll over into February.
get_next_date('2017-01-31')

'2017-02-01'

## Join

In [7]:
# Key every record by its own date: (date, close price).
date_and_close_price = parsed_data.map(
    lambda rec: (rec.date, rec.close)
)
date_and_close_price.take(3)

[(u'2017-01-03', 8777.0), (u'2017-01-04', 5534.0), (u'2017-01-05', 9760.0)]

In [8]:
# Key every close price by the following calendar day, so that each date
# carries the close of the day before it: (date, previous day's close price).
date_and_prev_close_price = parsed_data.map(
    lambda rec: (get_next_date(rec.date), rec.close)
)
date_and_prev_close_price.take(3)

[('2017-01-04', 8777.0), ('2017-01-05', 5534.0), ('2017-01-06', 9760.0)]

In [9]:
# joined = (date, (close price, previous day's close price)).
# The inner join keeps only the dates that appear as a key in both RDDs.
joined = date_and_close_price.join(date_and_prev_close_price)
joined.take(3)

[(u'2018-03-06', (5497.0, 7289.0)),
 (u'2019-06-08', (3414.0, 2532.0)),
 (u'2019-06-06', (4919.0, 8851.0))]

In [10]:
# Look up a single date: returns a list with its
# (close price, previous day's close price) pair.
joined.lookup('2017-01-04')

[(5534.0, 8777.0)]

In [11]:
# returns = (date, daily return in percent), where
# daily return = (close price / previous day's close price - 1) * 100.
returns = joined.mapValues(
    lambda closes: (closes[0] / closes[1] - 1.0) * 100.0
)
returns.take(3)

[(u'2018-03-06', -24.584991082453012),
 (u'2019-06-08', 34.834123222748815),
 (u'2019-06-06', -44.424358829510794)]

In [12]:
# Daily return (in percent) of 2017-01-04 relative to the 2017-01-03 close.
returns.lookup('2017-01-04')

[-36.94884356841746]

## Left, right, full outer join

In [13]:
# Left outer join: keeps every date of date_and_close_price; a date without
# a previous-day close gets None as the second value.
joined_left = date_and_close_price.leftOuterJoin(date_and_prev_close_price)

In [14]:
# Right outer join: keeps every date of date_and_prev_close_price; a date
# without a close price of its own gets None as the first value.
joined_right = date_and_close_price.rightOuterJoin(date_and_prev_close_price)

In [15]:
# Full outer join: keeps the dates of both RDDs; the side that has no entry
# for a date is filled with None.
joined_full = date_and_close_price.fullOuterJoin(date_and_prev_close_price)

In [16]:
# First day of the data: 2017-01-03 has a close price of its own but no
# previous-day close, so only the joins that keep unmatched keys of the left
# RDD (left and full outer) return a row.
# print() is called as a function so the cell runs on Python 3 as well as
# Python 2. Output order: inner, left outer, right outer, full outer.
for rdd in (joined, joined_left, joined_right, joined_full):
    print(rdd.lookup('2017-01-03'))

[]
[(8777.0, None)]
[]
[(8777.0, None)]


In [18]:
# Day after the last record: the data ends on 2019-09-29, so 2019-09-30 only
# exists as a key of date_and_prev_close_price. Only the joins that keep
# unmatched keys of the right RDD (right and full outer) return a row.
# print() is called as a function so the cell runs on Python 3 as well as
# Python 2. Output order: inner, left outer, right outer, full outer.
for rdd in (joined, joined_left, joined_right, joined_full):
    print(rdd.lookup('2019-09-30'))

[]
[]
[(None, 5655.0)]
[(None, 5655.0)]
