In [1]:
%%writefile nasdaq.csv
2017-01-03,5425,620117,5452,56824,990234,5429,080078,5429080078,188620000
2017-01-04,5455,620156,5459,56849,990559,5456,969699,5429080955,188620959
2017-01-05,5495,660156,5895,56589,954559,5556,949699,5429088955,189520959
2017-02-09,4559,565522,5599,45559,841841,4959,181262,7448485848,226262281
2017-03-10,2252,757575,2424,49242,242442,4435,424224,4242424242,254453453

Writing nasdaq.csv


## Basic

In [2]:
import shutil

from pyspark import SparkConf, SparkContext

# Create a Spark context (application name "MyApp", local master).
# getOrCreate returns the already-running context when there is one, so this
# cell can be re-run without failing because a context is already active.
sc = SparkContext.getOrCreate(conf=SparkConf().setAppName("MyApp").setMaster("local"))

In [3]:
# Distribute a small Python list as an RDD
a = sc.parallelize([1, 2, 3, 4, 5])

In [4]:
# Check how many partitions the RDD was split into
a.getNumPartitions()

1

In [5]:
# Check the RDD contents by collecting all of its elements into a list
a.collect()

[1, 2, 3, 4, 5]

In [6]:
# Map: build a new RDD in which every element of `a` is doubled
b = a.map(lambda value: 2 * value)
b.collect()

[2, 4, 6, 8, 10]

## RDD from file

In [7]:
# Create an RDD from the nasdaq.csv text file (one element per line of the file)
raw_data = sc.textFile('nasdaq.csv')

In [8]:
# Peek at the first two lines to check the content
raw_data.take(2)

[u'2017-01-03,5425,620117,5452,56824,990234,5429,080078,5429080078,188620000',
 u'2017-01-04,5455,620156,5459,56849,990559,5456,969699,5429080955,188620959']

In [9]:
from collections import namedtuple
# One row of the quotes file: a date string, five float columns and an
# integer volume.
Record = namedtuple(
    'Record',
    ['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume'],
)


def parse_record(s):
    """Turn one comma-separated line into a ``Record``.

    Field 0 is kept as the date string, fields 1-5 are converted with
    ``float`` (open, high, low, close, adj_close) and field 6 with ``int``
    (volume).

    Raises ``IndexError`` when the line has fewer than seven fields and
    ``ValueError`` when a numeric field cannot be converted.

    NOTE(review): only the first seven comma-separated fields are read; any
    further fields on the line are silently ignored -- confirm that the
    input file really has seven columns per row.
    """
    columns = s.split(',')
    date = columns[0]
    prices = [float(columns[position]) for position in range(1, 6)]
    volume = int(columns[6])
    return Record._make([date] + prices + [volume])

In [10]:
# Parse every raw line into a Record, then preview the first two results
parsed_data = raw_data.map(parse_record)
parsed_data.take(2)

[Record(date=u'2017-01-03', open=5425.0, high=620117.0, low=5452.0, close=56824.0, adj_close=990234.0, volume=5429),
 Record(date=u'2017-01-04', open=5455.0, high=620156.0, low=5459.0, close=56849.0, adj_close=990559.0, volume=5456)]

In [11]:
# Cache the parsed RDD: the analysis cells below run several actions on it
parsed_data.cache()

PythonRDD[6] at RDD at PythonRDD.scala:48

## Analysis

In [12]:
# Earliest date (the yyyy-mm-dd strings compare in chronological order)
parsed_data.map(lambda record: record.date).min()

u'2017-01-03'

In [13]:
# Latest date (the yyyy-mm-dd strings compare in chronological order)
parsed_data.map(lambda record: record.date).max()

u'2017-03-10'

In [14]:
# Total trade volume: sum of the volume field over all records
parsed_data.map(lambda record: record.volume).sum()

25835

## Map-reduce

In [15]:
# Key every record by its month: the first seven characters of the date
# ("yyyy-mm") become the key and the full Record stays as the value
with_month_data = parsed_data.map(lambda record: (record.date[:7], record))
with_month_data.take(2)

[(u'2017-01',
  Record(date=u'2017-01-03', open=5425.0, high=620117.0, low=5452.0, close=56824.0, adj_close=990234.0, volume=5429)),
 (u'2017-01',
  Record(date=u'2017-01-04', open=5455.0, high=620156.0, low=5459.0, close=56849.0, adj_close=990559.0, volume=5456))]

In [16]:
# Total volume per month by map-reduce: keep only the volume of each record,
# then add up the volumes that share the same month key
by_month_data = (
    with_month_data
    .mapValues(lambda record: record.volume)
    .reduceByKey(lambda left, right: left + right)
)
by_month_data.take(5)

[(u'2017-01', 16441), (u'2017-02', 4959), (u'2017-03', 4435)]

In [17]:
# Month with the largest total volume: take the top 1 pair, ranked by its
# second element (the volume)
by_month_data.top(1, key=lambda pair: pair[1])

[(u'2017-01', 16441)]

In [18]:
# Clean and save data: format each (month, total volume) pair as "month,volume"
result_data = by_month_data.map(lambda t: ','.join(map(str, t)))
# print() with one parenthesised argument behaves the same on Python 2 and 3
print(result_data.take(3))

# saveAsTextFile refuses to write into a directory that already exists, so
# remove the output of a previous run first; this keeps the cell re-runnable.
shutil.rmtree('outdir', ignore_errors=True)
# A single partition yields a single part file inside 'outdir'
result_data.repartition(1).saveAsTextFile('outdir')

['2017-01,16441', '2017-02,4959', '2017-03,4435']
