In [1]:
from pyspark.sql import SparkSession, functions, Window

In [2]:
def get_session():
    return SparkSession \
        .builder \
        .appName('Market Analytics') \
        .getOrCreate()

In [3]:
sess = get_session()

## Overview datasets

In [4]:
!ls -l dataset/Stocks | head

total 686608
-rw-rw-r-- 1 faisal faisal  21911 Nov 13  2017 aaap.us.txt
-rw-rw-r-- 1 faisal faisal 246552 Nov 13  2017 aaba.us.txt
-rw-rw-r-- 1 faisal faisal  33723 Nov 13  2017 aac.us.txt
-rw-rw-r-- 1 faisal faisal  49499 Nov 13  2017 aal.us.txt
-rw-rw-r-- 1 faisal faisal  50642 Nov 13  2017 aamc.us.txt
-rw-rw-r-- 1 faisal faisal 135301 Nov 13  2017 aame.us.txt
-rw-rw-r-- 1 faisal faisal 155628 Nov 13  2017 aan.us.txt
-rw-rw-r-- 1 faisal faisal  46086 Nov 13  2017 aaoi.us.txt
-rw-rw-r-- 1 faisal faisal 154078 Nov 13  2017 aaon.us.txt
ls: write error: Broken pipe


In [5]:
!cat dataset/Stocks/aac.us.txt | head

Date,Open,High,Low,Close,Volume,OpenInt
2014-10-02,20,20.1,17.6,18.5,2799073,0
2014-10-03,18.2,18.75,18.05,18.65,155562,0
2014-10-06,18.48,19.58,18.48,19.24,188229,0
2014-10-07,19.25,19.48,18.93,19.24,176606,0
2014-10-08,19.17,19.48,18.9,19.13,37046,0
2014-10-09,19.16,19.55,19,19.25,114523,0
2014-10-10,19.25,19.36,18.77,19,34775,0
2014-10-13,18.92,19.1,18.64,19,52514,0
2014-10-14,19.03,19.03,17.91,18.1,126457,0


In [6]:
!du -sh .

1,1G	.


## Load Datasets

In [7]:
@functions.udf
def extract_symbol(filename):
    return filename.split('/')[-1].split('.')[0]

def load_data(session):
    df = session.read.format('csv').option('header', 'true').load('dataset/Stocks/*.txt')
    df = df.drop('OpenInt')
    df = df.withColumn('Symbol', extract_symbol(functions.input_file_name()))
    df = df.withColumn('Date', df['Date'].cast('date'))
    df = df.withColumn('Open', df['Open'].cast('float'))
    df = df.withColumn('High', df['High'].cast('float'))
    df = df.withColumn('Low', df['Low'].cast('float'))
    df = df.withColumn('Close', df['Close'].cast('float'))
    return df

In [8]:
load = load_data(sess)
load.show(10)

+----------+------+------+------+------+-------+------+
|      Date|  Open|  High|   Low| Close| Volume|Symbol|
+----------+------+------+------+------+-------+------+
|1962-01-02|0.6277|0.6362|0.6201|0.6201|2575579|    ge|
|1962-01-03|0.6201|0.6201|0.6122|0.6201|1764749|    ge|
|1962-01-04|0.6201|0.6201|0.6037|0.6122|2194010|    ge|
|1962-01-05|0.6122|0.6122|0.5798|0.5957|3255244|    ge|
|1962-01-08|0.5957|0.5957|0.5716|0.5957|3696430|    ge|
|1962-01-09|0.5957|0.6037|0.5878|0.5957|2778285|    ge|
|1962-01-10|0.5957|0.6037|0.5957|0.5957|2337096|    ge|
|1962-01-11|0.5957|0.5957|0.5878|0.5957|1943605|    ge|
|1962-01-12|0.5957|0.6037|0.5878|0.5878|2015151|    ge|
|1962-01-15|0.5957|0.5957|0.5957|0.5957|2527879|    ge|
+----------+------+------+------+------+-------+------+
only showing top 10 rows



## Technical Indicator

### Moving Average

In [9]:
days = lambda i: i * 86400 

def moving_average(df, symbol, day):
    df = df.where(df['Symbol'] == symbol.lower())
    
    w = (Window()
         .orderBy(df['Date'].cast('long'))
         .rangeBetween(-days(day), 0))

    return df.withColumn('sma', functions.avg('Close').over(w))

In [10]:
moving_average(load, 'aapl', 10).show(50)

+----------+-------+-------+-------+-------+---------+------+-----------------+
|      Date|   Open|   High|    Low|  Close|   Volume|Symbol|              sma|
+----------+-------+-------+-------+-------+---------+------+-----------------+
|1984-09-07|0.42388|0.42902|0.41874|0.42388| 23220030|  aapl|22.28101799117213|
|1984-09-10|0.42388|0.42516|0.41366|0.42134| 18022532|  aapl|22.28101799117213|
|1984-09-11|0.42516|0.43668|0.42516|0.42902| 42498199|  aapl|22.28101799117213|
|1984-09-12|0.42902|0.43157|0.41618|0.41618| 37125801|  aapl|22.28101799117213|
|1984-09-13|0.43927|0.44052|0.43927|0.43927| 57822062|  aapl|22.28101799117213|
|1984-09-14|0.44052|0.45589|0.44052|0.44566| 68847968|  aapl|22.28101799117213|
|1984-09-17|0.45718|0.46357|0.45718|0.45718| 53755262|  aapl|22.28101799117213|
|1984-09-18|0.45718|0.46103|0.44052|0.44052| 27136886|  aapl|22.28101799117213|
|1984-09-19|0.44052|0.44566|0.43157|0.43157| 29641922|  aapl|22.28101799117213|
|1984-09-20|0.43286|0.43668|0.43286|0.43