In [94]:
import logging
import time
import os

model_name = 'FredTimeseries' 
# path of log file
log_path = "./"

# local or cluster
#run_mode = 'cluster'
run_mode = 'local'

# other stuff
sc_setLogLevel = "WARN"  # ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN
seed = 42


#################################################
# ### START
#################################################

# set logging
logger = logging.getLogger(model_name)
hdlr = logging.handlers.RotatingFileHandler(os.path.join(log_path, model_name + ".log"), maxBytes=1000000,
                                            backupCount=5, mode='w')
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr) 
logger.setLevel(logging.INFO)


logger.info("Start Spark")

run_mode="local"
sc_setLogLevel = "INFO"  # ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN
seed = 42
if run_mode == 'cluster':
    SPARK_SUBMIT_ARGS = "--conf spark.dynamicAllocation.enabled=true " \
                        "--conf spark.shuffle.service.enabled=true " \
                        "--conf spark.dynamicAllocation.maxExecutors=80 " \
                        "--conf spark.dynamicAllocation.minExecutors=10 " \
                        "--conf spark.dynamicAllocation.executorIdleTimeout=120 " \
                        "--queue datascience.normal " \
                        "--driver-memory 16g " \
                        "--executor-memory 8g " \
                        "--conf spark.shuffle.manager=tungsten-sort " \
                        "pyspark-shell "
else:
    SPARK_SUBMIT_ARGS = " --master local[*]"
    SPARK_SUBMIT_ARGS += " --driver-memory 12g --executor-memory 4g --num-executors 8"
    SPARK_SUBMIT_ARGS += "--packages com.databricks:spark-csv_2.11:1.5.0"
    SPARK_SUBMIT_ARGS += "--spark.sql.pivotMaxValues=200000"
    jars_dir = "/Users/guillermobreto/Downloads/jars/"
    SPARK_SUBMIT_ARGS += " --jars "
    SPARK_SUBMIT_ARGS += ("local:" + jars_dir + "/sparkts-0.3.0-jar-with-dependencies.jar")
    SPARK_SUBMIT_ARGS += (",local:" + jars_dir + "sparkts-0.4.0-SNAPSHOT-jar-with-dependencies.jar")
    #SPARK_SUBMIT_ARGS += (",local:" + jars_dir + "/commons-csv-1.1.jar")
    #SPARK_SUBMIT_ARGS += (",local:" + jars_dir + "/univocity-parsers-1.5.1.jar")
    SPARK_SUBMIT_ARGS += " pyspark-shell"

os.environ["PYSPARK_SUBMIT_ARGS"] = SPARK_SUBMIT_ARGS

sc = SparkContext()
sqlContext = HiveContext(sc)
#sqlContext.setConf("spark.sql.parquet.binaryAsString", "true")
sc.setLogLevel(sc_setLogLevel)

logger.info("Default Parallelism: {}, Spark Version: {}".format(sc.defaultParallelism, sc.version))
logger.info("------------------------------------------------")

In [1]:
import logging
import time
import os

model_name = 'FredTimeseries' 
# path of log file
log_path = "./"

# local or cluster
#run_mode = 'cluster'
run_mode = 'local'


#################################################
# ### START
#################################################

# set logging
logger = logging.getLogger(model_name)
hdlr = logging.handlers.RotatingFileHandler(os.path.join(log_path, model_name + ".log"), maxBytes=1000000,
                                            backupCount=5, mode='w')
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr) 
logger.setLevel(logging.INFO)


logger.info("Start Spark")

sc_setLogLevel = "INFO"  # ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN
seed = 42

from datetime import datetime

from pyspark import SparkContext, SQLContext
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, TimestampType, DoubleType, StringType, IntegerType

from sparkts.datetimeindex import uniform, BusinessDayFrequency
from sparkts.timeseriesrdd import time_series_rdd_from_observations

from pyspark.sql.functions import explode
import pyspark.sql.functions as f
from pyspark.sql.functions import udf
from datetime import datetime

import numpy as np
import pandas as pd

In [2]:
sc.version

u'1.6.2'

In [3]:
import sparkts.datetimeindex as dt

In [4]:
dt.DayFrequency

sparkts.datetimeindex.DayFrequency

In [5]:
def lineToRow(line):
    (year, month, day, symbol, volume, price) = line.split("\t")
    # Python 2.x compatible timestamp generation
    dt = datetime(int(year), int(month), int(day))
    return (dt, symbol, float(price))

def loadObservations(sparkContext, sqlContext, path):
    textFile = sparkContext.textFile(path)
    rowRdd = textFile.map(lineToRow)
    schema = StructType([
        StructField('timestamp', TimestampType(), nullable=True),
        StructField('symbol', StringType(), nullable=True),
        StructField('price', DoubleType(), nullable=True),
    ])
    return sqlContext.createDataFrame(rowRdd, schema);

# Get the data

In [6]:
%%time
rdd = sc.wholeTextFiles("/Users/guillermobreto/Downloads/fred_timeseries/data/fred_codes/")
print("Number of time series: {}".format(rdd.count()))


Number of time series: 40809
CPU times: user 74.4 ms, sys: 28.9 ms, total: 103 ms
Wall time: 11min 20s


In [13]:
original = rdd

In [8]:
from sparkts.datetimeindex import DayFrequency

In [9]:
freq = DayFrequency(1,sc)

In [10]:
%matplotlib inline 
import matplotlib.pyplot as plt

In [11]:
#freq = BusinessDayFrequency(1, 1, sc)
dtIndex = uniform(start='2005-02-01T00:00-00:00', end='2005-06-01T00:00-00:00', freq=freq, sc=sc)

In [138]:
rdd_df = rdd.map(lambda r: (r[0].split("/")[-1].strip(".csv"),filter(None, r[1].split("\n")[1:]))).toDF(["symbol","v"])

In [12]:
%%time
rdd_df.select("symbol").distinct().count()

CPU times: user 22.6 ms, sys: 9.43 ms, total: 32.1 ms
Wall time: 2min 3s


40809

In [13]:
#rdd_df=rdd_df.limit(100)
rdd_df.show(3)

+--------------------+--------------------+
|              symbol|                   v|
+--------------------+--------------------+
|FRED_00XALCCHM086...|[2005-01-01,97.0,...|
|FRED_00XALCFIM086...|[1996-01-01,71.22...|
|FRED_00XALCHRM086...|[2004-12-01,81.32...|
+--------------------+--------------------+
only showing top 3 rows



In [139]:
rdd_df=rdd_df.limit(1000)

In [14]:
%%time
rdd_df.select("symbol").distinct().count()

CPU times: user 22.7 ms, sys: 9.47 ms, total: 32.2 ms
Wall time: 2min 3s


40809

In [140]:
rdd_df_exp =  rdd_df.select([rdd_df.symbol,explode(rdd_df.v).alias("DATA-VALUE")])

In [141]:
valueUdf = udf(lambda s: float(s.split(",")[1]), DoubleType())
dateUdf = udf(lambda s: s.split(",")[0], StringType())
new_df =rdd_df_exp.withColumn("Date", (f.to_date(f.lit(dateUdf(rdd_df_exp["DATA-VALUE"]))).cast(TimestampType())))
new_df =new_df.withColumn("price", valueUdf(new_df["DATA-VALUE"]))

In [142]:
new_df.show(3)

+--------------------+---------------+--------------------+-----+
|              symbol|     DATA-VALUE|                Date|price|
+--------------------+---------------+--------------------+-----+
|FRED_00XALCCHM086...|2005-01-01,97.0|2005-01-01 00:00:...| 97.0|
|FRED_00XALCCHM086...|2005-02-01,97.3|2005-02-01 00:00:...| 97.3|
|FRED_00XALCCHM086...|2005-03-01,97.4|2005-03-01 00:00:...| 97.4|
+--------------------+---------------+--------------------+-----+
only showing top 3 rows



In [143]:
freq = DayFrequency(1,sc)
dtIndex = uniform(start='2015-01-01T00:00-05:00', end='2016-10-01T00:00-05:00', freq=freq, sc=sc)

In [144]:
dates = ("2000-11-30",  "2016-10-30")
date_from, date_to = [f.to_date(f.lit(s)).cast(TimestampType()) for s in dates]
df_filtered = new_df.where((new_df.Date > date_from) & (new_df.Date < date_to))

In [145]:
df_filtered.show(3)

+--------------------+---------------+--------------------+-----+
|              symbol|     DATA-VALUE|                Date|price|
+--------------------+---------------+--------------------+-----+
|FRED_00XALCCHM086...|2005-01-01,97.0|2005-01-01 00:00:...| 97.0|
|FRED_00XALCCHM086...|2005-02-01,97.3|2005-02-01 00:00:...| 97.3|
|FRED_00XALCCHM086...|2005-03-01,97.4|2005-03-01 00:00:...| 97.4|
+--------------------+---------------+--------------------+-----+
only showing top 3 rows



In [146]:
df = df_filtered.select(["symbol", "Date", "price"])
df = df.withColumnRenamed("Date", "timestamp")

In [147]:
df.show(2, truncate=False)

+---------------------+---------------------+-----+
|symbol               |timestamp            |price|
+---------------------+---------------------+-----+
|FRED_00XALCCHM086NEST|2005-01-01 00:00:00.0|97.0 |
|FRED_00XALCCHM086NEST|2005-02-01 00:00:00.0|97.3 |
+---------------------+---------------------+-----+
only showing top 2 rows



In [148]:
pd.to_datetime(["2000-11-30"])
print(dates[0])

2000-11-30


## Get the S&P 500

In [149]:
import datetime
import pandas as pd
import pandas.io.data
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0)

start = pd.to_datetime(dates[0])
end = pd.to_datetime(dates[1])

sp =  pd.io.data.get_data_yahoo('^GSPC', start, end)


sp.columns.values[-1] = 'AdjClose'
sp.columns = sp.columns + '_SP500'
sp['Return_SP500'] = sp['AdjClose_SP500'].pct_change()
sp.columns

sp = sp.reset_index()[["Date","Close_SP500"]]
sp["Date"]=sp["Date"].map(lambda x: str(x))
sp["symbol"]="Close_SP500"

sp.columns = [ "Date", "price", "symbol"]
sp_df = sqlContext.createDataFrame(sp)
sp_df = sp_df.select(sp_df.symbol, f.to_date(sp_df.Date).cast(TimestampType()).alias("timestamp"), sp_df.price)
sp_df_filtered = sp_df.where((sp_df.timestamp > date_from) & (sp_df.timestamp < date_to))
sp_df_filtered.show(3)

+-----------+--------------------+-----------+
|     symbol|           timestamp|      price|
+-----------+--------------------+-----------+
|Close_SP500|2000-12-01 00:00:...| 1315.22998|
|Close_SP500|2000-12-04 00:00:...|1324.969971|
|Close_SP500|2000-12-05 00:00:...|1376.540039|
+-----------+--------------------+-----------+
only showing top 3 rows



In [150]:
unioned = sp_df_filtered.rdd.union(df.rdd)

In [151]:
unioned_df = unioned.toDF()

In [152]:
unioned_df.show(3)

+-----------+--------------------+-----------+
|     symbol|           timestamp|      price|
+-----------+--------------------+-----------+
|Close_SP500|2000-12-01 00:00:...| 1315.22998|
|Close_SP500|2000-12-04 00:00:...|1324.969971|
|Close_SP500|2000-12-05 00:00:...|1376.540039|
+-----------+--------------------+-----------+
only showing top 3 rows



In [153]:
unioned_df.printSchema()

root
 |-- symbol: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- price: double (nullable = true)



In [154]:
%%time
tickerTsrdd = time_series_rdd_from_observations(dtIndex, unioned_df, "timestamp", "symbol", "price")

CPU times: user 1.65 ms, sys: 1.03 ms, total: 2.69 ms
Wall time: 8.23 ms


In [155]:
%%time
tickerTsrdd.take(2)

CPU times: user 6.2 ms, sys: 2.88 ms, total: 9.08 ms
Wall time: 21 s


[(u'FRED_00XHOUEU27M086NEST',
  array([ 116.38,     nan,     nan,     nan,     nan,     nan,     nan,
             nan,     nan,     nan,     nan,     nan,     nan,     nan,
             nan,     nan,     nan,     nan,     nan,     nan,     nan,
             nan,     nan,     nan,     nan,     nan,     nan,     nan,
             nan,     nan,     nan,  117.07,     nan,     nan,     nan,
             nan,     nan,     nan,     nan,     nan,     nan,     nan,
             nan,     nan,     nan,     nan,     nan,     nan,     nan,
             nan,     nan,     nan,     nan,     nan,     nan,     nan,
             nan,     nan,     nan,  118.29,     nan,     nan,     nan,
             nan,     nan,     nan,     nan,     nan,     nan,     nan,
             nan,     nan,     nan,     nan,     nan,     nan,     nan,
             nan,     nan,     nan,     nan,     nan,     nan,     nan,
             nan,     nan,     nan,     nan,     nan,     nan,  118.67,
             nan,     nan,     nan

In [156]:
filled = tickerTsrdd.fill("linear")

In [157]:
filled.take(2)

[(u'FRED_00XHOUEU27M086NEST',
  array([ 116.38      ,  116.40225806,  116.42451613,  116.44677419,
          116.46903226,  116.49129032,  116.51354839,  116.53580645,
          116.55806452,  116.58032258,  116.60258065,  116.62483871,
          116.64709677,  116.66935484,  116.6916129 ,  116.71387097,
          116.73612903,  116.7583871 ,  116.78064516,  116.80290323,
          116.82516129,  116.84741935,  116.86967742,  116.89193548,
          116.91419355,  116.93645161,  116.95870968,  116.98096774,
          117.00322581,  117.02548387,  117.04774194,  117.07      ,
          117.11357143,  117.15714286,  117.20071429,  117.24428571,
          117.28785714,  117.33142857,  117.375     ,  117.41857143,
          117.46214286,  117.50571429,  117.54928571,  117.59285714,
          117.63642857,  117.68      ,  117.72357143,  117.76714286,
          117.81071429,  117.85428571,  117.89785714,  117.94142857,
          117.985     ,  118.02857143,  118.07214286,  118.11571429,
    

In [158]:
rr = filled.map(lambda ts: (ts[0], np.nan_to_num(ts[1])))

In [159]:
rr.take(1)

[(u'FRED_00XHOUEU27M086NEST',
  array([ 116.38      ,  116.40225806,  116.42451613,  116.44677419,
          116.46903226,  116.49129032,  116.51354839,  116.53580645,
          116.55806452,  116.58032258,  116.60258065,  116.62483871,
          116.64709677,  116.66935484,  116.6916129 ,  116.71387097,
          116.73612903,  116.7583871 ,  116.78064516,  116.80290323,
          116.82516129,  116.84741935,  116.86967742,  116.89193548,
          116.91419355,  116.93645161,  116.95870968,  116.98096774,
          117.00322581,  117.02548387,  117.04774194,  117.07      ,
          117.11357143,  117.15714286,  117.20071429,  117.24428571,
          117.28785714,  117.33142857,  117.375     ,  117.41857143,
          117.46214286,  117.50571429,  117.54928571,  117.59285714,
          117.63642857,  117.68      ,  117.72357143,  117.76714286,
          117.81071429,  117.85428571,  117.89785714,  117.94142857,
          117.985     ,  118.02857143,  118.07214286,  118.11571429,
    

In [160]:
previous = filled.fill("previous")

In [161]:
previous.take(3)

[(u'FRED_00XHOUEU27M086NEST',
  array([ 116.38      ,  116.40225806,  116.42451613,  116.44677419,
          116.46903226,  116.49129032,  116.51354839,  116.53580645,
          116.55806452,  116.58032258,  116.60258065,  116.62483871,
          116.64709677,  116.66935484,  116.6916129 ,  116.71387097,
          116.73612903,  116.7583871 ,  116.78064516,  116.80290323,
          116.82516129,  116.84741935,  116.86967742,  116.89193548,
          116.91419355,  116.93645161,  116.95870968,  116.98096774,
          117.00322581,  117.02548387,  117.04774194,  117.07      ,
          117.11357143,  117.15714286,  117.20071429,  117.24428571,
          117.28785714,  117.33142857,  117.375     ,  117.41857143,
          117.46214286,  117.50571429,  117.54928571,  117.59285714,
          117.63642857,  117.68      ,  117.72357143,  117.76714286,
          117.81071429,  117.85428571,  117.89785714,  117.94142857,
          117.985     ,  118.02857143,  118.07214286,  118.11571429,
    

In [162]:
nearest = previous.fill("nearest")

In [163]:
previous.take(1)

[(u'FRED_00XHOUEU27M086NEST',
  array([ 116.38      ,  116.40225806,  116.42451613,  116.44677419,
          116.46903226,  116.49129032,  116.51354839,  116.53580645,
          116.55806452,  116.58032258,  116.60258065,  116.62483871,
          116.64709677,  116.66935484,  116.6916129 ,  116.71387097,
          116.73612903,  116.7583871 ,  116.78064516,  116.80290323,
          116.82516129,  116.84741935,  116.86967742,  116.89193548,
          116.91419355,  116.93645161,  116.95870968,  116.98096774,
          117.00322581,  117.02548387,  117.04774194,  117.07      ,
          117.11357143,  117.15714286,  117.20071429,  117.24428571,
          117.28785714,  117.33142857,  117.375     ,  117.41857143,
          117.46214286,  117.50571429,  117.54928571,  117.59285714,
          117.63642857,  117.68      ,  117.72357143,  117.76714286,
          117.81071429,  117.85428571,  117.89785714,  117.94142857,
          117.985     ,  118.02857143,  118.07214286,  118.11571429,
    

In [164]:
returns = previous.return_rates()

In [165]:
returns.take(1)

[(u'FRED_00XHOUEU27M086NEST',
  array([  1.91253347e-04,   1.91216776e-04,   1.91180219e-04,
           1.91143676e-04,   1.91107147e-04,   1.91070632e-04,
           1.91034131e-04,   1.90997644e-04,   1.90961171e-04,
           1.90924712e-04,   1.90888267e-04,   1.90851835e-04,
           1.90815418e-04,   1.90779014e-04,   1.90742625e-04,
           1.90706249e-04,   1.90669887e-04,   1.90633539e-04,
           1.90597205e-04,   1.90560884e-04,   1.90524578e-04,
           1.90488285e-04,   1.90452006e-04,   1.90415741e-04,
           1.90379490e-04,   1.90343252e-04,   1.90307029e-04,
           1.90270819e-04,   1.90234623e-04,   1.90198440e-04,
           1.90162272e-04,   3.72182699e-04,   3.72044231e-04,
           3.71905865e-04,   3.71767603e-04,   3.71629443e-04,
           3.71491386e-04,   3.71353431e-04,   3.71215579e-04,
           3.71077829e-04,   3.70940181e-04,   3.70802636e-04,
           3.70665192e-04,   3.70527850e-04,   3.70390610e-04,
           3.70253472e-04

In [166]:
rr = returns.map(lambda ts: (ts[0], np.nan_to_num(ts[1])))

In [167]:
rr.take(1)

[(u'FRED_00XHOUEU27M086NEST',
  array([  1.91253347e-04,   1.91216776e-04,   1.91180219e-04,
           1.91143676e-04,   1.91107147e-04,   1.91070632e-04,
           1.91034131e-04,   1.90997644e-04,   1.90961171e-04,
           1.90924712e-04,   1.90888267e-04,   1.90851835e-04,
           1.90815418e-04,   1.90779014e-04,   1.90742625e-04,
           1.90706249e-04,   1.90669887e-04,   1.90633539e-04,
           1.90597205e-04,   1.90560884e-04,   1.90524578e-04,
           1.90488285e-04,   1.90452006e-04,   1.90415741e-04,
           1.90379490e-04,   1.90343252e-04,   1.90307029e-04,
           1.90270819e-04,   1.90234623e-04,   1.90198440e-04,
           1.90162272e-04,   3.72182699e-04,   3.72044231e-04,
           3.71905865e-04,   3.71767603e-04,   3.71629443e-04,
           3.71491386e-04,   3.71353431e-04,   3.71215579e-04,
           3.71077829e-04,   3.70940181e-04,   3.70802636e-04,
           3.70665192e-04,   3.70527850e-04,   3.70390610e-04,
           3.70253472e-04

In [168]:
def moving_average(a, n=3) :
    ret = np.cumsum(a, dtype=float)
    ret[n:] = ret[n:] - ret[:-n]
    return ret[n - 1:] / n


def shifting(a, delta):
    from scipy.ndimage.interpolation import shift
    return shift(a, delta, cval=np.NaN)

In [169]:
ma = rr.map(lambda row:  (row[0], moving_average(row[1], 10)))

In [170]:
ma.take(2)

[(u'FRED_00XHOUEU27M086NEST',
  array([  1.91088946e-04,   1.91052438e-04,   1.91015944e-04,
           1.90979464e-04,   1.90942997e-04,   1.90906545e-04,
           1.90870107e-04,   1.90833682e-04,   1.90797272e-04,
           1.90760875e-04,   1.90724492e-04,   1.90688123e-04,
           1.90651768e-04,   1.90615427e-04,   1.90579100e-04,
           1.90542786e-04,   1.90506487e-04,   1.90470201e-04,
           1.90433929e-04,   1.90397671e-04,   1.90361426e-04,
           1.90325196e-04,   2.08494637e-04,   2.26653860e-04,
           2.44802872e-04,   2.62941683e-04,   2.81070302e-04,
           2.99188738e-04,   3.17296999e-04,   3.35395095e-04,
           3.53483034e-04,   3.71560825e-04,   3.71422818e-04,
           3.71284914e-04,   3.71147113e-04,   3.71009414e-04,
           3.70871817e-04,   3.70734322e-04,   3.70596928e-04,
           3.70459637e-04,   3.70322447e-04,   3.70185359e-04,
           3.70048373e-04,   3.69911487e-04,   3.69774703e-04,
           3.69638020e-04

In [171]:
sh = rr.map(lambda row:  (row[0] + '_shift', np.nan_to_num(shifting(row[1], 1))))
ma = rr.map(lambda row:  (row[0] + "_mov_avg", moving_average(row[1])))

In [172]:
sh.take(1)

[(u'FRED_00XHOUEU27M086NEST_shift',
  array([  0.00000000e+000,   1.91253347e-004,   1.91216776e-004,
           1.91180219e-004,   1.91143676e-004,   1.91107147e-004,
           1.91070632e-004,   1.91034131e-004,   1.90997644e-004,
           1.90961171e-004,   1.90924712e-004,   1.90888267e-004,
           1.90851835e-004,   1.90815418e-004,   1.90779014e-004,
           1.90742625e-004,   1.90706249e-004,   1.90669887e-004,
           1.90633539e-004,   1.90597205e-004,   1.90560884e-004,
           1.90524578e-004,   1.90488285e-004,   1.90452006e-004,
           1.90415741e-004,   1.90379490e-004,   1.90343252e-004,
           1.90307029e-004,   1.90270819e-004,   1.90234623e-004,
           1.90198440e-004,   1.90162272e-004,   3.72182699e-004,
           3.72044231e-004,   3.71905865e-004,   3.71767603e-004,
           3.71629443e-004,   3.71491386e-004,   3.71353431e-004,
           3.71215579e-004,   3.71077829e-004,   3.70940181e-004,
           3.70802636e-004,   3.70665192

In [173]:
total = sc.union([rr, ma, sh])

In [174]:
from pyspark.mllib.linalg import Vectors
total_df = total.map(lambda x: Row(symbol=x[0], feat=Vectors.dense(x[1]))).map(lambda x: [x[1], x[0]]).toDF(["symbol","feat"])

In [175]:
total_df.show(3)

+--------------------+--------------------+
|              symbol|                feat|
+--------------------+--------------------+
|FRED_00XHOUEU27M0...|[1.91253346933484...|
|   FRED_4BIGEURORECM|[0.0,0.0,0.0,0.0,...|
|FRED_A011RZ2A224NBEA|[0.0,0.0,0.0,0.0,...|
+--------------------+--------------------+
only showing top 3 rows



In [176]:
total_df_clean = total_df

In [177]:
limited =  total_df_clean.map(lambda ts: [ts[0], [float(l) for l in ts[1].toArray()]])

In [178]:
limited.take(2)

[[u'FRED_00XHOUEU27M086NEST',
  [0.00019125334693348428,
   0.00019121677608513465,
   0.00019118021921982198,
   0.00019114367632999674,
   0.00019110714740722123,
   0.0001910706324437239,
   0.00019103413143151116,
   0.00019099764436236732,
   0.00019096117122852085,
   0.0001909247120217561,
   0.00019088826673452353,
   0.00019085183535838546,
   0.00019081541788579237,
   0.00019077901430830657,
   0.00019074262461837854,
   0.00019070624880801468,
   0.00019066988686922137,
   0.00019063353879378298,
   0.00019059720457437201,
   0.0001905608842025508,
   0.00019052457767054776,
   0.0001904882849703693,
   0.0001904520060944659,
   0.0001904157410346219,
   0.0001903794897828437,
   0.00019034325233158178,
   0.00019030702867284255,
   0.0001902708187986324,
   0.00019023462270117975,
   0.00019019844037271305,
   0.00019016227180679302,
   0.00037218269899574885,
   0.0003720442305699123,
   0.00037190586513835555,
   0.00037176760258650354,
   0.00037162944279955923,
   0.00

In [179]:
ts = limited.toDF(["Symbol", "ts"])

In [180]:
ts.show(1)

+--------------------+--------------------+
|              Symbol|                  ts|
+--------------------+--------------------+
|FRED_00XHOUEU27M0...|[1.91253346933484...|
+--------------------+--------------------+
only showing top 1 row



In [181]:
ts_exploded = ts.select([ts.Symbol,explode(ts.ts).alias("values")])

In [182]:
ts_exploded.show(10)

+--------------------+--------------------+
|              Symbol|              values|
+--------------------+--------------------+
|FRED_00XHOUEU27M0...|1.912533469334842...|
|FRED_00XHOUEU27M0...|1.912167760851346...|
|FRED_00XHOUEU27M0...|1.911802192198219...|
|FRED_00XHOUEU27M0...|1.911436763299967...|
|FRED_00XHOUEU27M0...|1.911071474072212...|
|FRED_00XHOUEU27M0...|1.910706324437239E-4|
|FRED_00XHOUEU27M0...|1.910341314315111...|
|FRED_00XHOUEU27M0...|1.909976443623673...|
|FRED_00XHOUEU27M0...|1.909611712285208...|
|FRED_00XHOUEU27M0...|1.909247120217561E-4|
+--------------------+--------------------+
only showing top 10 rows



In [183]:
from pyspark.sql.functions import monotonicallyIncreasingId

# This will return a new DF with all the columns + id
res = ts_exploded.coalesce(1).withColumn("index", monotonicallyIncreasingId())

In [184]:
res.show(10, truncate = False)

+-----------------------+---------------------+-----+
|Symbol                 |values               |index|
+-----------------------+---------------------+-----+
|FRED_00XHOUEU27M086NEST|1.9125334693348428E-4|0    |
|FRED_00XHOUEU27M086NEST|1.9121677608513465E-4|1    |
|FRED_00XHOUEU27M086NEST|1.9118021921982198E-4|2    |
|FRED_00XHOUEU27M086NEST|1.9114367632999674E-4|3    |
|FRED_00XHOUEU27M086NEST|1.9110714740722123E-4|4    |
|FRED_00XHOUEU27M086NEST|1.910706324437239E-4 |5    |
|FRED_00XHOUEU27M086NEST|1.9103413143151116E-4|6    |
|FRED_00XHOUEU27M086NEST|1.9099764436236732E-4|7    |
|FRED_00XHOUEU27M086NEST|1.9096117122852085E-4|8    |
|FRED_00XHOUEU27M086NEST|1.909247120217561E-4 |9    |
+-----------------------+---------------------+-----+
only showing top 10 rows



In [185]:
%%time
from pyspark.sql.window import Window
import pyspark.sql.functions as f
ranked = res.select("Symbol", "index", "values",
     f.rowNumber()
     .over(Window
           .partitionBy("Symbol")
           .orderBy(f.col("index"))
            )
     .alias("rank")
    )

CPU times: user 6.94 ms, sys: 2.89 ms, total: 9.83 ms
Wall time: 18.1 ms


In [186]:
%%time
ranked.show(3)

+-----------+------+--------------------+----+
|     Symbol| index|              values|rank|
+-----------+------+--------------------+----+
|Close_SP500|332919|                 0.0|   1|
|Close_SP500|332920|-0.00609270169656...|   2|
|Close_SP500|332921|-0.00613005026421...|   3|
+-----------+------+--------------------+----+
only showing top 3 rows

CPU times: user 1.6 ms, sys: 1.21 ms, total: 2.81 ms
Wall time: 5.06 s


In [198]:
%%time
pivoted = ranked.groupBy("rank").pivot("Symbol").sum("values")

CPU times: user 2.79 ms, sys: 1.41 ms, total: 4.2 ms
Wall time: 6.48 s


In [188]:
type(pivoted)

pyspark.sql.dataframe.DataFrame

In [189]:
pivoted.persist(StorageLevel.MEMORY_AND_DISK)

DataFrame[rank: int, Close_SP500: double, Close_SP500_mov_avg: double, Close_SP500_shift: double, FRED_00XALCCHM086NEST: double, FRED_00XALCCHM086NEST_mov_avg: double, FRED_00XALCCHM086NEST_shift: double, FRED_00XALCFIM086NEST: double, FRED_00XALCFIM086NEST_mov_avg: double, FRED_00XALCFIM086NEST_shift: double, FRED_00XALCHRM086NEST: double, FRED_00XALCHRM086NEST_mov_avg: double, FRED_00XALCHRM086NEST_shift: double, FRED_00XAPFEEM086NEST: double, FRED_00XAPFEEM086NEST_mov_avg: double, FRED_00XAPFEEM086NEST_shift: double, FRED_00XE00EU27M086NEST: double, FRED_00XE00EU27M086NEST_mov_avg: double, FRED_00XE00EU27M086NEST_shift: double, FRED_00XE00EZ17M086NEST: double, FRED_00XE00EZ17M086NEST_mov_avg: double, FRED_00XE00EZ17M086NEST_shift: double, FRED_00XE00NOM086NEST: double, FRED_00XE00NOM086NEST_mov_avg: double, FRED_00XE00NOM086NEST_shift: double, FRED_00XEFDEU27M086NEST: double, FRED_00XEFDEU27M086NEST_mov_avg: double, FRED_00XEFDEU27M086NEST_shift: double, FRED_00XEFDSEM086NEST: doubl

In [190]:
%%time
from pyspark.sql.functions import col, round
pivoted.select(*(round(col(c), 8).alias(c) for c in pivoted.columns[:5])).show(10, truncate=False)

+----+-----------+-------------------+-----------------+---------------------+
|rank|Close_SP500|Close_SP500_mov_avg|Close_SP500_shift|FRED_00XALCCHM086NEST|
+----+-----------+-------------------+-----------------+---------------------+
|1   |0.0        |-0.00407425        |0.0              |-1.1579E-4           |
|2   |-0.0060927 |-0.0061302         |0.0              |-1.1581E-4           |
|3   |-0.00613005|-0.00706379        |-0.0060927       |-1.1582E-4           |
|4   |-0.00616786|-0.00114383        |-0.00613005      |-1.1583E-4           |
|5   |-0.00889347|0.00687488         |-0.00616786      |-1.1585E-4           |
|6   |0.01162984 |0.0070381          |-0.00889347      |-1.1586E-4           |
|7   |0.01788828 |0.00226219         |0.01162984       |-1.1587E-4           |
|8   |-0.00840381|-0.0046023         |0.01788828       |-1.1589E-4           |
|9   |-0.0026979 |-0.00270521        |-0.00840381      |-1.159E-4            |
|10  |-0.00270519|-0.00266543        |-0.0026979    

## Modeling

In [199]:
%%time
from pyspark.sql.functions import col
udfFlag = udf(lambda value: 1.0 if value > 0 else 0.0, DoubleType())

pivoted_flag = pivoted.withColumn("flag", udfFlag(pivoted.Close_SP500)).drop("Close_SP500")

CPU times: user 167 ms, sys: 115 ms, total: 282 ms
Wall time: 981 ms


In [197]:
ranked.show(3)

+-----------+------+--------------------+----+
|     Symbol| index|              values|rank|
+-----------+------+--------------------+----+
|Close_SP500|332919|                 0.0|   1|
|Close_SP500|332920|-0.00609270169656...|   2|
|Close_SP500|332921|-0.00613005026421...|   3|
+-----------+------+--------------------+----+
only showing top 3 rows



In [74]:
pivoted_flag.columns[-1]

'flag'

In [75]:
features = pivoted_flag.columns[1:-1]

In [77]:

from __future__ import division, print_function

import logging
import os
from time import time

from pyspark import SparkContext
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier
from pyspark.ml.feature import StringIndexer

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel
from pyspark.ml.feature import VectorAssembler


assembler = (VectorAssembler()
    .setInputCols(features)
    .setOutputCol("features"))



label_col_name = 'label'
target="flag"

pivoted_flag = pivoted_flag.dropna()

print("Number of rows to model: {}".format(pivoted.count()))
df = pivoted_flag.withColumn(label_col_name, f.col(target).cast('double'))

label_idx_col_name = "indexed_label"
label_indexer = StringIndexer(inputCol=label_col_name, outputCol=label_idx_col_name)
pipeline = Pipeline(stages=[label_indexer, assembler])
df = pipeline.fit(df).transform(df)



Number of rows to model: 639


In [78]:

model_name = 'FredTimeseries' 
# path of log file
log_path = "./"

# local or cluster
#run_mode = 'cluster'
run_mode = 'local'

# other stuff
sc_setLogLevel = "INFO"  # ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN
seed = 42


#################################################
# ### START
#################################################

# set logging
logger = logging.getLogger(model_name)
hdlr = logging.handlers.RotatingFileHandler(os.path.join(log_path, model_name + ".log"), maxBytes=1000000,
                                            backupCount=5, mode='w')
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr) 
logger.setLevel(logging.INFO)
start = time()

logger.info("Start Spark")

In [79]:
from time import time

from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql import functions as f

def test_performance_cross_validation(dataset, logger, classifier, label_col, n_folds, seed=None):
    """
     Evaluate classifier performance using k-fold cross validation
     https://spark.apache.org/docs/1.6.0/mllib-evaluation-metrics.html
     """
    rand_col = "uid_rand"
    h = 1.0 / n_folds
    df = dataset.select("*", f.rand(seed).alias(rand_col))
#   metrics_dict = {"roc_auc": [],}

    metrics_dict = {"roc_auc": [],  # roc: y=tpr x=fpr
                    "true_pos_rate": [],  # recall = true pos rate 
                    "false_pos_rate": [],
                    "precision": [],
                    "n_true_neg": [],
                    "n_false_neg": [],
                    "n_false_pos": [],
                    "n_true_pos": [], }

    model = None
    for i in range(n_folds):
        if i == 4:
            logger.info("Just keeping model for {} fold".format(i))
        validate_lb = i * h  # lower bound
        validate_ub = (i + 1) * h  # upper bound
        condition = (df[rand_col] >= validate_lb) & (df[rand_col] < validate_ub)
        validation = df.filter(condition)
        train = df.filter(~condition)

        #         # train
        model = classifier.fit(train)

        #         # predict
        prediction = model.transform(validation)

        #         # assess performance metrics
        prediction_and_labels = prediction.map(lambda x: (x['prediction'], x[label_col]))
        #         print(prediction_and_labels)
        metrics = MulticlassMetrics(prediction_and_labels)
        metrics_areas = BinaryClassificationMetrics(prediction_and_labels)  # gets roc and precRecall curves
        metrics_dict['roc_auc'].append(metrics_areas.areaUnderROC)
        #         # a bit slow, have to calc outside loop
        cm = metrics.confusionMatrix().toArray()
        n_true_neg = cm[0, 0]
        n_false_neg = cm[1, 0]
        n_true_pos = cm[1, 1]
        n_false_pos = cm[0, 1]
        #         #
        metrics_dict['n_true_neg'].append(n_true_neg) 
        metrics_dict['n_false_neg'].append(n_false_neg)
        metrics_dict['n_true_pos'].append(n_true_pos)
        metrics_dict['n_false_pos'].append(n_false_pos) 
        metrics_dict['true_pos_rate'].append(n_true_pos / (n_true_pos+n_false_neg))
        metrics_dict['false_pos_rate'].append(n_false_pos / (n_false_pos+n_true_neg))
        metrics_dict['precision'].append(n_true_pos / (n_true_pos+n_false_pos))

    return model, metrics_dict

In [80]:
features_col_name="features"

model_dict = {
    "logistic_regression": LogisticRegression(labelCol=label_idx_col_name, featuresCol=features_col_name,
                                              maxIter=20, standardization=True,
                                              regParam=0.0, elasticNetParam=0.0),
    
    "decision_tree": DecisionTreeClassifier(labelCol=label_idx_col_name, featuresCol=features_col_name,
                                            impurity='gini', maxDepth=5,
                                            minInstancesPerNode=1, minInfoGain=0.0),

    "random_forest": RandomForestClassifier(labelCol=label_idx_col_name, featuresCol=features_col_name,
                                            maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
                                            impurity="gini", numTrees=20, featureSubsetStrategy="auto", seed=seed),
    
    "gradient_boosted_tree": GBTClassifier(labelCol=label_idx_col_name, featuresCol=features_col_name,
                                            maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
                                            lossType="logistic", maxIter=20, stepSize=0.1),
}


#####################################################
# ### Fit and Assess Models Perfomance
#####################################################

n_folds = 5
columns = ["fold_" + str(fold) for fold in range(n_folds)]
frames = []
t_time = time()
model_dict_fitted = {}
fold = 0

# cache df to speed up fitting loop!
df.cache()

for name, estimator in model_dict.items():
    logger.info("Estimator: {}".format(name))
    model, results = test_performance_cross_validation(df, logger=logger, classifier=estimator,
                                                         label_col=label_idx_col_name, n_folds=n_folds, seed=seed)
    # save model for later use 
    model_dict_fitted[name] = model 
    
    logger.info("Estimator performace metrics {}".format(results))
    
    tmp_df = pd.DataFrame.from_dict(results, orient="index")
    tmp_df.index.name = "metric"
    tmp_df.columns = columns            
    tmp_df["estimator"] = name
    frames.append(tmp_df)
    fold += 1
    
    logger.info("The modeling for estimator {} took: {}".format(name, time()-t_time))
    logger.info("------------------------------------------------")

modeling_results = pd.concat(frames)
modeling_results["mean"] = modeling_results[columns].mean(axis=1) 
modeling_results["std"] = modeling_results[columns].std(axis=1) 
modeling_results = modeling_results.reset_index().set_index(["estimator", "metric"])

logger.info("The modelling for all estimators took: {}".format(time()-t_time))
logger.info("Modelling performance results")
logger.info(modeling_results)

modeling_results.to_csv(model_name+'_performance.csv', index=True)



In [81]:
modeling_results

Unnamed: 0_level_0,Unnamed: 1_level_0,fold_0,fold_1,fold_2,fold_3,fold_4,mean,std
estimator,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
decision_tree,n_true_pos,31.0,40.0,40.0,45.0,48.0,40.8,6.457554
decision_tree,n_false_pos,14.0,14.0,18.0,28.0,17.0,18.2,5.761944
decision_tree,precision,0.688889,0.740741,0.689655,0.616438,0.738462,0.694837,0.050544
decision_tree,false_pos_rate,0.181818,0.297872,0.25,0.383562,0.34,0.29065,0.078449
decision_tree,n_false_neg,28.0,22.0,28.0,21.0,15.0,22.8,5.449771
decision_tree,n_true_neg,63.0,33.0,54.0,45.0,33.0,45.6,13.145341
decision_tree,true_pos_rate,0.525424,0.645161,0.588235,0.681818,0.761905,0.640509,0.090059
decision_tree,roc_auc,0.671803,0.673644,0.669118,0.649128,0.710952,0.674929,0.022408
logistic_regression,n_true_pos,35.0,33.0,43.0,49.0,43.0,40.6,6.542171
logistic_regression,n_false_pos,35.0,10.0,37.0,32.0,20.0,26.8,11.476062


In [None]:
sc.version