In [1]:
from datetime import datetime

from pyspark import SparkContext, SQLContext
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, TimestampType, DoubleType, StringType

from sparkts.datetimeindex import uniform, BusinessDayFrequency
from sparkts.timeseriesrdd import time_series_rdd_from_observations

In [242]:
import sparkts.datetimeindex as dt

In [None]:
dt.DayFrequency

In [3]:
def lineToRow(line):
    """Parse one tab-separated ticker record into (timestamp, symbol, price).

    The line must contain exactly six fields: year, month, day, symbol,
    volume and price. The volume field is read but not returned.
    """
    fields = line.split("\t")
    year, month, day, symbol, _volume, price = fields
    # Build the datetime from integer parts (also works on Python 2.x)
    timestamp = datetime(int(year), int(month), int(day))
    return (timestamp, symbol, float(price))

def loadObservations(sparkContext, sqlContext, path):
    """Read the tab-separated file at `path` into a DataFrame of observations.

    Every line is parsed with lineToRow; the resulting DataFrame has the
    nullable columns timestamp (TimestampType), symbol (StringType) and
    price (DoubleType).
    """
    columns = [
        ('timestamp', TimestampType()),
        ('symbol', StringType()),
        ('price', DoubleType()),
    ]
    schema = StructType(
        [StructField(name, dataType, nullable=True) for name, dataType in columns]
    )
    parsedRows = sparkContext.textFile(path).map(lineToRow)
    return sqlContext.createDataFrame(parsedRows, schema)

In [3]:
!wget https://raw.githubusercontent.com/sryza/spark-ts-examples/master/data/ticker.tsv

--2016-10-06 17:14:18--  https://raw.githubusercontent.com/sryza/spark-ts-examples/master/data/ticker.tsv
Resolving raw.githubusercontent.com... 151.101.60.133
Connecting to raw.githubusercontent.com|151.101.60.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 105146 (103K) [text/plain]
Saving to: 'ticker.tsv.1'


2016-10-06 17:14:19 (719 KB/s) - 'ticker.tsv.1' saved [105146/105146]



In [4]:
tickerObs = loadObservations(sc, sqlContext, "/Users/guillermobreto/Downloads/spark-timeseries/DOCS_REPO/spark-timeseries/ticker.tsv")

In [24]:
tickerObs.select("timestamp").take(3)

[Row(timestamp=datetime.datetime(2015, 8, 14, 0, 0)),
 Row(timestamp=datetime.datetime(2015, 9, 14, 0, 0)),
 Row(timestamp=datetime.datetime(2015, 9, 18, 0, 0))]

In [252]:
tickerObs.show(3, truncate=False)

+---------------------+------+------+
|timestamp            |symbol|price |
+---------------------+------+------+
|2015-08-14 00:00:00.0|ADP   |82.99 |
|2015-09-14 00:00:00.0|NKE   |111.78|
|2015-09-18 00:00:00.0|DO    |20.18 |
+---------------------+------+------+
only showing top 3 rows



In [29]:
tickerObs.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- symbol: string (nullable = true)
 |-- price: double (nullable = true)



In [26]:

# Create a business-day DateTimeIndex running from 2015-08-03 to 2015-09-22 (offset -04:00)
freq = BusinessDayFrequency(1, 1, sc)
dtIndex = uniform(start='2015-08-03T00:00-04:00', end='2015-09-22T00:00-04:00', freq=freq, sc=sc)



In [27]:
tickerTsrdd = time_series_rdd_from_observations(dtIndex, tickerObs, "timestamp", "symbol", "price")


In [28]:
tickerTsrdd.take(2)

[(u'AAL',
  array([ 41.71,  42.74,  42.96,  42.18,  41.49,  41.68,  42.7 ,  42.52,
          42.67,  42.88,  43.95,  43.63,  43.53,  41.98,  39.75,  37.62,
          37.5 ,  38.85,  39.03,  38.62,  38.98,  39.2 ,  41.51,  40.89,
          40.6 ,    nan,  40.8 ,  41.  ,  41.21,  42.15,  42.29,  42.64,
          43.06,  43.99,  43.49,  43.24,  41.19])),
 (u'AAPL',
  array([ 118.44  ,  114.64  ,  115.4   ,  115.13  ,  115.52  ,  119.6901,
          113.5499,  115.24  ,  115.15  ,  116.    ,  117.1601,  116.5   ,
          115.01  ,  112.65  ,  105.76  ,  103.155 ,  103.74  ,  109.625 ,
          112.92  ,  113.29  ,  112.76  ,  107.72  ,  112.34  ,  110.37  ,
          109.27  ,       nan,  112.21  ,  110.15  ,  112.57  ,  114.017 ,
          115.3   ,  116.28  ,  116.35  ,  113.92  ,  113.45  ,  115.23  ,
          113.432 ]))]

In [11]:

# Count the number of series (number of symbols)
print(tickerTsrdd.count())

# Impute missing values using linear interpolation
filled = tickerTsrdd.fill("linear")

# Compute return rates
returnRates = filled.return_rates()

104


In [12]:
filled.take(2)

[(u'AAL',
  array([ 41.71,  42.74,  42.96,  42.18,  41.49,  41.68,  42.7 ,  42.52,
          42.67,  42.88,  43.95,  43.63,  43.53,  41.98,  39.75,  37.62,
          37.5 ,  38.85,  39.03,  38.62,  38.98,  39.2 ,  41.51,  40.89,
          40.6 ,  40.7 ,  40.8 ,  41.  ,  41.21,  42.15,  42.29,  42.64,
          43.06,  43.99,  43.49,  43.24,  41.19])),
 (u'AAPL',
  array([ 118.44  ,  114.64  ,  115.4   ,  115.13  ,  115.52  ,  119.6901,
          113.5499,  115.24  ,  115.15  ,  116.    ,  117.1601,  116.5   ,
          115.01  ,  112.65  ,  105.76  ,  103.155 ,  103.74  ,  109.625 ,
          112.92  ,  113.29  ,  112.76  ,  107.72  ,  112.34  ,  110.37  ,
          109.27  ,  110.74  ,  112.21  ,  110.15  ,  112.57  ,  114.017 ,
          115.3   ,  116.28  ,  116.35  ,  113.92  ,  113.45  ,  115.23  ,
          113.432 ]))]

In [12]:
# Durbin-Watson test for serial correlation, ported from TimeSeriesStatisticalTests.scala
def dwtest(residuals):
    """Return the Durbin-Watson statistic of a residual sequence.

    The statistic is the sum of squared successive differences divided by
    the sum of squared residuals. `residuals` must be non-empty and support
    len() and integer indexing.
    """
    squaresTotal = residuals[0] * residuals[0]
    diffSquaresTotal = 0.0
    for idx in range(1, len(residuals)):
        current = residuals[idx]
        step = current - residuals[idx - 1]
        squaresTotal += current * current
        diffSquaresTotal += step * step
    return diffSquaresTotal / squaresTotal

# Compute Durbin-Watson stats for each series
# Swap ticker symbol and stats so min and max compare the statistic value, not the
# ticker names.
dwStats = returnRates.map_series(lambda row: (row[0], [dwtest(row[1])])).map(lambda x: (x[1], x[0]))

print(dwStats.min())
print(dwStats.max())

([0.99930539174187916], u'NFLX')
([2.3701164736953166], u'DISCK')


In [40]:
from sparkts.models import ARIMA

In [44]:
type(tickerTsrdd)

sparkts.timeseriesrdd.TimeSeriesRDD

In [42]:
# NOTE(review): tickerTsrdd still contains NaN gaps (see its take() output earlier);
# the interpolated `filled` RDD may be the intended input -- confirm.
# NOTE(review): elements of this RDD print as (symbol, array) pairs, so `ts` is
# probably the whole pair rather than the value array -- confirm.
# NOTE(review): no model-order arguments are passed to ARIMA.fit_model -- check the
# sparkts signature before relying on this.
# Presumably lazy like other RDD transformations: nothing is fitted until an action runs.
arima_model = tickerTsrdd.map(lambda ts: ARIMA.fit_model(ts))

In [45]:
pwd

u'/Users/guillermobreto/Downloads/spark-timeseries/python'

In [46]:
mv /Users/guillermobreto/Downloads/FRED-datasets-codes.csv .

In [47]:
!head FRED-datasets-codes.csv

FRED/CGBD1819M,"Unemployment Rate - College Graduates - Bachelor's Degree, 18 to 19 years, Men"
FRED/DOTSRG3Q086SBEA,Personal consumption expenditures: Other services (chain-type price index)
FRED/Y804RL1Q225SBEA,Real imports of services: Transport
FRED/FYFSGDA188S,Federal Surplus or Deficit [-] as Percent of Gross Domestic Product
FRED/SMS16000001000000001,All Employees: Mining and Logging in Idaho
FRED/CGRA1819,"Unemployment Rate - College Graduates - Bachelor's Degree and Higher, 18 to 19 years"
FRED/CGBDUM1819,"Unemployment Level - College Graduates - Bachelor's Degree, 18 to 19 years, Men"
FRED/SMU26198046562200001A,"All Employees: Health Care: Hospitals in Detroit-Livonia-Dearborn, MI (MD)"
FRED/SMU29000004322000001,All Employees: Utilities in Missouri
FRED/PORT933MFGN,"Manufacturing Employment in Portsmouth, NH-ME (NECTA)"


In [8]:
import graphlab as gl

In [9]:
metadata = gl.SFrame.read_csv("FRED-datasets-codes.csv", header=False)

[INFO] graphlab.cython.cy_server: GraphLab Create v2.0.1 started. Logging: /tmp/graphlab_server_1476120295.log


This non-commercial license of GraphLab Create for academic use is assigned to kevglynn@gmail.com and will expire on May 03, 2017.


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [10]:
metadata["dataset_code"] = metadata["X1"].apply(lambda code: code.split("/")[1])

In [11]:
metadata.head(5)

X1,X2,dataset_code
FRED/CGBD1819M,Unemployment Rate - College Graduates - ...,CGBD1819M
FRED/DOTSRG3Q086SBEA,Personal consumption expenditures: Other ...,DOTSRG3Q086SBEA
FRED/Y804RL1Q225SBEA,Real imports of services: Transport ...,Y804RL1Q225SBEA
FRED/FYFSGDA188S,Federal Surplus or Deficit [-] as Percen ...,FYFSGDA188S
FRED/SMS16000001000000001,All Employees: Mining and Logging in Idaho ...,SMS16000001000000001


In [12]:
metadata[metadata["dataset_code"]=="EXPEF54171ALLEST"]

X1,X2,dataset_code
FRED/EXPEF54171ALLEST,Total Expense for Research and Development ...,EXPEF54171ALLEST


# Get the data

In [58]:
mkdir fred_data

In [59]:
cd fred_data/

/Users/guillermobreto/Downloads/spark-timeseries/python/fred_data


In [10]:
import subprocess
from subprocess import Popen

# Paged Quandl listing of FRED datasets; the page number is appended to this URL.
url = 'https://www.quandl.com/api/v3/datasets.csv?database_code=FRED&per_page=100&sort_by=id&page='
# Directory passed to wget via -P as the download prefix.
location = '/Users/guillermobreto/Downloads/spark-timeseries/python/fred_data'

# Fetch listing pages 500-999, one wget invocation per page.
for ts in range(500,1000):
    args = ['wget', '-r', '-l', '1', '-p', '-P', location, url + str(ts)]
    # Pipe stderr as well: with only stdout piped, communicate() returns None for stderr.
    p = Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    # Report failed pages instead of ignoring them; continue with the next page.
    if p.returncode != 0:
        print("wget failed for page %d (exit status %d)" % (ts, p.returncode))

In [5]:
!open ../fred_data/

The file /Users/guillermobreto/Downloads/spark-timeseries/fred_data does not exist.


In [1]:
pwd

u'/Users/guillermobreto/Downloads/spark-timeseries/python'

In [13]:
df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('./fred_data/www.quandl.com/api/v3/')

In [14]:
df.show(10, truncate=False)

+--------+-------------------+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+---------------------+---------------------+-----------------+---------+-----------+-------+-----------+
|id      |dataset_code       |database_code|name                                                                                                                                                                                                     |description                                                                                                                                                                      |refreshed_at           

In [8]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- dataset_code: string (nullable = true)
 |-- database_code: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- refreshed_at: string (nullable = true)
 |-- newest_available_date: string (nullable = true)
 |-- oldest_available_date: string (nullable = true)
 |-- column_names: string (nullable = true)
 |-- frequency: string (nullable = true)
 |-- type: string (nullable = true)
 |-- premium: boolean (nullable = true)
 |-- database_id: integer (nullable = true)



In [9]:
df.select("dataset_code").distinct().count()

10500

In [16]:
df.select(["newest_available_date","column_names"]).show()

+---------------------+-----------------+
|newest_available_date|     column_names|
+---------------------+-----------------+
|           2014-01-01|["DATE", "VALUE"]|
|           2014-01-01|["DATE", "VALUE"]|
|           2014-01-01|["DATE", "VALUE"]|
|           2014-01-01|["DATE", "VALUE"]|
|           2014-01-01|["DATE", "VALUE"]|
|           2014-01-01|["DATE", "VALUE"]|
|           2014-01-01|["DATE", "VALUE"]|
|           2014-01-01|["DATE", "VALUE"]|
|           2007-01-01|["DATE", "VALUE"]|
|           2014-01-01|["DATE", "VALUE"]|
|           2014-01-01|["DATE", "VALUE"]|
|           2014-01-01|["DATE", "VALUE"]|
|           2014-01-01|["DATE", "VALUE"]|
|           2014-01-01|["DATE", "VALUE"]|
|           2014-01-01|["DATE", "VALUE"]|
|           2014-01-01|["DATE", "VALUE"]|
|           2014-01-01|["DATE", "VALUE"]|
|           2014-01-01|["DATE", "VALUE"]|
|           2014-01-01|["DATE", "VALUE"]|
|           2014-01-01|["DATE", "VALUE"]|
+---------------------+-----------

In [17]:
!head /Users/guillermobreto/Downloads/fred_timeseries_project/data/FRED-datasets-codes.csv

FRED/CGBD1819M,"Unemployment Rate - College Graduates - Bachelor's Degree, 18 to 19 years, Men"
FRED/DOTSRG3Q086SBEA,Personal consumption expenditures: Other services (chain-type price index)
FRED/Y804RL1Q225SBEA,Real imports of services: Transport
FRED/FYFSGDA188S,Federal Surplus or Deficit [-] as Percent of Gross Domestic Product
FRED/SMS16000001000000001,All Employees: Mining and Logging in Idaho
FRED/CGRA1819,"Unemployment Rate - College Graduates - Bachelor's Degree and Higher, 18 to 19 years"
FRED/CGBDUM1819,"Unemployment Level - College Graduates - Bachelor's Degree, 18 to 19 years, Men"
FRED/SMU26198046562200001A,"All Employees: Health Care: Hospitals in Detroit-Livonia-Dearborn, MI (MD)"
FRED/SMU29000004322000001,All Employees: Utilities in Missouri
FRED/PORT933MFGN,"Manufacturing Employment in Portsmouth, NH-ME (NECTA)"


In [27]:
df = sqlContext.read.format('com.databricks.spark.csv').\
options(header='true', inferschema='true').\
load('/Users/guillermobreto/Downloads/fred_timeseries_project/data/fred_codes/FRED_DOTSRG3Q086SBEA.csv')

In [28]:
df.show(4)

+----------+-----+
|      DATE|VALUE|
+----------+-----+
|1947-01-01|8.341|
|1947-04-01|8.392|
|1947-07-01|8.619|
|1947-10-01|8.737|
+----------+-----+
only showing top 4 rows



In [29]:
import pyspark.sql.functions as f

In [31]:
df = df.withColumn("name", f.lit("DOTSRG3Q086SBEA"))

In [32]:
df.show(3)

+----------+-----+---------------+
|      DATE|VALUE|           name|
+----------+-----+---------------+
|1947-01-01|8.341|DOTSRG3Q086SBEA|
|1947-04-01|8.392|DOTSRG3Q086SBEA|
|1947-07-01|8.619|DOTSRG3Q086SBEA|
+----------+-----+---------------+
only showing top 3 rows



In [261]:
rdd = sc.wholeTextFiles("/Users/guillermobreto/Downloads/fred_timeseries_project/data/fred_codes/")
print(rdd.count())
from pyspark.sql.functions import explode
import pyspark.sql.functions as f
from pyspark.sql.functions import udf
from datetime import datetime

from pyspark import SparkContext, SQLContext
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, TimestampType, DoubleType, StringType

from sparkts.datetimeindex import uniform, BusinessDayFrequency
from sparkts.timeseriesrdd import time_series_rdd_from_observations

import numpy as np
import pandas as pd


def symbolFromPath(path):
    """Return the series code for a file path such as ".../FRED_<CODE>.csv".

    Only the file name is inspected, so underscores in directory names (for
    example "fred_timeseries_project") cannot shift the result. ".csv" is
    removed as a suffix; str.strip(".csv") would instead strip any of the
    characters ".", "c", "s", "v" from both ends of the name.
    """
    fileName = path.split("/")[-1]
    if fileName.endswith(".csv"):
        fileName = fileName[:-len(".csv")]
    # Drop the prefix up to the first underscore ("FRED_"), if there is one
    return fileName.split("_", 1)[-1]


# One row per file: (symbol, non-empty data lines with the CSV header line dropped)
rdd_df = rdd.map(
    lambda r: (symbolFromPath(r[0]), [ln for ln in r[1].split("\n")[1:] if ln])
).toDF(["symbol", "v"])



def grabValue(tup):
    """Return the VALUE field of a "DATE,VALUE" record as a float.

    `tup` is the raw record string (despite the name, not a tuple). NaN is
    returned when the record is None, has no second field, or the field is
    not numeric. The previous guard, isinstance(tuple(tup), tuple), was true
    for every string, so its NaN branch could never run and malformed
    records raised instead.
    """
    try:
        return float(tup.split(",")[1])
    except (AttributeError, IndexError, ValueError):
        return np.nan

def grabDate(tup):
    """Return the DATE field of a "DATE,VALUE" record as a pandas Timestamp.

    `tup` is the raw record string. None is returned when the record is None
    or its date field cannot be parsed. None is used rather than float NaN
    because this function is registered as a TimestampType UDF. The previous
    guard, isinstance(tuple(tup), tuple), was true for every string, so its
    fallback branch could never run.
    """
    try:
        parsed = pd.to_datetime(tup.split(",")[0])
    except (AttributeError, TypeError, ValueError):
        return None
    # A date field that parses to NaT is reported as missing as well
    return None if pd.isnull(parsed) else parsed
    
def to_tuple(tup):
    """Split a comma-separated record string into a list of its fields.

    Records of five characters or fewer are treated as unusable and a fixed
    two-element placeholder list is returned instead.
    """
    if len(tup) <= 5:
        return ["This is not working", "for Now"]
    return tup.split(",")

 
# ArrayType is not among the imports at the top of this cell; import it here so
# the registrations below do not depend on an earlier cell having done so.
from pyspark.sql.types import ArrayType

# Wrap the parsing helpers as Spark SQL UDFs with explicit return types.
udfgrabValue = udf(grabValue, DoubleType())
udfgrabDate = udf(grabDate, TimestampType())
udfto_tuple = udf(to_tuple, ArrayType(StringType()))


# One row per observation: explode the per-file record list, then parse each
# "DATE,VALUE" string into price and timestamp columns.
rdd_df_exp = (
    rdd_df
    .select([rdd_df.symbol, explode(rdd_df.v).alias("DATA-VALUE")])
    .withColumn("price", udfgrabValue("DATA-VALUE"))
    .withColumn("timestamp", udfgrabDate("DATA-VALUE"))
)
df = rdd_df_exp.select(["timestamp", "symbol", "price"])

# Keep observations strictly between the two bounds (both ends exclusive)
dates = ("2003-01-01",  "2016-09-01")
date_from, date_to = [f.to_date(f.lit(s)).cast(TimestampType()) for s in dates]
df_filtered = df.filter((df.timestamp > date_from) & (df.timestamp < date_to))

# Align the filtered observations to dtIndex, one series per symbol
tickerTsrdd = time_series_rdd_from_observations(
    dtIndex, df_filtered, "timestamp", "symbol", "price"
)



40809


In [80]:
ls

DW_test-Copy1.ipynb      [34mbuild[m[m/                   [34msparkts[m[m/
DW_test.ipynb            [34mdist[m[m/                    [34msparkts.egg-info[m[m/
FRED-datasets-codes.csv  [34mfred_data[m[m/               ticker.tsv
MANIFEST.in              setup.py                 ticker.tsv.1
Makefile                 [34msource[m[m/


In [None]:
rdd.count()

In [5]:
rdd =  sc.wholeTextFiles("/Users/guillermobreto/Desktop/sample_2/")
rdd.count()

In [6]:
rdd.count()

2

In [7]:
rdd.toDF().show()

+--------------------+--------------------+
|                  _1|                  _2|
+--------------------+--------------------+
|file:/Users/guill...|DATE,VALUE
2005-0...|
|file:/Users/guill...|DATE,VALUE
1996-0...|
+--------------------+--------------------+



In [55]:
rdd.toDF(["key","val"]).select("val").show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [56]:
ele = rdd.toDF(["key","val"])

In [59]:
ele.show(2)

+--------------------+--------------------+
|                 key|                 val|
+--------------------+--------------------+
|file:/Users/guill...|DATE,VALUE
2005-0...|
|file:/Users/guill...|DATE,VALUE
1996-0...|
+--------------------+--------------------+



In [8]:
from pyspark.sql.functions import explode

In [9]:
import pyspark.sql.functions as f

In [10]:
def symbolFromPath(path):
    """Return the series code for a file path such as ".../FRED_<CODE>.csv".

    Only the file name is inspected, so the result does not depend on how
    many underscores the directory part of the path contains. ".csv" is
    removed as a suffix; str.strip(".csv") would instead strip any of the
    characters ".", "c", "s", "v" from both ends of the name.
    """
    fileName = path.split("/")[-1]
    if fileName.endswith(".csv"):
        fileName = fileName[:-len(".csv")]
    # Drop the prefix up to the first underscore ("FRED_"), if there is one
    return fileName.split("_", 1)[-1]


# One row per file: (symbol, non-empty data lines with the CSV header line dropped)
rdd_df = rdd.map(
    lambda r: (symbolFromPath(r[0]), [ln for ln in r[1].split("\n")[1:] if ln])
).toDF(["symbol", "v"])

In [11]:
rdd_df_exp =  rdd_df.select([rdd_df.symbol,explode(rdd_df.v).alias("DATA-VALUE")])


In [12]:
rdd_df_exp.show(2)

+----------------+---------------+
|          symbol|     DATA-VALUE|
+----------------+---------------+
|00XALCCHM086NEST|2005-01-01,97.0|
|00XALCCHM086NEST|2005-02-01,97.3|
+----------------+---------------+
only showing top 2 rows



In [13]:
rdd_df_exp.select("symbol").distinct().count()

2

In [169]:
import pandas as pd

In [19]:
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType, TimestampType, ArrayType

import numpy as np
import pandas as pd
def grabValue(tup):
    """Return the VALUE field of a "DATE,VALUE" record as a float.

    `tup` is the raw record string (despite the name, not a tuple). NaN is
    returned when the record is None, has no second field, or the field is
    not numeric. The previous guard, isinstance(tuple(tup), tuple), was true
    for every string, so its NaN branch could never run and malformed
    records raised instead.
    """
    try:
        return float(tup.split(",")[1])
    except (AttributeError, IndexError, ValueError):
        return np.nan

def grabDate(tup):
    """Return the DATE field of a "DATE,VALUE" record as a pandas Timestamp.

    `tup` is the raw record string. None is returned when the record is None
    or its date field cannot be parsed. None is used rather than float NaN
    because this function is registered as a TimestampType UDF. The previous
    guard, isinstance(tuple(tup), tuple), was true for every string, so its
    fallback branch could never run.
    """
    try:
        parsed = pd.to_datetime(tup.split(",")[0])
    except (AttributeError, TypeError, ValueError):
        return None
    # A date field that parses to NaT is reported as missing as well
    return None if pd.isnull(parsed) else parsed
    
def to_tuple(tup):
    """Split a comma-separated record string into a list of its fields.

    Records of five characters or fewer are treated as unusable and a fixed
    two-element placeholder list is returned instead.
    """
    if len(tup) <= 5:
        return ["This is not working", "for Now"]
    return tup.split(",")

 
udfgrabValue=udf(grabValue, DoubleType())
udfgrabDate=udf(grabDate, TimestampType())
udfto_tuple=udf(to_tuple, ArrayType(StringType()))


rdd_df_exp =  rdd_df.select([rdd_df.symbol,explode(rdd_df.v).alias("DATA-VALUE")])



In [20]:
rdd_df_exp.printSchema()

root
 |-- symbol: string (nullable = true)
 |-- DATA-VALUE: string (nullable = true)



In [21]:
rdd_df_exp = rdd_df_exp.withColumn("price", udfgrabValue("DATA-VALUE"))
rdd_df_exp = rdd_df_exp.withColumn("timestamp", udfgrabDate("DATA-VALUE"))



In [22]:
rdd_df_exp.show(3)

+----------------+---------------+-----+--------------------+
|          symbol|     DATA-VALUE|price|           timestamp|
+----------------+---------------+-----+--------------------+
|00XALCCHM086NEST|2005-01-01,97.0| 97.0|2005-01-01 00:00:...|
|00XALCCHM086NEST|2005-02-01,97.3| 97.3|2005-02-01 00:00:...|
|00XALCCHM086NEST|2005-03-01,97.4| 97.4|2005-03-01 00:00:...|
+----------------+---------------+-----+--------------------+
only showing top 3 rows



In [23]:
rdd_df_exp.take(800)

[Row(symbol=u'00XALCCHM086NEST', DATA-VALUE=u'2005-01-01,97.0', price=97.0, timestamp=datetime.datetime(2005, 1, 1, 0, 0)),
 Row(symbol=u'00XALCCHM086NEST', DATA-VALUE=u'2005-02-01,97.3', price=97.3, timestamp=datetime.datetime(2005, 2, 1, 0, 0)),
 Row(symbol=u'00XALCCHM086NEST', DATA-VALUE=u'2005-03-01,97.4', price=97.4, timestamp=datetime.datetime(2005, 3, 1, 0, 0)),
 Row(symbol=u'00XALCCHM086NEST', DATA-VALUE=u'2005-04-01,98.2', price=98.2, timestamp=datetime.datetime(2005, 4, 1, 0, 0)),
 Row(symbol=u'00XALCCHM086NEST', DATA-VALUE=u'2005-05-01,98.2', price=98.2, timestamp=datetime.datetime(2005, 5, 1, 0, 0)),
 Row(symbol=u'00XALCCHM086NEST', DATA-VALUE=u'2005-06-01,97.9', price=97.9, timestamp=datetime.datetime(2005, 6, 1, 0, 0)),
 Row(symbol=u'00XALCCHM086NEST', DATA-VALUE=u'2005-07-01,97.3', price=97.3, timestamp=datetime.datetime(2005, 7, 1, 0, 0)),
 Row(symbol=u'00XALCCHM086NEST', DATA-VALUE=u'2005-08-01,97.4', price=97.4, timestamp=datetime.datetime(2005, 8, 1, 0, 0)),
 Row(sym

In [24]:
x = rdd_df_exp.select(["timestamp", "symbol", "price"])

In [26]:
x.show(2)

+--------------------+----------------+-----+
|           timestamp|          symbol|price|
+--------------------+----------------+-----+
|2005-01-01 00:00:...|00XALCCHM086NEST| 97.0|
|2005-02-01 00:00:...|00XALCCHM086NEST| 97.3|
+--------------------+----------------+-----+
only showing top 2 rows



In [97]:
dates = ("2005-01-01",  "2005-05-01")
date_from, date_to = [f.to_date(f.lit(s)).cast(TimestampType()) for s in dates]
df_filtered = x.where((x.timestamp > date_from) & (x.timestamp < date_to))

In [98]:
df_filtered = x.where((x.timestamp > date_from) & (x.timestamp < date_to))

In [105]:
df_filtered.groupBy(["timestamp", "symbol"]).agg({"price": "avg"}).show(10, truncate=False)

+---------------------+----------------+----------+
|timestamp            |symbol          |avg(price)|
+---------------------+----------------+----------+
|2005-02-01 00:00:00.0|00XALCCHM086NEST|97.3      |
|2005-03-01 00:00:00.0|00XALCCHM086NEST|97.4      |
|2005-02-01 00:00:00.0|00XALCFIM086NEST|82.61     |
|2005-04-01 00:00:00.0|00XALCFIM086NEST|83.22     |
|2005-04-01 00:00:00.0|00XALCCHM086NEST|98.2      |
|2005-03-01 00:00:00.0|00XALCFIM086NEST|82.97     |
+---------------------+----------------+----------+



In [227]:
df_filtered.show(10, truncate=False)

+---------------------+----------------+-----+
|timestamp            |symbol          |price|
+---------------------+----------------+-----+
|2005-02-01 00:00:00.0|00XALCCHM086NEST|97.3 |
|2005-03-01 00:00:00.0|00XALCCHM086NEST|97.4 |
|2005-04-01 00:00:00.0|00XALCCHM086NEST|98.2 |
|2005-02-01 00:00:00.0|00XALCFIM086NEST|82.61|
|2005-03-01 00:00:00.0|00XALCFIM086NEST|82.97|
|2005-04-01 00:00:00.0|00XALCFIM086NEST|83.22|
+---------------------+----------------+-----+



In [231]:
df_filtered.select("timestamp").take(3)

[Row(timestamp=datetime.datetime(2005, 2, 1, 0, 0)),
 Row(timestamp=datetime.datetime(2005, 3, 1, 0, 0)),
 Row(timestamp=datetime.datetime(2005, 4, 1, 0, 0))]

In [75]:

from datetime import datetime

from pyspark import SparkContext, SQLContext
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, TimestampType, DoubleType, StringType

from sparkts.datetimeindex import uniform, BusinessDayFrequency
from sparkts.timeseriesrdd import time_series_rdd_from_observations



In [243]:
from sparkts.datetimeindex import DayFrequency


In [245]:
freq = DayFrequency(1,sc)

In [258]:
freq = BusinessDayFrequency(1, 1, sc)
dtIndex = uniform(start='2005-02-01T00:00-05:00', end='2005-06-01T00:00-05:00', freq=freq, sc=sc)

In [254]:
BusinessDayFrequency?

In [239]:
x.show(3)

+--------------------+----------------+-----+
|           timestamp|          symbol|price|
+--------------------+----------------+-----+
|2005-01-01 00:00:...|00XALCCHM086NEST| 97.0|
|2005-02-01 00:00:...|00XALCCHM086NEST| 97.3|
|2005-03-01 00:00:...|00XALCCHM086NEST| 97.4|
+--------------------+----------------+-----+
only showing top 3 rows



In [259]:
tickerTsrdd = time_series_rdd_from_observations(dtIndex, df_filtered, "timestamp", "symbol", "price")

In [260]:
tickerTsrdd.take(2)

[(u'00XALCCHM086NEST',
  array([ 97.3,   nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,
           nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,
           nan,   nan,  97.4,   nan,   nan,   nan,   nan,   nan,   nan,
           nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,
           nan,   nan,   nan,   nan,   nan,   nan,   nan,  98.2,   nan,
           nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,
           nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,
           nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,
           nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,
           nan,   nan,   nan,   nan,   nan,   nan])),
 (u'00XALCFIM086NEST',
  array([ 82.61,    nan,    nan,    nan,    nan,    nan,    nan,    nan,
            nan,    nan,    nan,    nan,    nan,    nan,    nan,    nan,
            nan,    nan,    nan,    nan,  82.97,    nan,    nan,    nan,
            nan,    nan,    nan, 

In [92]:
%matplotlib inline 
import matplotlib.pyplot as plt

In [265]:
rdd = sc.wholeTextFiles("/Users/guillermobreto/Downloads/fred_timeseries_project/data/fred_codes/")
rdd.count()
from pyspark.sql.functions import explode
import pyspark.sql.functions as f
def csvStem(path):
    """Return the file name of `path` without a trailing ".csv".

    The suffix is removed as a suffix; str.strip(".csv") would instead strip
    any of the characters ".", "c", "s", "v" from both ends of the name.
    """
    fileName = path.split("/")[-1]
    return fileName[:-len(".csv")] if fileName.endswith(".csv") else fileName


# One row per file: (symbol, non-empty data lines with the CSV header line dropped)
rdd_df = rdd.map(
    lambda r: (csvStem(r[0]), [ln for ln in r[1].split("\n")[1:] if ln])
).toDF(["symbol", "v"])
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType, TimestampType, ArrayType

import numpy as np
import pandas as pd
def grabValue(tup):
    """Return the VALUE field of a "DATE,VALUE" record as a float.

    `tup` is the raw record string (despite the name, not a tuple). NaN is
    returned when the record is None, has no second field, or the field is
    not numeric. The previous guard, isinstance(tuple(tup), tuple), was true
    for every string, so its NaN branch could never run and malformed
    records raised instead.
    """
    try:
        return float(tup.split(",")[1])
    except (AttributeError, IndexError, ValueError):
        return np.nan

def grabDate(tup):
    """Return the DATE field of a "DATE,VALUE" record as a pandas Timestamp.

    `tup` is the raw record string. None is returned when the record is None
    or its date field cannot be parsed. None is used rather than float NaN
    because this function is registered as a TimestampType UDF. The previous
    guard, isinstance(tuple(tup), tuple), was true for every string, so its
    fallback branch could never run.
    """
    try:
        parsed = pd.to_datetime(tup.split(",")[0])
    except (AttributeError, TypeError, ValueError):
        return None
    # A date field that parses to NaT is reported as missing as well
    return None if pd.isnull(parsed) else parsed
    
def to_tuple(tup):
    """Split a comma-separated record string into a list of its fields.

    Records of five characters or fewer are treated as unusable and a fixed
    two-element placeholder list is returned instead.
    """
    if len(tup) <= 5:
        return ["This is not working", "for Now"]
    return tup.split(",")

 
udfgrabValue=udf(grabValue, DoubleType())
udfgrabDate=udf(grabDate, TimestampType())
udfto_tuple=udf(to_tuple, ArrayType(StringType()))


from datetime import datetime

from pyspark import SparkContext, SQLContext
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, TimestampType, DoubleType, StringType

from sparkts.datetimeindex import uniform, BusinessDayFrequency
from sparkts.timeseriesrdd import time_series_rdd_from_observations

# Build the business-day index before it is consumed below; previously it was
# assigned only after tickerTsrdd had been created, so the series were aligned
# to whatever dtIndex an earlier cell had left in the kernel.
freq = BusinessDayFrequency(1, 1, sc)
dtIndex = uniform(start='2000-01-01T00:00-05:00', end='2015-10-10T00:00-05:00', freq=freq, sc=sc)

# One row per observation: explode the per-file record list, then parse each
# "DATE,VALUE" string into price and timestamp columns.
rdd_df_exp =  rdd_df.select([rdd_df.symbol,explode(rdd_df.v).alias("DATA-VALUE")])
rdd_df_exp = rdd_df_exp.withColumn("price", udfgrabValue("DATA-VALUE"))
rdd_df_exp = rdd_df_exp.withColumn("timestamp", udfgrabDate("DATA-VALUE"))
df = rdd_df_exp.select(["timestamp", "symbol", "price"])

# Keep observations strictly between the two bounds (both ends exclusive)
dates = ("2003-01-01",  "2016-09-01")
date_from, date_to = [f.to_date(f.lit(s)).cast(TimestampType()) for s in dates]
df_filtered = df.where((df.timestamp > date_from) & (df.timestamp < date_to))

# Align the filtered observations to dtIndex, one series per symbol
tickerTsrdd = time_series_rdd_from_observations(dtIndex, df_filtered, "timestamp", "symbol", "price")


In [267]:
df.show(3, truncate=False)

+---------------------+---------------------+-----+
|timestamp            |symbol               |price|
+---------------------+---------------------+-----+
|2005-01-01 00:00:00.0|FRED_00XALCCHM086NEST|97.0 |
|2005-02-01 00:00:00.0|FRED_00XALCCHM086NEST|97.3 |
|2005-03-01 00:00:00.0|FRED_00XALCCHM086NEST|97.4 |
+---------------------+---------------------+-----+
only showing top 3 rows



In [269]:
rdd_df.select("symbol").distinct().count()

40809

In [270]:
rdd_df.show(3)

+--------------------+--------------------+
|              symbol|                   v|
+--------------------+--------------------+
|FRED_00XALCCHM086...|[2005-01-01,97.0,...|
|FRED_00XALCFIM086...|[1996-01-01,71.22...|
|FRED_00XALCHRM086...|[2004-12-01,81.32...|
+--------------------+--------------------+
only showing top 3 rows



In [273]:
from pyspark.sql.types import IntegerType

# UDF returning len() of a column value (for the array column `v`, its element count).
# The pasted ">>>" doctest prompts were dropped: they are not valid Python outside IPython.
slen = udf(lambda s: len(s), IntegerType())
# withColumn assigns the column name itself, so no separate alias is needed
new_df = rdd_df.withColumn("lengh", slen(rdd_df.v))

In [274]:
new_df.show(3)

+--------------------+--------------------+-----+
|              symbol|                   v|lengh|
+--------------------+--------------------+-----+
|FRED_00XALCCHM086...|[2005-01-01,97.0,...|  140|
|FRED_00XALCFIM086...|[1996-01-01,71.22...|  248|
|FRED_00XALCHRM086...|[2004-12-01,81.32...|  141|
+--------------------+--------------------+-----+
only showing top 3 rows



In [275]:
rdd_df_exp =  rdd_df.select([rdd_df.symbol,explode(rdd_df.v).alias("DATA-VALUE")])

In [276]:
new_df =rdd_df_exp.withColumn("lengh", (slen(rdd_df_exp["DATA-VALUE"]).alias('slen')))

In [277]:
new_df.show(3)

+--------------------+---------------+-----+
|              symbol|     DATA-VALUE|lengh|
+--------------------+---------------+-----+
|FRED_00XALCCHM086...|2005-01-01,97.0|   15|
|FRED_00XALCCHM086...|2005-02-01,97.3|   15|
|FRED_00XALCCHM086...|2005-03-01,97.4|   15|
+--------------------+---------------+-----+
only showing top 3 rows



In [279]:
new_df.printSchema()

root
 |-- symbol: string (nullable = true)
 |-- DATA-VALUE: string (nullable = true)
 |-- lengh: integer (nullable = true)



In [278]:
from pyspark.sql import functions as F

# Shortest value in the "lengh" column.
# The pasted ">>>" doctest prompts were dropped: they are not valid Python outside IPython.
new_df.agg(F.min(new_df.lengh)).collect()

[Row(min(lengh)=14)]

In [284]:
# Date text of each "DATE,VALUE" record
dateUdf = udf(lambda s: s.split(",")[0], StringType())
# Convert the value text to float so the result matches the declared DoubleType
valueUdf = udf(lambda s: float(s.split(",")[1]), DoubleType())

# Parse the date text into a timestamp column named "Date".
# dateUdf(...) is already a Column, so it is passed to to_date directly.
new_df = rdd_df_exp.withColumn(
    "Date",
    f.to_date(dateUdf(rdd_df_exp["DATA-VALUE"])).cast(TimestampType()),
)

In [285]:
new_df.show(3)

+--------------------+---------------+--------------------+
|              symbol|     DATA-VALUE|                Date|
+--------------------+---------------+--------------------+
|FRED_00XALCCHM086...|2005-01-01,97.0|2005-01-01 00:00:...|
|FRED_00XALCCHM086...|2005-02-01,97.3|2005-02-01 00:00:...|
|FRED_00XALCCHM086...|2005-03-01,97.4|2005-03-01 00:00:...|
+--------------------+---------------+--------------------+
only showing top 3 rows



In [286]:
new_df.select("symbol").distinct().count()

40809

In [291]:
# Field extractors for "DATE,VALUE" records: date text and numeric value
dateUdf = udf(lambda s: s.split(",")[0], StringType())
valueUdf = udf(lambda s: float(s.split(",")[1]), DoubleType())

# Add the parsed timestamp ("Date") and the numeric "price" in one chain
new_df = (
    rdd_df_exp
    .withColumn("Date", f.to_date(f.lit(dateUdf(rdd_df_exp["DATA-VALUE"]))).cast(TimestampType()))
    .withColumn("price", valueUdf("DATA-VALUE"))
)

In [292]:
new_df.show(3)

+--------------------+---------------+--------------------+-----+
|              symbol|     DATA-VALUE|                Date|price|
+--------------------+---------------+--------------------+-----+
|FRED_00XALCCHM086...|2005-01-01,97.0|2005-01-01 00:00:...| 97.0|
|FRED_00XALCCHM086...|2005-02-01,97.3|2005-02-01 00:00:...| 97.3|
|FRED_00XALCCHM086...|2005-03-01,97.4|2005-03-01 00:00:...| 97.4|
+--------------------+---------------+--------------------+-----+
only showing top 3 rows



In [293]:
new_df.select("symbol").distinct().count()

40809

In [315]:
#freq = BusinessDayFrequency(1, 1, sc)
freq = DayFrequency(1,sc)
dtIndex = uniform(start='2005-02-01T00:00-05:00', end='2016-10-01T00:00-05:00', freq=freq, sc=sc)

In [316]:

dates = ("2005-01-01",  "2016-10-01")
date_from, date_to = [f.to_date(f.lit(s)).cast(TimestampType()) for s in dates]
df_filtered = new_df.where((new_df.Date > date_from) & (new_df.Date < date_to))

In [317]:
df_filtered.show(3)

+--------------------+---------------+--------------------+-----+
|              symbol|     DATA-VALUE|                Date|price|
+--------------------+---------------+--------------------+-----+
|FRED_00XALCCHM086...|2005-02-01,97.3|2005-02-01 00:00:...| 97.3|
|FRED_00XALCCHM086...|2005-03-01,97.4|2005-03-01 00:00:...| 97.4|
|FRED_00XALCCHM086...|2005-04-01,98.2|2005-04-01 00:00:...| 98.2|
+--------------------+---------------+--------------------+-----+
only showing top 3 rows



In [318]:
df = df_filtered.select(["symbol", "Date", "price"])
df = df.withColumnRenamed("Date", "timestamp")

In [319]:
df.show()

+--------------------+--------------------+-----+
|              symbol|           timestamp|price|
+--------------------+--------------------+-----+
|FRED_00XALCCHM086...|2005-02-01 00:00:...| 97.3|
|FRED_00XALCCHM086...|2005-03-01 00:00:...| 97.4|
|FRED_00XALCCHM086...|2005-04-01 00:00:...| 98.2|
|FRED_00XALCCHM086...|2005-05-01 00:00:...| 98.2|
|FRED_00XALCCHM086...|2005-06-01 00:00:...| 97.9|
|FRED_00XALCCHM086...|2005-07-01 00:00:...| 97.3|
|FRED_00XALCCHM086...|2005-08-01 00:00:...| 97.4|
|FRED_00XALCCHM086...|2005-09-01 00:00:...| 97.8|
|FRED_00XALCCHM086...|2005-10-01 00:00:...| 98.6|
|FRED_00XALCCHM086...|2005-11-01 00:00:...| 98.4|
|FRED_00XALCCHM086...|2005-12-01 00:00:...| 98.3|
|FRED_00XALCCHM086...|2006-01-01 00:00:...| 98.2|
|FRED_00XALCCHM086...|2006-02-01 00:00:...| 98.4|
|FRED_00XALCCHM086...|2006-03-01 00:00:...| 98.4|
|FRED_00XALCCHM086...|2006-04-01 00:00:...| 99.2|
|FRED_00XALCCHM086...|2006-05-01 00:00:...| 99.3|
|FRED_00XALCCHM086...|2006-06-01 00:00:...| 99.3|


In [320]:
tickerTsrdd = time_series_rdd_from_observations(dtIndex, df, "timestamp", "symbol", "price")

In [321]:
tickerTsrdd.take(3)

[(u'FRED_00XALCCHM086NEST',
  array([ 97.3,   nan,   nan, ...,   nan,   nan,   nan])),
 (u'FRED_00XALCFIM086NEST',
  array([ 82.61,    nan,    nan, ...,    nan,    nan,    nan])),
 (u'FRED_00XAPFEEM086NEST',
  array([ 68.34,    nan,    nan, ...,    nan,    nan,    nan]))]

In [322]:
filled = tickerTsrdd.fill("linear")

In [323]:
filled.take(2)

[(u'FRED_00XALCCHM086NEST',
  array([ 97.3       ,  97.30357143,  97.30714286, ...,          nan,
                  nan,          nan])),
 (u'FRED_00XALCFIM086NEST',
  array([ 82.61      ,  82.62285714,  82.63571429, ...,          nan,
                  nan,          nan]))]

In [326]:
previous = filled.fill("previous")

In [327]:
previous.take(3)

[(u'FRED_00XALCCHM086NEST',
  array([ 97.3       ,  97.30357143,  97.30714286, ...,  99.61      ,
          99.61      ,  99.61      ])),
 (u'FRED_00XALCFIM086NEST',
  array([  82.61      ,   82.62285714,   82.63571429, ...,  100.27      ,
          100.27      ,  100.27      ])),
 (u'FRED_00XAPFEEM086NEST',
  array([  68.34      ,   68.34678571,   68.35357143, ...,  101.52      ,
          101.52      ,  101.52      ]))]

In [328]:
rr = previous.return_rates()

In [329]:
rr.take(3)

[(u'FRED_00XALCCHM086NEST',
  array([  3.67053296e-05,   3.67039824e-05,   3.67026352e-05, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00])),
 (u'FRED_00XALCFIM086NEST',
  array([ 0.00015564,  0.00015561,  0.00015559, ...,  0.        ,
          0.        ,  0.        ])),
 (u'FRED_00XAPFEEM086NEST',
  array([  9.92934487e-05,   9.92835905e-05,   9.92737343e-05, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00]))]