In [16]:
import pymc3 as pm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from statsmodels.tsa.stattools import grangercausalitytests
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

import seaborn as sns

plt.style.use('fivethirtyeight')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [17]:
coffee = pd.read_csv('./working_coffee_csv.csv')
coffee.head()


Unnamed: 0,Date,Title,Price,Price_Change,Direction,Rate_of_Change,CV_Vectors,TFIDF_Vectors,Hash_Vectors
0,2007-01-02,India earns more from higher coffee exports in...,1.1506,0.0,0,0.0,[0 0 0 ... 0 0 0],[0. 0. 0. ... 0. 0. 0.],[0. 0. 0. ... 0. 0. 0.]
1,2007-01-03,Friesland raises stake in Indonesian subsidiar...,1.176,0.0254,0,0.022075,[0 0 0 ... 0 0 0],[0. 0. 0. ... 0. 0. 0.],[0. 0. 0. ... 0. 0. 0.]
2,2007-01-04,Nymex announces start date for soft commodity ...,1.1451,-0.0309,0,-0.026276,[0 0 0 ... 0 0 0],[0. 0. 0. ... 0. 0. 0.],[0. 0. 0. ... 0. 0. 0.]
3,2007-01-05,India's largest coffee chain extends to Pakistan,1.1506,0.0055,0,0.004803,[0 0 0 ... 0 0 0],[0. 0. 0. ... 0. 0. 0.],[0. 0. 0. ... 0. 0. 0.]
4,2007-01-07,Honduran coffee sales Ugandan coffee funds Soy...,1.1506,-0.0,0,0.0,[0 0 0 ... 0 0 0],[0. 0. 0. ... 0. 0. 0.],[0. 0. 0. ... 0. 0. 0.]


In [18]:
coffee["Date"] = pd.to_datetime(coffee['Date'])
coffee.set_index('Date', inplace=True)
coffee.tail()

Unnamed: 0_level_0,Title,Price,Price_Change,Direction,Rate_of_Change,CV_Vectors,TFIDF_Vectors,Hash_Vectors
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-03-26,Futures Review: Arabica futures fall to one mo...,1.1107,0.0034,0,0.003071,[0 0 0 ... 0 0 0],[0. 0. 0. ... 0. 0. 0.],[0. 0. 0. ... 0. 0. 0.]
2018-03-27,Futures Review: Coffee ends narrowly mixed,1.1262,0.0155,0,0.013955,[0 0 0 ... 0 0 0],[0. 0. 0. ... 0. 0. 0.],[0. 0. 0. ... 0. 0. 0.]
2018-03-28,Futures Review: Coffee moves away from recent ...,1.1262,-0.0,0,0.0,[0 0 0 ... 0 0 0],[0. 0. 0. ... 0. 0. 0.],[0. 0. 0. ... 0. 0. 0.]
2018-03-29,"IEG Vu: Easter email alert schedule, customer ...",1.1239,-0.0023,0,-0.002042,[0 0 0 ... 0 0 0],[0. 0. 0. ... 0. 0. 0.],[0. 0. 0. ... 0. 0. 0.]
2018-04-03,Futures Review: Arabica coffee hits 10-month l...,1.1262,0.0023,0,0.002046,[0 0 0 ... 0 0 0],[0. 0. 0. ... 0. 0. 0.],[0. 0. 0. ... 0. 0. 0.]


In [20]:
coffee.shape

(1465, 8)

In [21]:
tf = TfidfVectorizer()

vector_tfidf = tf.fit(coffee['Title'])

vector_tfidf_transformed = tf.fit_transform(coffee['Title'])

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [22]:
vector_tfidf_transformed = pd.DataFrame(vector_tfidf_transformed.toarray(), index=coffee.index)

In [23]:
vector_tfidf_transformed.shape

(1465, 3679)

In [24]:
temp_df = pd.concat([coffee, vector_tfidf_transformed],axis=1)

In [10]:
print(temp_df.shape)

temp_df.head(2)

(1465, 3687)


Unnamed: 0_level_0,Title,Price,Price_Change,Direction,Rate_of_Change,CV_Vectors,TFIDF_Vectors,Hash_Vectors,0,1,...,3669,3670,3671,3672,3673,3674,3675,3676,3677,3678
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-01-02,India earns more from higher coffee exports in...,1.1506,0.0,0,0.0,[0 0 0 ... 0 0 0],[0. 0. 0. ... 0. 0. 0.],[0. 0. 0. ... 0. 0. 0.],0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2007-01-03,Friesland raises stake in Indonesian subsidiar...,1.176,0.0254,0,0.022075,[0 0 0 ... 0 0 0],[0. 0. 0. ... 0. 0. 0.],[0. 0. 0. ... 0. 0. 0.],0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
new_df = temp_df.drop(['Title', 'Price', 'Direction', 'CV_Vectors', 'TFIDF_Vectors','Hash_Vectors'], axis=1)

In [29]:
new_df.tail(2)

Unnamed: 0_level_0,Price_Change,Rate_of_Change,0,1,2,3,4,5,6,7,...,3669,3670,3671,3672,3673,3674,3675,3676,3677,3678
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-03-29,-0.0023,-0.002042,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-04-03,0.0023,0.002046,0.0,0.0,0.0,0.0,0.0,0.0,0.225947,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
with pm.Model() as cup_of_joe:
    print("Intializing intercept...")
    intercept = pm.Normal('Intercept', mu=0, sd=10)
    
    print("Intializing beta..") 
    beta = pm.Normal('beta', mu=0, sd=10, shape=3681)
    
    print("Intializing sigma..") 
    sigma = pm.HalfNormal('price_std', sd=1)
    
    mu = intercept
    for i in range(3679):
        mu += beta[i]*new_df.iloc[:,i]
    
    price = pm.Normal('Price', mu=mu, sd=sigma, observed=temp_df['Price'])

print("All Done!")

Intializing intercept...
Intializing beta..
Intializing sigma..
All Done!


In [None]:
with cup_of_joe:
    start = pm.find_MAP()
    trace = pm.sample(3500, start=start)

In [11]:
# with pm.Model() as cup_of_joe:
#     intercept = pm.Normal('Intercept', mu=0, sd=10)
#     beta = pm.Normal('beta', mu=0, sd=10, shape=3679)
#     sigma = pm.HalfNormal('price_std', sd=1)
    
#     mu = intercept + np.dot(beta, temp_df.iloc[:, 5:])
    
#     price = pm.Normal('Price', mu=mu, sd=sigma, observed=temp_df['Price'])

In [12]:
# with cup_of_joe:
#     start = pm.find_MAP()
#     trace = pm.sample(3500, start=start)

In [13]:
with pm.Model() as cup_of_joe:
    print("Intializing intercept...")
    intercept = pm.Normal('Intercept', mu=0, sd=10)
    
    print("Intializing beta..") 
    beta = pm.Normal('beta', mu=0, sd=10, shape=3679)
    
    print("Intializing sigma..") 
    sigma = pm.HalfNormal('price_std', sd=1)

Intializing intercept...
Intializing beta..
Intializing sigma..


In [15]:
beta.type

TensorType(float64, vector)

In [None]:
with pm.Model() as cup_of_joe:
    print("Intializing intercept...")
    intercept = pm.Normal('Intercept', mu=0, sd=10)
    
    print("Intializing beta..") 
    beta = pm.Normal('beta', mu=0, sd=10, shape=3679)
    
    print("Intializing sigma..") 
    sigma = pm.HalfNormal('price_std', sd=1)
    
        betas = []

        for index in range(3679):

            if index % 1000 == 0:
                print("... working sucka ...")
            betas.append(beta[index])

        betas_np = np.array(betas)

        x_vals = []

        for index in range(3679):

            if index % 1000 == 0:
                print("... working sucka ...")
            x_vals.append(temp_df.iloc[:, 5:])

        x_vals_np = np.array(x_vals)
    
    mu = intercept + np.dot(betas_np, x_vals_np)
    
    price = pm.Normal('Price', mu=mu, sd=sigma, observed=temp_df['Price'])

    
    #partial = np.sum([index, col for index, col in enumerate(temp_df.iloc[:, 5:].columns.tolist())]))
    
    # mu = intercept + np.dot(beta, temp_df.iloc[:, 5:])
    
    #price = pm.Normal('Price', mu=mu, sd=sigma, observed=temp_df['Price'])

Intializing intercept...
Intializing beta..
Intializing sigma..
... working sucka ...
... working sucka ...
... working sucka ...
... working sucka ...
... working sucka ...


In [None]:
with cup_of_joe:
    start = pm.find_MAP()
    trace = pm.sample(3500, start=start)

In [None]:
# x_vals = []

# for index in range(3679):
        
#     if index % 100 == 0:
#         print("... working sucka ...")
#     x_vals.append(temp_df[index])
        
# x_vals_np = np.array(x_vals)

In [None]:
print(x_vals_np.shape)
print(betas_np.shape)

In [None]:
with pm.Model() as cup_of_joe:
    print("Intializing intercept...")
    intercept = pm.Normal('Intercept', mu=0, sd=10)
    
    print("Intializing beta..") 
    beta = pm.Normal('beta', mu=0, sd=10, shape=3679)
    
    print("Intializing sigma..") 
    sigma = pm.HalfNormal('price_std', sd=1)
    
    mu = intercept + np.dot(betas_np, x_vals_np)
    
    price = pm.Normal('Price', mu=mu, sd=sigma, observed=temp_df['Price'])

In [None]:
mu = intercept + np.dot(betas_np, temp_df.iloc[:, 5:]) #need to match index and iloc

In [None]:
# cv = CountVectorizer()

# vector_cv = cv.fit_transform(coffee['Title'])

# vector_transformed = cv.fit_transform(coffee['Title'])

# coffee["CV_Vectors"] = list(vector_transformed.toarray())

# coffee.tail(2)

In [None]:
# hashed = HashingVectorizer()

# vector_hashed_transformed = hashed.fit_transform(coffee['Title'])

# coffee['Hash_Vectors'] = list(vector_hashed_transformed.toarray())
# coffee.tail(2)

In [None]:
#coffee.to_csv('working_coffee_csv.csv')

In [None]:
# unpack the columns - start with TFIDF Vectors save as a new DF join with the current coffee DF. need truncated SVD.
# can run TFIDF (or any other column) through PCA.

In [None]:
temp_df.shape

In [None]:
temp_df.head(2)

In [None]:
with pm.Model() as cup_of_joe:
    intercept = pm.Normal('Intercept', mu=0, sd=10)
    beta = pm.Normal('beta', mu=0, sd=10, shape=3679)
    sigma = pm.HalfNormal('price_std', sd=1)
    
    #mu = intercept + for i in range(3678):
    #    beta[i]
    #    temp_df[i]
    
    mu = intercept + np.dot(beta, temp_df.iloc[:, 5:])
    
    
    price = pm.Normal('Price', mu=mu, sd=sigma, observed=temp_df['Price'])

In [None]:
with cup_of_joe:
    start = pm.find_MAP()
    trace = pm.sample(3500, start=start)

In [None]:
# len(temp_df.columns[6:].tolist())
temp_df.columns[5:]

In [None]:
intercept + np.dot(beta, temp_df.iloc[:, 5:].head())

In [None]:
[(beta[index], col) for index, col in enumerate(temp_df.columns.tolist())]
# temp_df[temp_df.columns[5:]].head(5)


In [None]:
beta.

In [None]:
# with pm.Model() as cup_of_joe:
#     print("Intializing intercept...")
#     intercept = pm.Normal('Intercept', mu=0, sd=10)
    
#     print("Intializing beta..") 
#     beta = pm.Normal('beta', mu=0, sd=10, shape=3679)
    
#     print("Intializing sigma..") 
#     sigma = pm.HalfNormal('price_std', sd=1)
    
#     betas = []

#     for index in range(3679):

#       if index % 1000 == 0:
#         print("... working sucka ...")
#       betas.append(beta[index])

#       betas_np = np.array(betas)

#     x_vals = []

#     for index in range(3679):

#       if index % 1000 == 0:
#         print("... working sucka ...")
#       x_vals.append(temp_df.iloc[:, 5:])

#       x_vals_np = np.array(x_vals)
    
#     mu = intercept + np.dot(betas_np, x_vals_np)
    
#     price = pm.Normal('Price', mu=mu, sd=sigma, observed=temp_df['Price'])