In [1]:
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
%matplotlib inline

from operator import itemgetter
from itertools import groupby

import nltk
from sklearn.linear_model import LogisticRegression, LinearRegression, BayesianRidge
from sklearn import metrics, svm
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveRegressor, ARDRegression, SGDRegressor, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.base import TransformerMixin
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_colwidth', 999)

import warnings
def ignore_warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    return None


# Rebind ``warnings.warn`` to the no-op so calls routed through it are silenced
# for the rest of the session (the original note cites sklearn and seaborn).
warnings.warn = ignore_warn

In [2]:
class DenseTransformer(TransformerMixin):
    """Stateless pipeline step that turns a sparse matrix into a dense array.

    Intended to sit between a vectorizer (sparse output) and an estimator that
    needs dense input. It learns nothing from the data.
    """

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        ``toarray()`` is used rather than ``todense()``: the latter returns a
        ``numpy.matrix``, which newer scikit-learn releases refuse as input.
        Objects without a ``toarray`` method (already dense) go through
        ``np.asarray`` instead of failing.
        """
        if hasattr(X, "toarray"):
            return X.toarray()
        return np.asarray(X)

    def fit_transform(self, X, y=None, **fit_params):
        """Call ``fit`` (a no-op) and return the dense form of ``X``."""
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        """Nothing to fit; return ``self`` so the step can be chained."""
        return self

In [3]:
# Load the working coffee dataset from a CSV located next to the notebook.
coffee = pd.read_csv('./working_coffee_csv.csv')

In [4]:
# Preview the last two rows of the loaded frame.
coffee.tail(2)

Unnamed: 0,Date,Title,Price,Price_Change,Direction,Rate_of_Change,CV_Vectors,TFIDF_Vectors,Hash_Vectors
1463,2018-03-29,"IEG Vu: Easter email alert schedule, customer support hours Weekly Review: Funds still favor coffee shorts Futures Review: Robusta coffee drops in low volume Brazilian coffee exports seen at 35 mln 60-kg bags",1.1239,-0.0023,0,-0.002042,[0 0 0 ... 0 0 0],[0. 0. 0. ... 0. 0. 0.],[0. 0. 0. ... 0. 0. 0.]
1464,2018-04-03,Futures Review: Arabica coffee hits 10-month low Central American coffee exports mixed in March Brazilian green coffee exports down y-o-y Ivory Coast and Indonesian coffee exports fell sharply,1.1262,0.0023,0,0.002046,[0 0 0 ... 0 0 0],[0. 0. 0. ... 0. 0. 0.],[0. 0. 0. ... 0. 0. 0.]


In [5]:
# Index the frame by 'Date'. The guard and the rebinding (instead of
# inplace=True) keep this cell safe to re-run: once 'Date' is already the
# index, a second set_index('Date') would raise KeyError.
if 'Date' in coffee.columns:
    coffee = coffee.set_index('Date')

In [None]:
# plt.figure(figsize = (14,10))
# plt.plot(coffee['Price'], label='Price')
# plt.plot(coffee['Price'].rolling(30).mean(), label='30 Day MA')
# plt.plot(coffee['Price'].rolling(50).mean(), label = '50 Day MA')
# plt.xlabel("Days")
# plt.ylabel('Price $USD/ Pound')
# plt.legend()

## Count Vectorizer Pipeline

In [None]:
# Regression target and feature frame.
y = coffee['Price']
X = coffee.loc[:, ['Title', 'Price_Change', 'Rate_of_Change']]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

# Unfitted components: bag-of-words vectorizer and support vector regressor.
svr = svm.SVR()
cv = CountVectorizer()


In [None]:

def title(df):
    """Return the 'Title' column of ``df`` so the vectorizer gets a single text column."""
    return df['Title']


# The vectorizer cannot be handed a DataFrame, so the first step isolates the titles.
title_tf = FunctionTransformer(title, validate=False)

text_pipeline_cv = Pipeline([
    ('title_tf', title_tf),
    ('cv', cv),
    ('svr', svr),
])

params = {
    'cv__stop_words': [None, 'english'],
    # Each n-gram range appears once: a repeated entry only re-fits identical
    # candidates and lengthens the search without changing the best result.
    'cv__ngram_range': [(1, 2), (1, 3)],
    'cv__max_features': [None, 1000],
    'cv__binary': [True, False],
    'cv__min_df': [1, 2, 3],
    'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'svr__shrinking': [True, False],
    'svr__gamma': ['auto', 1.0, 2.0],
}

# Exhaustive search over the grid, then report the best CV score and its parameters.
gs = GridSearchCV(text_pipeline_cv, param_grid=params)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

In [None]:
# Score the fitted grid search on the held-out test split.
gs.score(X_test, y_test)







## Passive Aggressive Regressor Pipeline

In [None]:
# Unfitted components: bag-of-words vectorizer and passive-aggressive regressor.
cv = CountVectorizer()
pa = PassiveAggressiveRegressor()

X = coffee[['Title', 'Rate_of_Change', 'Price_Change']]
y = coffee['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)


def title(df):
    """Return the 'Title' column of ``df`` so the vectorizer gets a single text column."""
    return df['Title']


# The vectorizer cannot be handed a DataFrame, so the first step isolates the titles.
title_tf = FunctionTransformer(title, validate=False)

text_pipeline_cv = Pipeline([
    ('title_tf', title_tf),
    ('cv', cv),
    ('pa', pa),
])

params = {
    'cv__stop_words': [None, 'english'],
    # Each n-gram range appears once: a repeated entry only re-fits identical
    # candidates and lengthens the search.
    'cv__ngram_range': [(1, 2), (1, 3), (2, 3)],
    'cv__max_features': [None, 1000, 1500, 2000],
    'cv__binary': [True, False],
    'cv__min_df': [1, 2, 3],
    'cv__max_df': [1.0, .75],
    # `max_iter` is the supported name for the number of passes over the data;
    # the older `n_iter` spelling is rejected by current scikit-learn releases.
    'pa__max_iter': [5, 10, 15, 20, 25, 30, 35, 40],
    'pa__loss': ['squared_epsilon_insensitive', 'epsilon_insensitive'],
    'pa__C': [1.0, .5, .25, .125, .0675],
}

# Exhaustive search over the grid, then report the best CV score and its parameters.
gs = GridSearchCV(text_pipeline_cv, param_grid=params)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

In [None]:
# Score the fitted grid search on the held-out test split.
gs.score(X_test, y_test)

## ARD Regression

In [None]:
# Target series and feature frame (title() below selects only 'Title').
y = coffee['Price']
X = coffee.loc[:, ['Title', 'Rate_of_Change', 'Price_Change']]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Unfitted components for the ARD pipeline.
ard = ARDRegression()
cv = CountVectorizer()
to_dense = DenseTransformer()


def title(df):
    """Return the 'Title' column of ``df``."""
    return df['Title']


# Wrapped so the column selection can act as a pipeline step.
title_tf = FunctionTransformer(func=title, validate=False)

In [None]:
# Titles -> token counts -> dense array -> ARD regression.
text_pipeline_ard = Pipeline(steps=[
    ('title_tf', title_tf),
    ('cv', cv),
    ('to_dense', DenseTransformer()),
    ('ard', ard),
])

# Disabled grid search, kept for reference:
# params = {
#     'cv__stop_words': [None,'english'],
#     'cv__ngram_range' : [(1,2), (1,2), (1,3), (2,3)],
#     'cv__max_features' : [None, 1000, 1500],
#     'cv__binary' : [True, False],
#     'cv__min_df' : [1,2,3,4],
#     'cv__max_df' : [1.0, .75],
#     'ard__n_iter' : [300,400,500,600],
#     'ard__compute_score' : [True, False],
#     'ard__tol' : [.001,.0001,.00001]
# }
# gs = GridSearchCV(text_pipeline_ard, param_grid=params)
# gs.fit(X_train, y_train)
# print(gs.best_score_)
# gs.best_params_

# Fit with default settings; fit() returns the pipeline itself, so `results`
# refers to the same fitted pipeline object.
text_pipeline_ard.fit(X_train, y_train)
results = text_pipeline_ard
print(results)

In [None]:
# Score the fitted ARD pipeline on the training split.
# NOTE(review): this is the training score, not a held-out score — confirm that is intended.
results.score(X_train, y_train)

In [None]:
# Predict on the held-out test split with the fitted ARD pipeline.
results.predict(X_test)

## Stochastic Gradient Descent Regressor.

In [None]:
# Unfitted components: bag-of-words vectorizer and SGD regressor.
cv = CountVectorizer()
sgdreg = SGDRegressor()

# Only 'Title' is selected: 'Date' was moved into the index by the earlier
# set_index('Date') call, so asking for it as a column raises KeyError, and
# title() below reads nothing but 'Title' anyway.
X = coffee[['Title']]
y = coffee['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)


def title(df):
    """Return the 'Title' column of ``df`` so the vectorizer gets a single text column."""
    return df['Title']


# The vectorizer cannot be handed a DataFrame, so the first step isolates the titles.
title_tf = FunctionTransformer(title, validate=False)

In [None]:
# Titles -> token counts -> SGD regressor.
# (The variable keeps its historical `_bayes` name; the estimator is SGDRegressor.)
text_pipeline_bayes = Pipeline([
    ('title_tf', title_tf),
    ('cv', cv),
    ('sgdreg', sgdreg),
])

params = {
    'cv__stop_words': [None, 'english'],
    # Each n-gram range appears once: a repeated entry only re-fits identical
    # candidates and lengthens the search.
    'cv__ngram_range': [(1, 2), (1, 3), (2, 3)],
    'cv__max_features': [None, 1000, 1500],
    'cv__binary': [True, False],
    'cv__min_df': [1, 2, 3, 4],
    'cv__max_df': [1.0, .75],
    # NOTE(review): 'squared_loss' is spelled 'squared_error' from
    # scikit-learn 1.0 onwards — adjust to the installed version.
    'sgdreg__loss': ['squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    'sgdreg__penalty': ['none', 'l2', 'l1', 'elasticnet'],
    # `max_iter` is the supported name for the number of passes over the data;
    # the older `n_iter` spelling is rejected by current scikit-learn releases.
    'sgdreg__max_iter': [5, 10, 15],
}

# Exhaustive search over the grid, then report the best CV score and its parameters.
gs = GridSearchCV(text_pipeline_bayes, param_grid=params)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

In [None]:
# Score the fitted grid search on the held-out test split.
gs.score(X_test, y_test)

# New DF


In [None]:
# Target series and feature frame (title() below selects only 'Title').
y = coffee['Price']
X = coffee.loc[:, ['Title', 'Price_Change', 'Rate_of_Change']]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Unfitted components: decision tree regressor and bag-of-words vectorizer.
tree = DecisionTreeRegressor()
cv = CountVectorizer()


def title(df):
    """Return the 'Title' column of ``df``."""
    return df['Title']


# Wrapped so the column selection can act as a pipeline step.
title_tf = FunctionTransformer(func=title, validate=False)

In [None]:
# Titles -> token counts -> decision tree regressor.
text_pipeline_tree = Pipeline([
    ('title_tf', title_tf),
    ('cv', cv),
    ('tree', tree),
])

params = {
    'cv__stop_words': [None, 'english'],
    # Each n-gram range appears once: a repeated entry only re-fits identical
    # candidates and lengthens the search.
    'cv__ngram_range': [(1, 2), (1, 3)],
    'cv__max_features': [None, 1000, 1500],
    'cv__min_df': [1, 2, 3],
    # NOTE(review): 'mse'/'mae' are spelled 'squared_error'/'absolute_error'
    # from scikit-learn 1.0 onwards — adjust to the installed version.
    'tree__criterion': ['mse', 'mae'],
    'tree__max_features': ['auto', 'log2', 'sqrt'],
    'tree__max_depth': [5, 10],
}

# Exhaustive search over the grid, then report the best CV score and its parameters.
gs = GridSearchCV(text_pipeline_tree, param_grid=params)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

In [None]:
# Score the fitted grid search on the held-out test split.
gs.score(X_test, y_test)

## Bayesian Regression 


In [None]:
# Reload the dataset and drop the 'Unnamed: 0' / 'Unnamed: 0.1' columns.
# The labels are passed as a plain list and the result is rebound, rather than
# passing a DataFrame as `labels` and mutating in place.
coffee = pd.read_csv('./updated_coffee_df.csv')
coffee = coffee.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
coffee['Date'] = pd.to_datetime(coffee['Date'])

bayes = BayesianRidge()

X = coffee['Title']
y = coffee['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)
ss = StandardScaler()

# Vectorize the text first: the vocabulary and IDF weights are learned on the
# training titles only.
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train)

# Summarize the fit without dumping the whole vocabulary dict.
print(len(vectorizer.vocabulary_))
print(vectorizer.idf_)

# Encode both splits with the training vocabulary.
X_train_vector = vectorizer.transform(X_train)
X_test_vector = vectorizer.transform(X_test)

# StandardScaler is fitted on the dense training matrix and re-used for the test matrix.
X_train_scaled = ss.fit_transform(X_train_vector.toarray())
X_test_scaled = ss.transform(X_test_vector.toarray())

In [None]:
# Fit Bayesian ridge regression on the scaled TF-IDF training features.
bayes.fit(X_train_scaled, y_train)

In [None]:
# Score on the training split.
bayes.score(X_train_scaled, y_train)

In [None]:
# Score on the held-out test split.
bayes.score(X_test_scaled, y_test)

In [None]:
# `preds` is not assigned by any earlier cell, so it is computed here from the
# fitted Bayesian ridge model on the scaled test features before plotting.
preds = bayes.predict(X_test_scaled)
plt.plot(preds)

## KNN Regression

In [None]:
# Target series and feature frame (title() below selects only 'Title').
y = coffee['Price']
X = coffee.loc[:, ['Title', 'Date', 'Price_Change', 'Rate_of_Change']]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Unfitted components: k-nearest-neighbours regressor and bag-of-words vectorizer.
knn = KNeighborsRegressor()
cv = CountVectorizer()


def title(df):
    """Return the 'Title' column of ``df``."""
    return df['Title']


# Wrapped so the column selection can act as a pipeline step.
title_tf = FunctionTransformer(func=title, validate=False)

In [None]:
# Titles -> token counts -> k-nearest-neighbours regressor.
text_pipeline_knn = Pipeline([
    ('title_tf', title_tf),
    ('cv', cv),
    ('knn', knn),
])

params = {
    'cv__stop_words': [None, 'english'],
    # Each n-gram range appears once: a repeated entry only re-fits identical
    # candidates and lengthens the search.
    'cv__ngram_range': [(1, 2), (1, 3)],
    'cv__max_features': [None, 1000, 1500],
    'cv__min_df': [1, 2, 3],
    'knn__n_neighbors': [1, 2, 3, 4, 5, 6],
    'knn__algorithm': ['auto', 'brute'],
    'knn__weights': ['uniform', 'distance'],
}

# Exhaustive search over the grid, then report the best CV score and its parameters.
gs = GridSearchCV(text_pipeline_knn, param_grid=params)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

## ARD Regression (standardized TF-IDF features)


In [None]:
# Reload the dataset and drop the 'Unnamed: 0' / 'Unnamed: 0.1' columns.
# The labels are passed as a plain list and the result is rebound, rather than
# passing a DataFrame as `labels` and mutating in place.
coffee = pd.read_csv('./updated_coffee_df.csv')
coffee = coffee.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
coffee['Date'] = pd.to_datetime(coffee['Date'])

ard = ARDRegression()

X = coffee['Title']
y = coffee['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)
ss = StandardScaler()

# Vectorize the text first: the vocabulary and IDF weights are learned on the
# training titles only.
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train)

# Summarize the fit without dumping the whole vocabulary dict.
print(len(vectorizer.vocabulary_))
print(vectorizer.idf_)

# Encode both splits with the training vocabulary.
X_train_vector = vectorizer.transform(X_train)
X_test_vector = vectorizer.transform(X_test)

# StandardScaler is fitted on the dense training matrix and re-used for the test matrix.
X_train_scaled = ss.fit_transform(X_train_vector.toarray())
X_test_scaled = ss.transform(X_test_vector.toarray())

In [None]:
# Fit ARD regression on the scaled TF-IDF training features.
ard.fit(X_train_scaled, y_train)

In [None]:
# Score on the training split.
ard.score(X_train_scaled, y_train)

In [None]:
# Score on the held-out test split.
ard.score(X_test_scaled, y_test)

## Ridge Regression 

In [None]:
# Target series and feature frame (title() below selects only 'Title').
y = coffee['Price']
X = coffee.loc[:, ['Title', 'Price_Change', 'Rate_of_Change']]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Unfitted components: ridge regressor and bag-of-words vectorizer.
ridge = Ridge()
cv = CountVectorizer()


def title(df):
    """Return the 'Title' column of ``df``."""
    return df['Title']


# Wrapped so the column selection can act as a pipeline step.
title_tf = FunctionTransformer(func=title, validate=False)

In [None]:
# Titles -> token counts -> ridge regression.
text_pipeline_ridge = Pipeline(steps=[
    ('title_tf', title_tf),
    ('cv', cv),
    ('ridge', ridge),
])

# Search grid, spelled as keyword arguments (the `step__param` names are valid identifiers).
# NOTE(review): Ridge's `normalize` option no longer exists in scikit-learn >= 1.2 —
# confirm against the installed version.
params = dict(
    cv__stop_words=[None],
    cv__ngram_range=[(1,3), (2,3), (3,4), (4,5)],
    cv__max_features=[1000, 1500, 2000],
    cv__min_df=[1,2,3],
    #cv__norm=['l1', 'l2', None],
    ridge__alpha=[ 4., 4.5, 5, 5.5, 6.0, 6.5],
    ridge__max_iter=[2500, 2750, 2800, 2900],
    ridge__solver=['auto','cholesky'],
    ridge__normalize=[True, False],
    ridge__tol=[.01, .001, .0001],
)

# Exhaustive search, then report the best CV score and its parameters.
gs = GridSearchCV(estimator=text_pipeline_ridge, param_grid=params)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

## Lasso Regression 

In [None]:
# Target series and feature frame (title() below selects only 'Title').
y = coffee['Price']
X = coffee.loc[:, ['Title', 'Date', 'Price_Change', 'Rate_of_Change']]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Unfitted components: lasso regressor and TF-IDF vectorizer
# (no CountVectorizer is created in this cell).
lasso = Lasso()
tf = TfidfVectorizer()


def title(df):
    """Return the 'Title' column of ``df``."""
    return df['Title']


# Wrapped so the column selection can act as a pipeline step.
title_tf = FunctionTransformer(func=title, validate=False)

In [None]:
# Titles -> TF-IDF features -> lasso regression.
# The vectorizer step uses `tf`, the TfidfVectorizer created in this section's
# setup cell, instead of depending on a `cv` object left over from an earlier
# section (hidden state that fails on a fresh kernel run of this section alone).
text_pipeline_lasso = Pipeline([
    ('title_tf', title_tf),
    ('tf', tf),
    ('lasso', lasso),
])

params = {
    'tf__stop_words': [None, 'english'],
    # Each n-gram range appears once: a repeated entry only re-fits identical
    # candidates and lengthens the search.
    'tf__ngram_range': [(1, 2), (1, 3)],
    'tf__max_features': [None, 1000, 1500],
    'tf__min_df': [1, 2, 3],
    'lasso__max_iter': [1000, 2000, 3000, 4000],
    'lasso__tol': [.001, .0001, .00001],
}

# Exhaustive search over the grid, then report the best CV score and its parameters.
gs = GridSearchCV(text_pipeline_lasso, param_grid=params)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

## Elastic Net Regression.



In [None]:
# Target series and feature frame (title() below selects only 'Title').
y = coffee['Price']
X = coffee.loc[:, ['Title', 'Date', 'Price_Change', 'Rate_of_Change']]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Unfitted components: elastic net regressor and bag-of-words vectorizer.
enet = ElasticNet()
cv = CountVectorizer()


def title(df):
    """Return the 'Title' column of ``df``."""
    return df['Title']


# Wrapped so the column selection can act as a pipeline step.
title_tf = FunctionTransformer(func=title, validate=False)

In [None]:
# Titles -> token counts -> elastic net regression.
text_pipeline_enet = Pipeline(steps=[
    ('title_tf', title_tf),
    ('cv', cv),
    ('enet', enet),
])

# Search grid, spelled as keyword arguments (the `step__param` names are valid identifiers).
params = dict(
    cv__stop_words=[None,'english'],
    cv__ngram_range=[(1,2), (1,3), (1,4)],
    cv__max_features=[None, 1000, 1500],
    cv__min_df=[1,2,3],
    enet__alpha=[1., 1.5, 2.0, 2.5],
    enet__l1_ratio=[.25, .5, .75],
    enet__precompute=[True, False],
    enet__max_iter=[1000,1500, 2000, 2500],
)

# Exhaustive search, then report the best CV score and its parameters.
gs = GridSearchCV(estimator=text_pipeline_enet, param_grid=params)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

## Random Forest Regressor

In [6]:
# Preview the first two rows of the current frame.
coffee.head(2)

Unnamed: 0_level_0,Title,Price,Price_Change,Direction,Rate_of_Change,CV_Vectors,TFIDF_Vectors,Hash_Vectors
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2007-01-02,India earns more from higher coffee exports in 2006,1.1506,0.0,0,0.0,[0 0 0 ... 0 0 0],[0. 0. 0. ... 0. 0. 0.],[0. 0. 0. ... 0. 0. 0.]
2007-01-03,"Friesland raises stake in Indonesian subsidiary MILK PRODUCT PRICES IN EUROPE AND THE US Milk product prices, Jan 3 Starbucks reveals trans fat removal plan Indonesian coffee shipments stable after floods",1.176,0.0254,0,0.022075,[0 0 0 ... 0 0 0],[0. 0. 0. ... 0. 0. 0.],[0. 0. 0. ... 0. 0. 0.]


In [6]:
# Unfitted components for the random forest pipeline.
cv = CountVectorizer()
rf = RandomForestRegressor()

In [7]:
# Target series and feature frame (title() below selects only 'Title').
y = coffee['Price']
X = coffee.loc[:, ['Price_Change', 'Rate_of_Change', 'Title']]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


def title(df):
    """Return the 'Title' column of ``df``."""
    return df['Title']


# Wrapped so the column selection can act as a pipeline step.
title_tf = FunctionTransformer(func=title, validate=False)

In [8]:
# Titles -> token counts -> random forest regressor.
text_pipeline_rfreg = Pipeline(steps=[
    ('title_tf', title_tf),
    ('cv', cv),
    ('rf', rf),
])

# Search grid, spelled as keyword arguments (the `step__param` names are valid identifiers).
params = dict(
    cv__stop_words=[None,'english'],
    cv__ngram_range=[(1,2), (1,3), (1,4)],
    cv__max_features=[1000, 1500, 2000],
    cv__min_df=[1,2,3],
    rf__n_estimators=[10,20,30,40,50],
    rf__max_depth=[None, 10, 15, 20],
    rf__max_features=['auto', 'sqrt', 'log2'],
    rf__n_jobs=[2],
)

# Exhaustive search, then report the best CV score and its parameters.
gs = GridSearchCV(estimator=text_pipeline_rfreg, param_grid=params)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

0.41316977963716595


{'cv__max_features': 1500,
 'cv__min_df': 2,
 'cv__ngram_range': (1, 4),
 'cv__stop_words': 'english',
 'rf__max_depth': None,
 'rf__max_features': 'sqrt',
 'rf__n_estimators': 30,
 'rf__n_jobs': 2}

In [9]:
# Unfitted components for the bagging pipeline.
bag = BaggingRegressor()
cv = CountVectorizer()

In [10]:
# Target series and feature frame (title() below selects only 'Title').
y = coffee['Price']
X = coffee.loc[:, ['Price_Change', 'Rate_of_Change', 'Title']]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


def title(df):
    """Return the 'Title' column of ``df``."""
    return df['Title']


# Wrapped so the column selection can act as a pipeline step.
title_tf = FunctionTransformer(func=title, validate=False)

In [11]:
# Titles -> token counts -> bagging regressor.
text_pipeline_bag = Pipeline(steps=[
    ('title_tf', title_tf),
    ('cv', cv),
    ('bag', bag),
])

# Search grid, spelled as keyword arguments (the `step__param` names are valid identifiers).
params = dict(
    cv__stop_words=[None,'english'],
    cv__ngram_range=[(1,2), (1,3), (1,4)],
    cv__max_features=[1000, 1500, 2000],
    cv__min_df=[1,2,3],
    bag__n_estimators=[10,20,30,40,50],
    bag__max_features=[1.0, .75, .5, .25],
    bag__n_jobs=[2],
)

# Exhaustive search, then report the best CV score and its parameters.
gs = GridSearchCV(estimator=text_pipeline_bag, param_grid=params)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

0.4288228067772207


{'bag__max_features': 0.5,
 'bag__n_estimators': 50,
 'bag__n_jobs': 2,
 'cv__max_features': 1000,
 'cv__min_df': 2,
 'cv__ngram_range': (1, 3),
 'cv__stop_words': 'english'}