In [155]:
%matplotlib inline
#load packages and data
import datetime
import MySQLdb
import pandas.io.sql as sql
import pandas as pd
import numpy as np
import scipy as sp
from matplotlib import pyplot as plt
import sklearn as sk
from sklearn import cross_validation, linear_model, neighbors, feature_extraction, grid_search, pipeline, metrics, ensemble
import dill
import seaborn as sns
dill.settings['recurse']=True

In [48]:
#get data from db
conn = MySQLdb.connect(host="localhost", port=3306, user="root", db="disney_db") #make db connection
cursor = conn.cursor()

In [49]:
#import preprocessed data for Disneyland
cursor.execute("SELECT hod, meanwait, tweetid, hour, conditions, wind, temp, we_ho FROM dl_test_clean")
rows = cursor.fetchall()
dl_df = pd.DataFrame( [[ij for ij in i] for i in rows] )
dl_df.rename(columns={0: 'timestamp', 1: 'meanwait', 2: 'tweetid',
                     3:'hour', 4:'conditions', 5:'wind', 6:'temp',
                     7:'we_ho'}, inplace=True)
dl_df.head()

Unnamed: 0,timestamp,meanwait,tweetid,hour,conditions,wind,temp,we_ho
0,2015-08-04 14:00:00,37.672414,12,14,Clear,6.9,89.1,0
1,2015-08-04 15:00:00,36.293103,13,15,Clear,5.8,89.1,0
2,2015-08-04 16:00:00,36.37931,17,16,Clear,8.1,87.1,0
3,2015-08-04 17:00:00,38.448276,8,17,Clear,6.9,82.0,0
4,2015-08-04 18:00:00,35.229885,13,18,Clear,4.6,78.1,0


In [50]:
#import preprocessed data for California Adventure
cursor.execute("SELECT hod, meanwait, tweetid, hour, conditions, wind, temp, we_ho FROM ca_test_clean")
rows = cursor.fetchall()
ca_df = pd.DataFrame( [[ij for ij in i] for i in rows] )
ca_df.rename(columns={0: 'timestamp', 1: 'meanwait', 2: 'tweetid',
                     3:'hour', 4:'conditions', 5:'wind', 6:'temp',
                     7:'we_ho'}, inplace=True)
ca_df.head()

Unnamed: 0,timestamp,meanwait,tweetid,hour,conditions,wind,temp,we_ho
0,2015-08-04 14:00:00,34.318182,12,14,Clear,6.9,89.1,0
1,2015-08-04 15:00:00,34.431818,13,15,Clear,5.8,89.1,0
2,2015-08-04 16:00:00,34.772727,17,16,Clear,8.1,87.1,0
3,2015-08-04 17:00:00,33.920455,8,17,Clear,6.9,82.0,0
4,2015-08-04 18:00:00,33.409091,13,18,Clear,4.6,78.1,0


In [51]:
#close sql (don't need it anymore)
conn.close()

In [121]:
#merge for training, one hot encode and normalize
result = pd.concat([dl_df, ca_df], keys=['Disneyland', 'California Adventure']).reset_index()

#one hot encoding
result = pd.concat([result, pd.get_dummies(result.level_0), pd.get_dummies(result.conditions)], axis=1)
result.drop(['level_0', 'level_1', 'conditions'], axis=1, inplace=True)

#normalize numericals
cols_to_norm = ['meanwait','tweetid']
result[cols_to_norm] = result[cols_to_norm].apply(lambda x: (x - x.mean()) / (x.max() - x.min()))

#add squares of hod, temp, and wind
result['temp2'] = result['temp']**2
result['hour2'] = result['hour']**2
result['wind2'] = result['wind']**2

#convert to dict
data = result.T.to_dict().values()
data[0]

{'California Adventure': 0.0,
 'Clear': 1.0,
 'Disneyland': 1.0,
 'Haze': 0.0,
 'Mostly Cloudy': 0.0,
 'Overcast': 0.0,
 'Partly Cloudy': 0.0,
 'Scattered Clouds': 0.0,
 'hour': 14,
 'hour2': 196,
 'meanwait': 0.5488904200224402,
 'temp': 89.1,
 'temp2': 7938.809999999999,
 'timestamp': Timestamp('2015-08-04 14:00:00'),
 'tweetid': -0.09508062484252958,
 'we_ho': 0,
 'wind': 6.9,
 'wind2': 47.61000000000001}

In [131]:
class ColumnSelector(sk.base.BaseEstimator, sk.base.TransformerMixin):
    def __init__(self, column_names): #initialize
        self.column_names = column_names

    def fit(self, X, y=None): #fit the transformation, optional here
        return self

    def transform(self, X):
        return [[x[column] for column in self.column_names] for x in X]


columns = sorted(data[0].keys())
knnpredictors = [columns[i] for i in [0,1,2,3,4,5,6,7,8,11,15,16]]
linpredictors = [columns[i] for i in [0,1,2,3,4,5,6,7,8,9,11,12,15,16,17]]
meanwaits = ColumnSelector(['meanwait']).transform(data) 
tweetid = ColumnSelector(['tweetid']).transform(data) #but should do this again with the full dataset

In [102]:
class SquareTransformer(sk.base.BaseEstimator, sk.base.TransformerMixin):
    def __init__(self, column_names): #initialize
        self.column_names = column_names

    def fit(self, X, y=None): #fit the transformation, optional here
        return self

    def transform(self, X):
        return [[x[column] for column in self.column_names] for x in X]

In [132]:
#knn model

#knnpipe = pipeline.Pipeline([('colsel', ColumnSelector(predictors)),
#                             ('est', sk.neighbors.KNeighborsRegressor())])

#parameters = dict(est__n_neighbors=range(1, 101, 5))
#knn_cv = sk.grid_search.GridSearchCV(knnpipe, param_grid=parameters)
#testfit = knn_cv.fit(data, meanwaits)
#knn_cv.best_params_

knnpipe = pipeline.Pipeline([('colsel', ColumnSelector(knnpredictors)),
                             ('est', sk.neighbors.KNeighborsRegressor(n_neighbors=6))])
knnpipe.fit(data, tweetid)
knnpipe.score(data, tweetid)

0.50026424835973893

In [148]:
linpipe = pipeline.Pipeline([('colsel', ColumnSelector(linpredictors)),
                             ('linreg', linear_model.LinearRegression())])

#loo = cross_validation.KFold(len(meanwaits), 10, shuffle=True)
#scores = cross_validation.cross_val_score(linpipe, data, meanwaits, cv=loo)
#scores.mean()

linpipe.fit(data, meanwaits)
linpipe.score(data, meanwaits)

0.37933394491617256

In [215]:
class EnsembleRegressor(sk.base.BaseEstimator, sk.base.RegressorMixin):
    """Joins a linear, random forest, and nearest neighbors model."""
    def __init__(self, neighbors, samples):
        self.neighbors = neighbors
        self.samples = samples
        pass
    
    def fit(self, X, y):
        self.linear_regression = linear_model.LinearRegression().fit(X, y)
        y_err = y - self.linear_regression.predict(X)

        self.nearest_neighbors = neighbors.KNeighborsRegressor(n_neighbors=self.neighbors).fit(X, y_err)
        self.random_forest = ensemble.RandomForestRegressor(min_samples_leaf=self.samples).fit(X, y_err)

        X_ensemble = pd.DataFrame({
            "NEAR": self.nearest_neighbors.predict(X),
            "FOREST": self.random_forest.predict(X),
            "LINEAR": self.linear_regression.predict(X),
        })
        self.ensemble_regression = linear_model.LinearRegression().fit(X_ensemble, y)
        return self
    
    def predict(self, X):
        X_ensemble = pd.DataFrame({
            "NEAR": self.nearest_neighbors.predict(X),
            "FOREST": self.random_forest.predict(X),
            "LINEAR": self.linear_regression.predict(X),
        })
        return self.ensemble_regression.predict(X_ensemble)

In [218]:
#neighbors = 6
#samples = 20
nestedreg = pipeline.Pipeline([('colsel', ColumnSelector(linpredictors)),
                               ('est', EnsembleRegressor(neighbors, samples))])  

y1d = [item for sublist in meanwaits for item in sublist]
#nestedreg.fit(data, y1d)
#nestedreg.score(data, y1d)    

nestedreg.get_params().keys()
parameters = dict(est__neighbors=range(1,101,5), est__samples=range(1,32,3))
nested_cv = sk.grid_search.GridSearchCV(nestedreg, param_grid=parameters)
nested_cv.fit(data, y1d)
#nested_cv.best_params_


#residual_regressor_performance = pd.DataFrame([
    #("Ensemble Regressor", compute_error(EnsembleRegressor(), X, y))
    #], columns=["Model", "MSE"])
#model_performance=model_performance.append(residual_regressor_performance)
#model_performance.plot(x="Model", y="MSE", kind="Bar")

AttributeError: 'int' object has no attribute 'KNeighborsRegressor'

In [None]:
#TWEET MODEL WILL COME LATER
#knnpipe.fit(data, tweetid)
#knnpipe.score(data, tweetid)
#linpipe.fit(data, tweetid)
#linpipe.score(data, tweetid)