In [1]:
%matplotlib inline
#load packages and data
import datetime
import MySQLdb
import pandas.io.sql as sql
import pandas as pd
import numpy as np
import scipy as sp
from matplotlib import pyplot as plt
import sklearn as sk
from sklearn import cross_validation, linear_model, neighbors, feature_extraction, grid_search, pipeline, metrics
import dill
import seaborn as sns
# Per the dill documentation, 'recurse' makes dill trace and pickle only the
# globals an object actually refers to, rather than the whole global dict.
# NOTE(review): presumably enabled so objects defined in this notebook can be
# serialized later -- confirm against where dill is actually used.
dill.settings['recurse']=True

In [2]:
# Open a connection to the local MySQL database and grab a cursor for queries
conn = MySQLdb.connect(
    host="localhost",
    port=3306,
    user="root",
    db="disney_db",
)
cursor = conn.cursor()

In [3]:
# Load the preprocessed hourly Disneyland rows from dl_test_clean.
cursor.execute("SELECT hod, meanwait, tweetid, hour, conditions, wind, temp, we_ho FROM dl_test_clean")
rows = cursor.fetchall()
# Name the columns at construction time (the SQL 'hod' field is exposed as
# 'timestamp'). Unlike renaming positional labels afterwards, this keeps the
# expected schema even when the query returns no rows, and it avoids copying
# every cell into a nested list first.
dl_df = pd.DataFrame(list(rows),
                     columns=['timestamp', 'meanwait', 'tweetid', 'hour',
                              'conditions', 'wind', 'temp', 'we_ho'])
dl_df.head()

Unnamed: 0,timestamp,meanwait,tweetid,hour,conditions,wind,temp,we_ho
0,2015-08-04 14:00:00,37.672414,12,14,Clear,6.9,89.1,0
1,2015-08-04 15:00:00,36.293103,13,15,Clear,5.8,89.1,0
2,2015-08-04 16:00:00,36.37931,17,16,Clear,8.1,87.1,0
3,2015-08-04 17:00:00,38.448276,8,17,Clear,6.9,82.0,0
4,2015-08-04 18:00:00,35.229885,13,18,Clear,4.6,78.1,0


In [4]:
# Load the preprocessed hourly California Adventure rows from ca_test_clean.
cursor.execute("SELECT hod, meanwait, tweetid, hour, conditions, wind, temp, we_ho FROM ca_test_clean")
rows = cursor.fetchall()
# Name the columns at construction time (the SQL 'hod' field is exposed as
# 'timestamp'). Unlike renaming positional labels afterwards, this keeps the
# expected schema even when the query returns no rows, and it avoids copying
# every cell into a nested list first.
ca_df = pd.DataFrame(list(rows),
                     columns=['timestamp', 'meanwait', 'tweetid', 'hour',
                              'conditions', 'wind', 'temp', 'we_ho'])
ca_df.head()

Unnamed: 0,timestamp,meanwait,tweetid,hour,conditions,wind,temp,we_ho
0,2015-08-04 14:00:00,34.318182,12,14,Clear,6.9,89.1,0
1,2015-08-04 15:00:00,34.431818,13,15,Clear,5.8,89.1,0
2,2015-08-04 16:00:00,34.772727,17,16,Clear,8.1,87.1,0
3,2015-08-04 17:00:00,33.920455,8,17,Clear,6.9,82.0,0
4,2015-08-04 18:00:00,33.409091,13,18,Clear,4.6,78.1,0


In [5]:
# All rows are already in DataFrames, so release the database resources.
# Close the cursor first, then the connection that owns it.
cursor.close()
conn.close()

In [6]:
# Stack the two parks into one training frame. The concat keys form an outer
# index level; reset_index() moves the index levels into the 'level_0' and
# 'level_1' columns, with 'level_0' carrying the park name.
result = pd.concat([dl_df, ca_df], keys=['Disneyland', 'California Adventure']).reset_index()

# One-hot encode the park name and the weather conditions, then discard the
# source columns together with the leftover index column.
result = pd.concat(
    [result, pd.get_dummies(result['level_0']), pd.get_dummies(result['conditions'])],
    axis=1,
).drop(['level_0', 'level_1', 'conditions'], axis=1)

# Mean-centre each listed column and divide it by its range (max - min).
cols_to_norm = ['meanwait', 'tweetid']
result[cols_to_norm] = result[cols_to_norm].apply(
    lambda col: (col - col.mean()) / (col.max() - col.min())
)
result.head()

Unnamed: 0,timestamp,meanwait,tweetid,hour,wind,temp,we_ho,California Adventure,Disneyland,Clear,Haze,Mostly Cloudy,Overcast,Partly Cloudy,Scattered Clouds
0,2015-08-04 14:00:00,0.54889,-0.095081,14,6.9,89.1,0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,2015-08-04 15:00:00,0.513016,-0.074672,15,5.8,89.1,0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,2015-08-04 16:00:00,0.515258,0.00696,16,8.1,87.1,0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
3,2015-08-04 17:00:00,0.56907,-0.176713,17,6.9,82.0,0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,2015-08-04 18:00:00,0.485363,-0.074672,18,4.6,78.1,0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


In [None]:
class ColumnSelector(sk.base.BaseEstimator, sk.base.TransformerMixin):
    """Stateless transformer that keeps only the named columns of each row.

    Parameters
    ----------
    column_names : str or sequence of str
        Column(s) to extract. A single name may be given as a bare string;
        it is treated as a one-element list rather than being iterated
        character by character.
    """

    def __init__(self, column_names):
        # Stored untouched; any normalisation happens in transform().
        self.column_names = column_names

    def fit(self, X, y=None):
        """Nothing to learn; present so the class works inside a Pipeline."""
        return self

    def transform(self, X):
        """Return one list of selected values per input row.

        X may be a pandas DataFrame or any iterable of rows that can be
        indexed by column name (e.g. a list of dicts). The result is a list
        of lists ordered like ``column_names``.
        """
        names = self.column_names
        if isinstance(names, str):
            names = [names]
        if isinstance(X, pd.DataFrame):
            # Iterating a DataFrame yields column labels, not rows, so select
            # the columns directly and convert to plain Python lists.
            return X[list(names)].values.tolist()
        return [[row[name] for name in names] for row in X]
    
# NOTE(review): none of the frames loaded above has 'latitude'/'longitude'
# columns and this list is not used in this cell -- confirm it is still needed.
colnamelist = ['latitude', 'longitude']
# Extract the target column from the merged frame. ColumnSelector.transform
# indexes each row by column name, so hand it one dict per row.
# NOTE(review): assumes `result` (the merged frame) is the intended input and
# that the target column is 'meanwait' -- confirm.
y1 = ColumnSelector(['meanwait']).transform(result.to_dict('records'))

In [None]:
class NumericTransformer(sk.base.BaseEstimator, sk.base.TransformerMixin):
    """Mean-centre numeric data and scale it by its range (max - min).

    The centre and range are learned in ``fit`` and reused in ``transform``,
    so data transformed later is scaled with the statistics of the data the
    transformer was fitted on. If ``transform`` is called on an instance that
    was never fitted, it falls back to computing the statistics from the data
    it is given.

    Note: a constant input has a range of zero, so the division yields
    NaN/inf for it.
    """

    def __init__(self):
        # No hyper-parameters to store.
        pass

    def fit(self, X, y=None):
        """Learn the mean and range of X; returns self."""
        self.mean_ = X.mean()
        self.range_ = X.max() - X.min()
        return self

    def transform(self, X):
        """Return (X - mean) / range using the fitted statistics."""
        if hasattr(self, 'mean_'):
            center, spread = self.mean_, self.range_
        else:
            # Unfitted instance: use X's own statistics.
            center, spread = X.mean(), X.max() - X.min()
        X_norm = (X - center) / spread
        return X_norm

In [None]:
class RowDictEncoder(sk.base.BaseEstimator, sk.base.TransformerMixin):
    """Turn bare value lists back into {column name: value} dicts.

    ColumnSelector emits one list of values per row, while DictVectorizer
    consumes mappings; this step bridges the two.
    """

    def __init__(self, column_names):
        self.column_names = column_names

    def fit(self, X, y=None):
        """Nothing to learn; present so the class works inside a Pipeline."""
        return self

    def transform(self, X):
        """Pair each row's values with ``column_names``, in order."""
        return [dict(zip(self.column_names, row)) for row in X]


# Categorical branch: pick the weather column and one-hot encode it.
# The column name goes in a list -- a bare string would be iterated one
# character at a time by ColumnSelector.
cat_columns = ['conditions']

# NOTE(review): 'conditions' has already been dropped from `result`, so the
# pipeline is fitted on the raw per-park rows instead -- confirm this is the
# intended training input.
cat_rows = pd.concat([dl_df, ca_df]).to_dict('records')

catpipe = pipeline.Pipeline([('colsel', ColumnSelector(cat_columns)),
                    ('todict', RowDictEncoder(cat_columns)),
                    ('dict', feature_extraction.DictVectorizer())])

catpipe.fit(cat_rows)