In [1]:
%matplotlib inline
#load packages and data
import datetime
import holidays
import re #regular expressions
import MySQLdb
import pandas.io.sql as sql
import pandas as pd
import numpy as np
import scipy.stats
from matplotlib import pyplot as plt
import sklearn as sk
from sklearn import preprocessing, cross_validation, linear_model, neighbors, feature_extraction, grid_search, pipeline, metrics, ensemble
import seaborn as sns
import dill

In [2]:
#get current twitter data from db
conn = MySQLdb.connect(host="localhost", port=3306, user="root", db="disney_db") #make db connection
cursor = conn.cursor()
cursor.execute("SELECT id, user, timestamp, search, location FROM twitter_data_copy WHERE location = 'anaheim'")
rows = cursor.fetchall()
tweetdf = pd.DataFrame( [[ij for ij in i] for i in rows] )
tweetdf.rename(columns={0: 'tweetid', 1: 'user', 2: 'timestamps', 3: 'search', 4: 'location'}, inplace=True)
tweetdf.search = tweetdf.search.apply(lambda x: x.title())
print tweetdf.head()

   tweetid              user          timestamps      search location
0       51  @catcherintherei 2015-08-01 16:55:00  Disneyland  anaheim
1       52      @LambdaPanda 2015-08-01 16:53:00  Disneyland  anaheim
2       53        @rubyray92 2015-08-01 16:46:00  Disneyland  anaheim
3       54         @LovezDee 2015-08-01 16:46:00  Disneyland  anaheim
4       55    @TheEricGraham 2015-08-01 16:34:00  Disneyland  anaheim


In [3]:
#now bin tweets by hour
tweetdf['hod'] = [datetime.datetime(dt.year, dt.month, dt.day, dt.hour) for dt in tweetdf.timestamps]
tweetdf['hour'] = [dt.hour for dt in tweetdf.timestamps]
tweetdftrim = tweetdf[tweetdf.hour >= 8]
tweetdftrim = tweetdftrim[tweetdftrim.hour <= 22]
tweets_per_hour = pd.DataFrame(tweetdftrim.groupby(['search','hod'])['user'].count()).reset_index()
print tweets_per_hour.head()

                 search                 hod  user
0  California Adventure 2014-12-31 16:00:00     3
1  California Adventure 2014-12-31 17:00:00     2
2  California Adventure 2014-12-31 18:00:00     7
3  California Adventure 2014-12-31 19:00:00     2
4  California Adventure 2014-12-31 20:00:00     2


In [4]:
#import weather data
cursor.execute("SELECT TemperatureF, Wind_SpeedMPH, PrecipitationIN, Conditions, DateUTC FROM anaheim_weather WHERE DateUTC > '2014-12-30'")
wrows = cursor.fetchall()
weatherdf = pd.DataFrame( [[ij for ij in i] for i in wrows] )
weatherdf.rename(columns={0: 'temp', 1:'wind', 2:'precip', 3:'conditions', 4:'datetimeUTC'}, inplace=True)

In [5]:
#fix time zone on weather data and process
weatherdf['datetimeUTC'] =  pd.to_datetime(weatherdf['datetimeUTC'], format='%Y-%m-%d %H:%M:%S.')
weatherdf['timestamp'] = weatherdf['datetimeUTC'] - datetime.timedelta(hours=8)
weatherdf['hod'] = [datetime.datetime(dt.year, dt.month, dt.day, dt.hour) for dt in weatherdf.timestamp] 
weatherdf.wind.replace(-9999.0, float('NaN'), inplace=True)
weatherdf.temp.replace(-9999.0, float('NaN'), inplace=True)
hourly_weather = pd.DataFrame(weatherdf.groupby(['hod']).agg({'temp': np.nanmean, 'wind': np.nanmean, 'conditions': 'first'})).reset_index()



In [6]:
#join weather and twitter data
tweetwaits = pd.merge(tweets_per_hour, hourly_weather, on='hod')
tweetwaits.head()
#tweetwaits['hour'] = [dt.hour for dt in tweetwaits.hod]

Unnamed: 0,search,hod,user,conditions,wind,temp
0,California Adventure,2014-12-31 16:00:00,3,Clear,5.8,52.0
1,Disneyland,2014-12-31 16:00:00,11,Clear,5.8,52.0
2,California Adventure,2014-12-31 17:00:00,2,Clear,3.5,51.1
3,Disneyland,2014-12-31 17:00:00,21,Clear,3.5,51.1
4,California Adventure,2014-12-31 18:00:00,7,Clear,0.0,48.0


In [7]:
#get business days and holidays
tweetwaits['business_day'] = [dt.weekday() >= 5 for dt in tweetwaits.hod]
us_holidays = holidays.UnitedStates()
tweetwaits['holiday'] = [day in us_holidays for day in tweetwaits.hod]
tweetwaits.head()

Unnamed: 0,search,hod,user,conditions,wind,temp,business_day,holiday
0,California Adventure,2014-12-31 16:00:00,3,Clear,5.8,52.0,False,False
1,Disneyland,2014-12-31 16:00:00,11,Clear,5.8,52.0,False,False
2,California Adventure,2014-12-31 17:00:00,2,Clear,3.5,51.1,False,False
3,Disneyland,2014-12-31 17:00:00,21,Clear,3.5,51.1,False,False
4,California Adventure,2014-12-31 18:00:00,7,Clear,0.0,48.0,False,False


In [8]:
#close SQL connection
conn.close()

In [30]:
#one hot encoding
result = pd.concat([tweetwaits, pd.get_dummies(tweetwaits.search), pd.get_dummies(tweetwaits.conditions)], axis=1)
result.reset_index(inplace=True)
result.drop('index', axis=1, inplace=True)
result = result.dropna()
result.head()

Unnamed: 0,search,hod,user,conditions,wind,temp,business_day,holiday,California Adventure,Disneyland,...,Fog,Haze,Heavy Rain,Light Rain,Mostly Cloudy,Overcast,Partly Cloudy,Rain,Scattered Clouds,Unknown
0,California Adventure,2014-12-31 16:00:00,3,Clear,5.8,52.0,False,False,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Disneyland,2014-12-31 16:00:00,11,Clear,5.8,52.0,False,False,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,California Adventure,2014-12-31 17:00:00,2,Clear,3.5,51.1,False,False,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Disneyland,2014-12-31 17:00:00,21,Clear,3.5,51.1,False,False,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,California Adventure,2014-12-31 18:00:00,7,Clear,0.0,48.0,False,False,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
#normalize numericals
tweeters = result['user']

###OTHER PARKS: KNOTT'S BERRY FARM, LEGOLAND CALIFORNIA RESORT (CALRSBAD), UNIVERSAL STUDIOS (HOLLYWOOD)
###HERE CALCULATE TWEETS PER ACRE OF THE PARK: Disney 85, CA 67, Knott's 160, Legoland 128, US 415
CAm2 = 67.
DLm2 = 85.

result['size'] = np.where(result['search']=='Disneyland', 85, 67)
result['tweetsperacre'] = result['user']/ result['size']

meant = result['tweetsperacre'].mean()
stdt = result['tweetsperacre'].std()
#result['tweetsnormed'] = result['tweetsperacre'].apply(lambda x: ((x - meant) / stdt))

#add squares of hod, temp, and wind
result['hour'] = [dt.hour for dt in result.hod]
result['temp2'] = result['temp']**2
result['hour2'] = result['hour']**2
result['wind2'] = result['wind']**2

hourly_averages = pd.DataFrame(result.groupby(['hour', 'search']).agg({'user':np.mean, 'size': 'first'})).reset_index()
hourly_averages['peracre'] = hourly_averages['user']/hourly_averages['size']
hourly_averages.drop(['user', 'size'], axis=1, inplace=True)
hour_averages = hourly_averages.T.to_dict().values()
hour_averages
dill.dump(hour_averages, open('hourly_averages.pkl', 'w'))

In [10]:
result.drop(['conditions', 'search'], axis=1, inplace=True)
data = result.T.to_dict().values()
data[0]

{'California Adventure': 1.0,
 'Clear': 1.0,
 'Disneyland': 0.0,
 'Fog': 0.0,
 'Haze': 0.0,
 'Heavy Rain': 0.0,
 'Light Rain': 0.0,
 'Mostly Cloudy': 0.0,
 'Overcast': 0.0,
 'Partly Cloudy': 0.0,
 'Rain': 0.0,
 'Scattered Clouds': 0.0,
 'Unknown': 0.0,
 'business_day': False,
 'hod': Timestamp('2014-12-31 16:00:00'),
 'holiday': False,
 'hour': 16,
 'hour2': 256,
 'size': 67,
 'temp': 52.0,
 'temp2': 2704.0,
 'tweetsperacre': 0.04477611940298507,
 'user': 3,
 'wind': 5.8,
 'wind2': 33.64}

In [12]:
#model definition

class ColumnSelector(sk.base.BaseEstimator, sk.base.TransformerMixin):
    def __init__(self, column_names): #initialize
        self.column_names = column_names

    def fit(self, X, y=None): #fit the transformation, optional here
        return self

    def transform(self, X):
        return [[x[column] for column in self.column_names] for x in X]


columns = sorted(data[0].keys())
#knnpredictors = ['Clear', 'Fog', 'Haze', 'Heavy Rain', 'Light Rain', 'Mist', 
#                 'Mostly Cloudy', 'Overcast', 'Partly Cloudy', 'Rain', 'Scattered Clouds', 
#                 'business_day', 'California Adventure', 'Disneyland', 'holiday', 'hour', 'temp', 'wind']
linpredictors = ['Clear', 'Fog', 'Haze', 'Heavy Rain', 'Light Rain', 
                 'Mostly Cloudy', 'Overcast', 'Partly Cloudy', 'Rain', 'Scattered Clouds', 
                 'business_day', 'California Adventure', 'Disneyland', 'holiday', 'hour', 
                 'hour2', 'temp', 'temp2', 'wind', 'wind2']

y = result.tweetsperacre.as_matrix()
print y

[ 0.04477612  0.12941176  0.02985075 ...,  0.02352941  0.02352941
  0.01176471]


In [None]:
result.to_csv("hourly_data.csv", na_rep='NA')

In [13]:
class EnsembleRegressor(sk.base.BaseEstimator, sk.base.RegressorMixin):
    """Joins a linear, random forest, and nearest neighbors model."""
    def __init__(self, nbrs, samples):
        self.nbrs = nbrs
        self.samples = samples
        pass
    
    def fit(self, X, y):
        self.linear_regression = linear_model.LinearRegression().fit(X, y)
        y_err = y - self.linear_regression.predict(X)

        self.nearest_neighbors = neighbors.KNeighborsRegressor(n_neighbors=self.nbrs).fit(X, y_err)
        self.random_forest = ensemble.RandomForestRegressor(min_samples_leaf=self.samples).fit(X, y_err)

        X_ensemble = pd.DataFrame({
            "NEAR": self.nearest_neighbors.predict(X),
            "FOREST": self.random_forest.predict(X),
            "LINEAR": self.linear_regression.predict(X),
        })
        self.ensemble_regression = linear_model.LinearRegression().fit(X_ensemble, y)
        return self
    
    def predict(self, X):
        X_ensemble = pd.DataFrame({
            "NEAR": self.nearest_neighbors.predict(X),
            "FOREST": self.random_forest.predict(X),
            "LINEAR": self.linear_regression.predict(X),
        })
        return self.ensemble_regression.predict(X_ensemble)

In [14]:
nbrs = 5 
samples = 5
nestedreg = pipeline.Pipeline([('colsel', ColumnSelector(linpredictors)),
                               #('poly', preprocessing.PolynomialFeatures(2)),
                               ('est', EnsembleRegressor(nbrs, samples))])  

In [16]:
#nestedreg.get_params().keys()
parameters = dict(est__nbrs=range(90,122,3), est__samples=range(30,61,3))
nested_cv = sk.grid_search.GridSearchCV(nestedreg, param_grid=parameters)
nested_cv.fit(data, y)
nested_cv.best_params_

{'est__nbrs': 105, 'est__samples': 57}

In [None]:
nested_cv.score(data, y)    

In [17]:
nbrs = 105
samples = 57
tweet_reg = pipeline.Pipeline([('colsel', ColumnSelector(linpredictors)),
                               #('poly', preprocessing.PolynomialFeatures(2)),
                               ('est', EnsembleRegressor(nbrs, samples))])  

#run on all the data with the good alpha
loo = cross_validation.ShuffleSplit(len(y), n_iter=50, test_size=0.25)
scores = cross_validation.cross_val_score(tweet_reg, data, y, cv=loo)
#scores.mean()
print scores

[ 0.60433435  0.59243611  0.60668579  0.61545149  0.58082615  0.603613
  0.60332038  0.61060583  0.60284392  0.55816121  0.61759524  0.59501898
  0.59030809  0.62290686  0.6249765   0.60416887  0.60804957  0.59422897
  0.61215561  0.61297409  0.60292351  0.59611196  0.60148523  0.59261496
  0.60842046  0.60558821  0.61806513  0.61662511  0.5929643   0.62116854
  0.60814463  0.61525332  0.57543601  0.61815714  0.61183754  0.62774625
  0.61102822  0.62475083  0.60045772  0.62690227  0.58737317  0.60094312
  0.59380294  0.60105217  0.60520236  0.58957819  0.6157756   0.62139555
  0.59857291  0.60527059]


In [18]:
tweet_reg.fit(data,y)
dill.dump(tweet_reg, open('hourly_model.pkl', 'w'))

In [20]:
tweet_reg.score(data, y)

0.62792734060825983