In [1]:
import json
from streamio import jsonstream
import pandas as pd
import numpy as np
from datetime import datetime
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import KFold
import csv
import pickle
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

  from pandas.core import datetools


In [86]:
def get_data(df):
    df = df.set_index('date')
    hour_data = df.groupby('date').resample('H')
    x = np.zeros((len(hour_data),9))
    y = np.zeros((len(hour_data)))
    
    for j,(interval,group) in enumerate(hour_data):
        x[j,0] = group.tweets.sum()
        x[j,1] = group.retweets.sum()
        x[j,2] = group.followers.sum()
        x[j,3] = interval.hour
        x[j,4] = group.influence.sum()
        x[j,5] = group.replies.sum()
        x[j,6] = group.ranking_score.sum()
        x[j,7] = group.impressions.sum()
        x[j,8] = group.fav_count.sum()
        
        y[j] = group.tweets.sum()
      
    x = np.nan_to_num(x[:-1])
    y = y[1:]
    return x, y

def cross_validation(x,y, mod):
    mae = []
   
    kf = KFold(n_splits=10, random_state=1)
    for train, test in kf.split(x):
        xtrain, xtest = x[train], x[test]
        ytrain, ytest = y[train], y[test]
        #OLS
        if mod is 'OLS':
            model = sm.OLS(ytrain,xtrain).fit()
        
        #SVM
        if mod is 'SVR':
            model = SVR(C=1.0)
            model.fit(xtrain, ytrain)
        
        #RFR
        if mod is 'RFR':
            model = RandomForestRegressor(n_estimators = 70, random_state = 42)
            model.fit(xtrain, ytrain)
        
        #print xtrain[:5,:]
        yhat = model.predict(xtest)
        error = np.abs(yhat-ytest)
        error = np.mean(error)
        mae.append(error)
    
    return np.mean(mae)
        
def get_CVerror(df, file, model):
    time1 = datetime(2015,2,1,8,0,0)
    time2 = datetime(2015,2,1,20,0,0)
    
    #before game
    df1 = df[df.date<time1]
    x,y = get_data(df1)
    mae = cross_validation(x,y, model)
    print "MAE before the active period for",file,": ",mae
    
    #during game
    df2 = df[(df.date>=time1) & (df.date<=time2)]
    x,y = get_data(df2)
    mae = cross_validation(x,y, model)
    print "MAE during the active period for",file,": ",mae
    
    #after game
    df3 = df[df.date>time2]
    x,y = get_data(df3)
    mae = cross_validation(x,y,model)
    print "MAE after the active period for ",file,": ",mae
    
    print "--------------------------------------------------------------"

In [82]:
def get_df(csv_df, count, features):
    csv_df.fillna(0)
    df = pd.DataFrame(index=range(count), columns=features)
    i=0
    for row in csv_df.itertuples(index=True):
        date = datetime.fromtimestamp(getattr(row,"citation_date"))
        df.set_value(i, 'date', date)
        df.set_value(i, 'tweets', getattr(row,"tweet_count"))
        df.set_value(i, 'retweets', getattr(row,"retweetcount"))
        df.set_value(i, 'followers', getattr(row,"follower_count"))
        df.set_value(i, 'influence', getattr(row,"influence_level"))
        df.set_value(i, 'replies', getattr(row,"replies"))
        df.set_value(i, 'ranking_score', getattr(row,"ranking_score"))
        df.set_value(i, 'impressions', getattr(row,"impressions"))
        df.set_value(i, 'fav_count', getattr(row,"favorite_count"))
        i=i+1
        
    return df

In [83]:
"""fileslist=["gohawks", "gopatriots","nfl","patriots","sb49","superbowl"]
df = pd.read_csv("csv/"+fileslist[0]+".csv")
for num in range(1,len(fileslist)):
    df1 = pd.read_csv("csv/"+fileslist[num]+".csv")
    df = df.append(df1)
    
#storing data    
fileobj = open("full_df",'wb') 
pickle.dump(df, fileobj)
fileobj.close()"""

def get_alldata():
    f = open("full_df", 'r')
    df = pickle.load(f)
    f.close()
    return df    

In [85]:
allfiles1 = {'gopatriots' : 26232}
allfiles={'gohawks':188136,'gopatriots':26232,'nfl':259024,'patriots':489713,'sb49':826951,'superbowl':1348767}
features=['date','tweets','retweets','followers','influence_level','replies','ranking_score',
          'impressions','favorite_count']
models = ['OLS', 'SVR', 'RFR']
for (file,count) in allfiles.iteritems():
    for model in models:
        print "-----------------------",file,"-",model,"--------------------------"
        csv_df = pd.read_csv("csv/"+file+".csv")
        df = get_df(csv_df, count, features)
        get_CVerror(df, file, model)


    

----------------------- superbowl - OLS --------------------------
MAE before the active period for superbowl :  315.712727525
MAE during the active period for superbowl :  965778.461457
MAE after the active period for  superbowl :  427.871479744
--------------------------------------------------------------
----------------------- superbowl - SVR --------------------------
MAE before the active period for superbowl :  431.892167019
MAE during the active period for superbowl :  120504.2
MAE after the active period for  superbowl :  589.586703297
--------------------------------------------------------------
----------------------- superbowl - RFR --------------------------
MAE before the active period for superbowl :  245.839173127
MAE during the active period for superbowl :  55543.2
MAE after the active period for  superbowl :  267.358131868
--------------------------------------------------------------
----------------------- nfl - OLS --------------------------
MAE before the activ

In [87]:
#aggregating data and using the best model
print "-----------------------full data---------------------------"
full_df = get_alldata()
df = get_df(full_df, full_df.shape[0], features)
get_CVerror(df, "full data", 'RFR')

-----------------------full data---------------------------
MAE before the active period for full data :  689.639489077
MAE during the active period for full data :  106053.14
MAE after the active period for  full data :  404.773174603
--------------------------------------------------------------
