In [2]:
import pandas as pd
import datetime
import numpy as np
import sklearn as sk
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
import xgboost as xgb

In [3]:
data = pd.read_csv('Data_For_Modeling/esketit2.csv', encoding = "ISO-8859-1")

## Feature Function Definitions ##

In [4]:
def seasonTime(date): 
    if pd.Timestamp('2019-6-14') > date >= pd.Timestamp('2019-4-13'): 
        return 'Playoffs' 
    elif pd.Timestamp('2019-4-13')> date > pd.Timestamp('2019-2-17'):
        return 'PostAllStarBreak'
    elif pd.Timestamp('2019-2-17')>= date >= pd.Timestamp('2019-2-15'):
        return 'AllStarBreak'
    elif pd.Timestamp('2019-2-15')> date >= pd.Timestamp('2018-10-16'):
        return 'BeforeAllStarBreak'
    elif pd.Timestamp('2018-6-8') > date >= pd.Timestamp('2018-4-14'): 
        return 'Playoffs' 
    elif pd.Timestamp('2018-4-14')> date > pd.Timestamp('2018-2-18'):
        return 'PostAllStarBreak'
    elif pd.Timestamp('2018-2-18')>= date >= pd.Timestamp('2018-2-16'):
        return 'AllStarBreak'
    elif pd.Timestamp('2018-2-16')> date >= pd.Timestamp('2017-10-17'):
        return 'BeforeAllStarBreak'
    elif pd.Timestamp('2017-6-12') > date >= pd.Timestamp('2017-4-15'): 
        return 'Playoffs' 
    elif pd.Timestamp('2017-4-15')> date > pd.Timestamp('2017-2-19'):
        return 'PostAllStarBreak'
    elif pd.Timestamp('2017-2-19')>= date >= pd.Timestamp('2017-2-17'):
        return 'AllStarBreak'
    elif pd.Timestamp('2017-2-17')> date >= pd.Timestamp('2016-10-25'):
        return 'BeforeAllStarBreak'
    else: return 'Offseason'

In [5]:
def vidAllStar(row): 
    if row['SeasonTime'] == 'AllStarBreak' and row['Type'] == 'Video': 
        return 1
    else:
        return 0 

In [6]:
def nonvidOffseason(row): 
    if row['SeasonTime'] == 'Offseason' and row['Type'] != 'Video': 
        return 1
    else:
        return 0 

In [7]:
data['Created']= pd.to_datetime(data['Created'])
data['SeasonTime'] = data.Created.map(lambda a: seasonTime(a))

In [8]:
data['vidAllStar'] = data.apply(lambda row: vidAllStar(row), axis =1 )

In [9]:
data['nonvidOffseason'] = data.apply(lambda row: nonvidOffseason(row), axis =1 )

In [10]:
x= pd.get_dummies(data['Type'])
x['Not_Video'] = x['Album']+x['Photo']
data = data.join(x.drop(['Album','Photo'],axis = 1))
data = data.drop(['shoutouts','hashtags','monthDateYear','Created','Type','Description'],axis = 1)
data = data.join(pd.get_dummies(data['timeOfDay']))
data = data.join(pd.get_dummies(data['DayOfWeek']))
data = data.join(pd.get_dummies(data['month']))
data = data.join(pd.get_dummies(data['SeasonTime']))
data = data.drop(['DayOfWeek','timeOfDay','hour','month','SeasonTime'],axis = 1)

In [11]:
data.columns.tolist()

['Engagements',
 'NBA_Follower_Count',
 'Len_Desc',
 'Tagged_Count',
 'playOffDate',
 'propOfSpecialTexts',
 'teamTagged',
 'otherTagged',
 'top3Tagged',
 'bottom3Tagged',
 'hasHashtag',
 'postsInThatDay',
 'vidAllStar',
 'nonvidOffseason',
 'Video',
 'Not_Video',
 'Afternoon',
 'Evening',
 'Morning',
 'Night',
 'Friday',
 'Monday',
 'Saturday',
 'Sunday',
 'Thursday',
 'Tuesday',
 'Wednesday',
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 'AllStarBreak',
 'BeforeAllStarBreak',
 'Offseason',
 'Playoffs',
 'PostAllStarBreak']

In [12]:
data['Tagged_Count^2'] = data['Tagged_Count']*data['Tagged_Count']

In [13]:
standardize = data[['NBA_Follower_Count', 'Len_Desc', 'Tagged_Count','Tagged_Count^2',
         'propOfSpecialTexts','postsInThatDay']]
standardize

Unnamed: 0,NBA_Follower_Count,Len_Desc,Tagged_Count,Tagged_Count^2,propOfSpecialTexts,postsInThatDay
0,36984682,95,4,16,0.266667,5
1,36984682,64,2,4,0.166667,5
2,36984682,46,2,4,0.250000,5
3,36984682,43,0,0,0.000000,5
4,36984682,57,1,1,0.100000,5
5,36955156,104,2,4,0.133333,19
6,36955156,106,1,1,0.176471,19
7,36955156,88,2,4,0.176471,19
8,36955156,75,2,4,0.181818,19
9,36955156,121,2,4,0.176471,19


In [14]:
dummy = data[['Engagements','teamTagged',
 'otherTagged',
 'top3Tagged',
 'bottom3Tagged',
 'hasHashtag',
 'vidAllStar',
 'nonvidOffseason',
 'Video',
 'Not_Video',
 'Afternoon',
 'Evening',
 'Morning',
 'Night',
 'Friday',
 'Monday',
 'Saturday',
 'Sunday',
 'Thursday',
 'Tuesday',
 'Wednesday',
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 'AllStarBreak',
 'BeforeAllStarBreak',
 'Offseason',
 'Playoffs',
 'PostAllStarBreak']]

In [15]:
len(dummy.columns.tolist())

38

In [16]:
X = preprocessing.scale(standardize)
Y = pd.DataFrame(X).rename(columns={0:'NBA_Follower_Count', 1:'Len_Desc', 2:'Tagged_Count',3:'Tagged_Count^2',
       4:'propOfSpecialTexts',5:'postsInThatDay'})
data_post_standardize= Y.join(dummy)

In [17]:
data_post_standardize['hasHashtag'] = data_post_standardize['hasHashtag'].astype(int)
data_post_standardize

Unnamed: 0,NBA_Follower_Count,Len_Desc,Tagged_Count,Tagged_Count^2,propOfSpecialTexts,postsInThatDay,Engagements,teamTagged,otherTagged,top3Tagged,...,8,9,10,11,12,AllStarBreak,BeforeAllStarBreak,Offseason,Playoffs,PostAllStarBreak
0,1.749128,0.698583,2.216012,2.522685,0.003876,-1.651685,502093,1,1,0,...,0,0,0,0,0,0,0,0,1,0
1,1.749128,-0.136075,0.436787,0.092998,-0.543573,-1.651685,603380,1,1,0,...,0,0,0,0,0,0,0,0,1,0
2,1.749128,-0.620715,0.436787,0.092998,-0.087365,-1.651685,603380,1,1,0,...,0,0,0,0,0,0,0,0,1,0
3,1.749128,-0.701488,-1.342437,-0.716897,-1.455990,-1.651685,725100,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1.749128,-0.324546,-0.452825,-0.514423,-0.908540,-1.651685,661446,0,1,0,...,0,0,0,0,0,0,0,0,1,0
5,1.741252,0.940903,0.436787,0.092998,-0.726057,0.348201,322444,0,1,0,...,0,0,0,0,0,0,0,0,1,0
6,1.741252,0.994752,-0.452825,-0.514423,-0.489902,0.348201,722540,1,0,0,...,0,0,0,0,0,0,0,0,1,0
7,1.741252,0.510112,0.436787,0.092998,-0.489902,0.348201,339265,1,0,0,...,0,0,0,0,0,0,0,0,1,0
8,1.741252,0.160094,0.436787,0.092998,-0.460626,0.348201,443330,1,0,0,...,0,0,0,0,0,0,0,0,1,0
9,1.741252,1.398619,0.436787,0.092998,-0.489902,0.348201,652193,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [18]:
data_post_standardize.corr()['Engagements']

NBA_Follower_Count   -0.053031
Len_Desc             -0.274011
Tagged_Count         -0.152747
Tagged_Count^2       -0.216684
propOfSpecialTexts    0.050351
postsInThatDay        0.069409
Engagements           1.000000
teamTagged           -0.223341
otherTagged           0.194495
top3Tagged            0.308200
bottom3Tagged        -0.163549
hasHashtag           -0.148315
vidAllStar            0.073051
nonvidOffseason      -0.365660
Video                 0.861826
Not_Video            -0.861826
Afternoon            -0.173961
Evening              -0.058677
Morning              -0.081930
Night                 0.203917
Friday               -0.034605
Monday               -0.030319
Saturday              0.025240
Sunday                0.066819
Thursday             -0.001808
Tuesday              -0.017213
Wednesday            -0.017960
1                     0.042564
2                     0.056058
3                    -0.001950
4                    -0.091631
5                    -0.012298
6       

## END OF PREPROCESSING AND STUFF ##

In [19]:
target = data_post_standardize['Engagements']

In [20]:
features_from_old_tests = ['Video', 'top3Tagged', 'Len_Desc', 'teamTagged','Night' ,'otherTagged', 'Afternoon', 
            'bottom3Tagged','Tagged_Count^2','hasHashtag',7,4,'Morning',10,
            'postsInThatDay','Sunday',11,9,'Evening',2]

## Code for Determining Good Features ##

In [21]:
non_target = data_post_standardize.drop('Engagements',axis=1)
bestfeatures = SelectKBest(score_func=f_regression, k=7)
fit = bestfeatures.fit(non_target,target)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(non_target.columns)
#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(40,'Score'))  #print 10 best features

                 Specs         Score
14           Not_Video  22416.055961
13               Video  22416.055961
12     nonvidOffseason   1198.324382
8           top3Tagged    814.885384
1             Len_Desc    630.256329
6           teamTagged    407.609387
3       Tagged_Count^2    382.495354
18               Night    336.851600
7          otherTagged    305.245611
15           Afternoon    242.290632
9        bottom3Tagged    213.380925
2         Tagged_Count    185.473122
10          hasHashtag    174.628700
32                   7    146.294579
39  BeforeAllStarBreak    114.420645
40           Offseason     89.686689
29                   4     65.740342
17             Morning     52.468422
35                  10     45.877950
11          vidAllStar     41.654291
5       postsInThatDay     37.585108
22              Sunday     34.820148
41            Playoffs     30.962037
36                  11     29.887994
34                   9     29.138970
16             Evening     26.823376
2

In [22]:
allfeatures = non_target.columns.tolist()

In [23]:
allfeatures.remove('Not_Video')

In [24]:
'Not_Video' in allfeatures

False

In [25]:
print(featureScores.nlargest(21,'Score'))  #print 10 best features

                 Specs         Score
14           Not_Video  22416.055961
13               Video  22416.055961
12     nonvidOffseason   1198.324382
8           top3Tagged    814.885384
1             Len_Desc    630.256329
6           teamTagged    407.609387
3       Tagged_Count^2    382.495354
18               Night    336.851600
7          otherTagged    305.245611
15           Afternoon    242.290632
9        bottom3Tagged    213.380925
2         Tagged_Count    185.473122
10          hasHashtag    174.628700
32                   7    146.294579
39  BeforeAllStarBreak    114.420645
40           Offseason     89.686689
29                   4     65.740342
17             Morning     52.468422
35                  10     45.877950
11          vidAllStar     41.654291
5       postsInThatDay     37.585108


In [26]:
top20features = featureScores.nlargest(21,'Score')['Specs'].tolist()
top20features.remove('Not_Video')

In [27]:
top30features = featureScores.nlargest(31,'Score')['Specs'].tolist()
top30features.remove('Not_Video')

In [28]:
top20features

['Video',
 'nonvidOffseason',
 'top3Tagged',
 'Len_Desc',
 'teamTagged',
 'Tagged_Count^2',
 'Night',
 'otherTagged',
 'Afternoon',
 'bottom3Tagged',
 'Tagged_Count',
 'hasHashtag',
 7,
 'BeforeAllStarBreak',
 'Offseason',
 4,
 'Morning',
 10,
 'vidAllStar',
 'postsInThatDay']

In [29]:
top30features

['Video',
 'nonvidOffseason',
 'top3Tagged',
 'Len_Desc',
 'teamTagged',
 'Tagged_Count^2',
 'Night',
 'otherTagged',
 'Afternoon',
 'bottom3Tagged',
 'Tagged_Count',
 'hasHashtag',
 7,
 'BeforeAllStarBreak',
 'Offseason',
 4,
 'Morning',
 10,
 'vidAllStar',
 'postsInThatDay',
 'Sunday',
 'Playoffs',
 11,
 9,
 'Evening',
 2,
 'NBA_Follower_Count',
 'propOfSpecialTexts',
 6,
 1]

## Processing Function Def ##

In [49]:
# MAPE
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100


In [50]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import fbeta_score, make_scorer
def evaluate_model(estimator, data, train_features, target):
    return cross_validate(estimator,                     
                    X=data[train_features],
                    y=target,
                    scoring=make_scorer(mean_absolute_percentage_error, greater_is_better=False),
                    n_jobs=-1, cv=3,
                    return_train_score=True)

def display_results(results):
    results_df  = pd.DataFrame(results).T
    results_cols = results_df.columns
    for col in results_df:
        results_df[col] = results_df[col].apply(np.mean)
    return results_df

In [51]:
RESULTS = {}

## Modeling PRAY FOR MAPE UNDER 6 ## 
** ESKETIT **

In [52]:
RESULTS["forest_all"] = evaluate_model(RandomForestRegressor(), non_target, allfeatures,target)
RESULTS["tree_all"] = evaluate_model(DecisionTreeRegressor(), non_target, allfeatures,target)
RESULTS["xgb_all"] = evaluate_model(GradientBoostingRegressor(), non_target, allfeatures,target)

RESULTS["forest_top20"] = evaluate_model(RandomForestRegressor(), non_target, top20features,target)
RESULTS["tree_top20"] = evaluate_model(DecisionTreeRegressor(), non_target, top20features,target)
RESULTS["xgb_top20"] = evaluate_model(GradientBoostingRegressor(), non_target, top20features,target)

RESULTS["forest_top30"] = evaluate_model(RandomForestRegressor(), non_target, top30features,target)
RESULTS["tree_top30"] = evaluate_model(DecisionTreeRegressor(), non_target, top30features,target)
RESULTS["xgb_top30"] = evaluate_model(GradientBoostingRegressor(), non_target, top30features,target)

RESULTS["forest_old"] = evaluate_model(RandomForestRegressor(), non_target, features_from_old_tests,target)
RESULTS["tree_old"] = evaluate_model(DecisionTreeRegressor(), non_target, features_from_old_tests,target)
RESULTS["xgb_old"] = evaluate_model(GradientBoostingRegressor(), non_target, features_from_old_tests,target)


In [34]:
RESULTS["ada_all"] = evaluate_model(AdaBoostRegressor(), non_target, allfeatures,target)
RESULTS["ada_top20"] = evaluate_model(AdaBoostRegressor(), non_target, top20features,target)
RESULTS["ada_top30"] = evaluate_model(AdaBoostRegressor(), non_target, top30features,target)
RESULTS["ada_old"] = evaluate_model(AdaBoostRegressor(), non_target, features_from_old_tests,target)

In [35]:
display_results(RESULTS)

Unnamed: 0,fit_time,score_time,test_score,train_score
forest_all,0.295099,0.006078,-7.905183,-2.314421
tree_all,0.044175,0.001602,-10.022649,-0.014487
xgb_all,0.660398,0.004581,-7.138667,-5.736165
forest_top20,0.138373,0.005237,-8.65409,-2.866488
tree_top20,0.022457,0.001363,-10.735745,-0.207493
xgb_top20,0.32758,0.004322,-7.814649,-6.607887
forest_top30,0.240907,0.005161,-8.244314,-2.398844
tree_top30,0.036751,0.001689,-10.36923,-0.014487
xgb_top30,0.46088,0.004187,-7.354666,-5.997088
forest_old,0.146523,0.005167,-8.437196,-2.766773


## XGBOOST Hyperparameter Tuning

In [53]:
X_train, X_test, y_train, y_test = train_test_split(non_target[top30features], target, test_size=0.2)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

  if getattr(data, 'base', None) is not None and \


In [54]:
mean_train = np.mean(y_train)
# Get predictions on the test set
baseline_predictions = np.ones(y_test.shape) * mean_train
# Compute MAE
mae_baseline = mean_absolute_percentage_error(y_test, baseline_predictions)
print("Baseline MAPE is {:.2f}".format(mae_baseline))

Baseline MAPE is 31.77


In [55]:
params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective':'reg:squarederror',
}


In [56]:
params['eval_metric'] = 'mae'
num_boost_round = 999

In [57]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-mae:415876
Will train until Test-mae hasn't improved in 10 rounds.
[1]	Test-mae:291247
[2]	Test-mae:203899
[3]	Test-mae:142864
[4]	Test-mae:101267
[5]	Test-mae:73094.1
[6]	Test-mae:55055.6
[7]	Test-mae:44581.3
[8]	Test-mae:38383.5
[9]	Test-mae:35409
[10]	Test-mae:34037.1
[11]	Test-mae:33247.6
[12]	Test-mae:32809.8
[13]	Test-mae:32603.4
[14]	Test-mae:32551.4
[15]	Test-mae:32538.4
[16]	Test-mae:32455.2
[17]	Test-mae:32467
[18]	Test-mae:32481.5
[19]	Test-mae:32487.9
[20]	Test-mae:32438.9
[21]	Test-mae:32390.3
[22]	Test-mae:32407.2
[23]	Test-mae:32373.4
[24]	Test-mae:32437.4
[25]	Test-mae:32394.7
[26]	Test-mae:32264.3
[27]	Test-mae:32237.1
[28]	Test-mae:32210.4
[29]	Test-mae:32216.6
[30]	Test-mae:32258.6
[31]	Test-mae:32213.2
[32]	Test-mae:32182.8
[33]	Test-mae:32164.1
[34]	Test-mae:32272
[35]	Test-mae:32314.6
[36]	Test-mae:32330.2
[37]	Test-mae:32342.7
[38]	Test-mae:32362.3
[39]	Test-mae:32419.5
[40]	Test-mae:32467.8
[41]	Test-mae:32359.8
[42]	Test-mae:32301.5
[43]	Test-mae:32300

In [58]:
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'mae'},
    early_stopping_rounds=10
)
cv_results

Unnamed: 0,train-mae-mean,train-mae-std,test-mae-mean,test-mae-std
0,414796.19375,670.433402,414800.49375,3169.1311
1,290783.34375,458.101166,290857.525,2758.115606
2,204022.209375,325.585084,204148.94375,2439.188298
3,143338.3625,250.788287,143566.61875,2258.6151
4,101265.057812,229.587831,101837.871875,2039.809802
5,72949.178125,172.372767,74194.296875,1767.042756
6,54659.63125,253.594144,56527.824219,1437.674067
7,43630.610938,200.109638,46103.50625,1360.115767
8,37383.640625,133.614819,40392.869531,1225.078431
9,33778.596875,152.771074,37206.375,1084.482006


In [59]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(2,15)
    for min_child_weight in range(2,12)
]

In [60]:
min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with max_depth=2, min_child_weight=2


will be corrected to return the positional minimum in the future.
Use 'series.values.argmin' to get the position of the minimum now.


	MAE 34321.2304686 for 275 rounds
CV with max_depth=2, min_child_weight=3
	MAE 34546.5312502 for 202 rounds
CV with max_depth=2, min_child_weight=4
	MAE 34801.0648436 for 146 rounds
CV with max_depth=2, min_child_weight=5
	MAE 34662.3375 for 171 rounds
CV with max_depth=2, min_child_weight=6
	MAE 34663.8507812 for 180 rounds
CV with max_depth=2, min_child_weight=7
	MAE 34343.2195314 for 268 rounds
CV with max_depth=2, min_child_weight=8
	MAE 34546.948437600004 for 220 rounds
CV with max_depth=2, min_child_weight=9
	MAE 34505.860937800004 for 181 rounds
CV with max_depth=2, min_child_weight=10
	MAE 34460.690625 for 223 rounds
CV with max_depth=2, min_child_weight=11
	MAE 34380.6992188 for 255 rounds
CV with max_depth=3, min_child_weight=2
	MAE 33638.2671874 for 135 rounds
CV with max_depth=3, min_child_weight=3
	MAE 33542.8535154 for 145 rounds
CV with max_depth=3, min_child_weight=4
	MAE 33536.8941406 for 142 rounds
CV with max_depth=3, min_child_weight=5
	MAE 33673.110547000004 for 12

	MAE 34031.4691406 for 13 rounds
CV with max_depth=13, min_child_weight=4
	MAE 33601.1390624 for 13 rounds
CV with max_depth=13, min_child_weight=5
	MAE 33740.5691406 for 14 rounds
CV with max_depth=13, min_child_weight=6
	MAE 33707.5570312 for 13 rounds
CV with max_depth=13, min_child_weight=7
	MAE 33460.3472656 for 13 rounds
CV with max_depth=13, min_child_weight=8
	MAE 33244.1515624 for 14 rounds
CV with max_depth=13, min_child_weight=9
	MAE 33118.8867186 for 12 rounds
CV with max_depth=13, min_child_weight=10
	MAE 33128.121875 for 13 rounds
CV with max_depth=13, min_child_weight=11
	MAE 33050.5652346 for 13 rounds
CV with max_depth=14, min_child_weight=2
	MAE 33945.082812399996 for 13 rounds
CV with max_depth=14, min_child_weight=3
	MAE 34052.3492188 for 13 rounds
CV with max_depth=14, min_child_weight=4
	MAE 33865.326172 for 12 rounds
CV with max_depth=14, min_child_weight=5
	MAE 33758.623827999996 for 13 rounds
CV with max_depth=14, min_child_weight=6
	MAE 33783.2332032 for 13 ro

In [61]:
params['max_depth'] = 12
params['min_child_weight'] = 11

gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

In [62]:

min_mae = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with subsample=1.0, colsample=1.0


will be corrected to return the positional minimum in the future.
Use 'series.values.argmin' to get the position of the minimum now.


	MAE 32874.5023438 for 13 rounds
CV with subsample=1.0, colsample=0.9
	MAE 33528.2441406 for 15 rounds
CV with subsample=1.0, colsample=0.8
	MAE 33884.4695312 for 18 rounds
CV with subsample=1.0, colsample=0.7
	MAE 34992.7625 for 27 rounds
CV with subsample=0.9, colsample=1.0
	MAE 33142.6019532 for 13 rounds
CV with subsample=0.9, colsample=0.9
	MAE 33473.1929686 for 14 rounds
CV with subsample=0.9, colsample=0.8
	MAE 33880.746875 for 15 rounds
CV with subsample=0.9, colsample=0.7
	MAE 34632.1187502 for 22 rounds
CV with subsample=0.8, colsample=1.0
	MAE 33388.2339842 for 13 rounds
CV with subsample=0.8, colsample=0.9
	MAE 33726.5781248 for 13 rounds
CV with subsample=0.8, colsample=0.8
	MAE 34415.514844000005 for 18 rounds
CV with subsample=0.8, colsample=0.7
	MAE 34752.3140626 for 20 rounds
CV with subsample=0.7, colsample=1.0
	MAE 33569.7597654 for 13 rounds
CV with subsample=0.7, colsample=0.9
	MAE 33603.3667968 for 12 rounds
CV with subsample=0.7, colsample=0.8
	MAE 34619.9125 for

In [63]:
params['subsample'] = 1
params['colsample_bytree'] = 1

In [64]:

%time
# This can take some time…
min_mae = float("Inf")
best_params = None

for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta
    print("Best params: {}, MAE: {}".format(best_params, min_mae))

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 15 µs
CV with eta=0.3


will be corrected to return the positional minimum in the future.
Use 'series.values.argmin' to get the position of the minimum now.


	MAE 32874.5023438 for 13 rounds

Best params: 0.3, MAE: 32874.5023438
CV with eta=0.2
	MAE 32529.6746094 for 22 rounds

Best params: 0.2, MAE: 32529.6746094
CV with eta=0.1
	MAE 32216.3796874 for 47 rounds

Best params: 0.1, MAE: 32216.3796874
CV with eta=0.05
	MAE 32237.323046600002 for 99 rounds

Best params: 0.1, MAE: 32216.3796874
CV with eta=0.01
	MAE 32192.7539062 for 526 rounds

Best params: 0.01, MAE: 32192.7539062
CV with eta=0.005
	MAE 32236.0996094 for 998 rounds

Best params: 0.01, MAE: 32192.7539062


In [67]:
params['eta'] = .01
bestmodel = xgb.train(
    params,
    dtrain,
    num_boost_round=1200,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)
mean_absolute_percentage_error(bestmodel.predict(dtest), y_test)


[0]	Test-mae:588293
Will train until Test-mae hasn't improved in 10 rounds.
[1]	Test-mae:582407
[2]	Test-mae:576580
[3]	Test-mae:570812
[4]	Test-mae:565100
[5]	Test-mae:559448
[6]	Test-mae:553850
[7]	Test-mae:548306
[8]	Test-mae:542819
[9]	Test-mae:537386
[10]	Test-mae:532009
[11]	Test-mae:526688
[12]	Test-mae:521418
[13]	Test-mae:516204
[14]	Test-mae:511043
[15]	Test-mae:505926
[16]	Test-mae:500867
[17]	Test-mae:495858
[18]	Test-mae:490895
[19]	Test-mae:485985
[20]	Test-mae:481124
[21]	Test-mae:476307
[22]	Test-mae:471544
[23]	Test-mae:466827
[24]	Test-mae:462152
[25]	Test-mae:457530
[26]	Test-mae:452952
[27]	Test-mae:448419
[28]	Test-mae:443933
[29]	Test-mae:439495
[30]	Test-mae:435095
[31]	Test-mae:430746
[32]	Test-mae:426441
[33]	Test-mae:422175
[34]	Test-mae:417957
[35]	Test-mae:413777
[36]	Test-mae:409642
[37]	Test-mae:405544
[38]	Test-mae:401491
[39]	Test-mae:397481
[40]	Test-mae:393505
[41]	Test-mae:389569
[42]	Test-mae:385677
[43]	Test-mae:381819
[44]	Test-mae:378000
[45]	Test

[368]	Test-mae:32442.2
[369]	Test-mae:32391.5
[370]	Test-mae:32345
[371]	Test-mae:32294
[372]	Test-mae:32257.5
[373]	Test-mae:32215.1
[374]	Test-mae:32179.9
[375]	Test-mae:32137.4
[376]	Test-mae:32099
[377]	Test-mae:32063.4
[378]	Test-mae:32021.3
[379]	Test-mae:31989.4
[380]	Test-mae:31952.7
[381]	Test-mae:31920.1
[382]	Test-mae:31894
[383]	Test-mae:31863.3
[384]	Test-mae:31832.4
[385]	Test-mae:31801.1
[386]	Test-mae:31763.8
[387]	Test-mae:31736.8
[388]	Test-mae:31711.7
[389]	Test-mae:31686.2
[390]	Test-mae:31657.7
[391]	Test-mae:31631.8
[392]	Test-mae:31609.5
[393]	Test-mae:31587.6
[394]	Test-mae:31562.1
[395]	Test-mae:31543.8
[396]	Test-mae:31526.4
[397]	Test-mae:31507.8
[398]	Test-mae:31489.1
[399]	Test-mae:31472.2
[400]	Test-mae:31458.2
[401]	Test-mae:31440
[402]	Test-mae:31426.3
[403]	Test-mae:31417.2
[404]	Test-mae:31402.4
[405]	Test-mae:31386.6
[406]	Test-mae:31375.3
[407]	Test-mae:31362.7
[408]	Test-mae:31350.8
[409]	Test-mae:31332.9
[410]	Test-mae:31323.2
[411]	Test-mae:31311.

5.575618705370321

In [97]:
pd.DataFrame(bestmodel.predict(dtest))

Unnamed: 0,0
0,713343.812500
1,829445.312500
2,283032.531250
3,337540.312500
4,302593.562500
5,712888.375000
6,802041.875000
7,358248.000000
8,463686.906250
9,776745.000000


## Code for Actual Test Prodcution ##

In [66]:
test = pd.read_csv('testing_data.csv')

In [72]:
top30features = ['Video',
 'nonvidOffseason',
 'top3Tagged',
 'Len_Desc',
 'teamTagged',
 'Tagged_Count^2',
 'Night',
 'otherTagged',
 'Afternoon',
 'bottom3Tagged',
 'Tagged_Count',
 'hasHashtag',
 '7',
 'BeforeAllStarBreak',
 'Offseason',
 '4',
 'Morning',
 '10',
 'vidAllStar',
 'postsInThatDay',
 'Sunday',
 'Playoffs',
 '11',
 '9',
 'Evening',
 '2',
 'NBA_Follower_Count',
 'propOfSpecialTexts',
 '6',
 '1']

In [73]:
test[top30features]

Unnamed: 0,Video,nonvidOffseason,top3Tagged,Len_Desc,teamTagged,Tagged_Count^2,Night,otherTagged,Afternoon,bottom3Tagged,...,Sunday,Playoffs,11,9,Evening,2,NBA_Follower_Count,propOfSpecialTexts,6,1
0,0,0,0,-0.369289,0,-0.552255,1,1,0,0,...,0,1,0,0,0,0,1.734250,-0.837799,0,0
1,0,0,0,0.654361,0,0.142405,1,1,0,0,...,0,1,0,0,0,0,1.734250,-0.721902,0,0
2,1,0,0,-0.230958,0,1.300174,1,1,0,0,...,0,1,0,0,0,0,1.734250,0.900656,0,0
3,1,0,0,1.014022,0,1.300174,0,1,0,0,...,0,1,0,0,1,0,1.726429,0.538478,0,0
4,1,0,0,1.456682,0,1.300174,0,1,0,0,...,0,1,0,0,1,0,1.726429,0.538478,0,0
5,0,0,0,0.045705,0,0.142405,0,1,0,0,...,0,1,0,0,0,0,1.726429,0.005089,0,0
6,1,0,0,-0.950279,0,-0.783809,1,0,0,0,...,0,1,0,0,0,0,1.726429,-0.548056,0,0
7,1,0,0,-0.258624,0,-0.552255,1,1,0,0,...,0,1,0,0,0,0,1.726429,-0.943159,0,0
8,1,0,0,0.322367,0,0.142405,1,1,0,0,...,0,1,0,0,0,0,1.726429,-0.721902,0,0
9,1,0,0,1.401349,0,2.921049,0,1,0,0,...,0,1,0,0,1,0,1.719446,0.504167,0,0


In [81]:
actual = xgb.DMatrix(test[top30features])

In [84]:
predicteddata = test.join(pd.DataFrame(bestmodel.predict(actual)))

In [85]:
predicteddata.head()

Unnamed: 0,NBA_Follower_Count,Len_Desc,Tagged_Count,Tagged_Count^2,propOfSpecialTexts,postsInThatDay,Engagements,teamTagged,otherTagged,top3Tagged,...,9,10,11,12,AllStarBreak,BeforeAllStarBreak,Offseason,Playoffs,PostAllStarBreak,0
0,1.73425,-0.369289,-0.455505,-0.552255,-0.837799,0.007664,,0,1,0,...,0,0,0,0,0,0,0,1,0,415565.65625
1,1.73425,0.654361,0.472204,0.142405,-0.721902,0.007664,,0,1,0,...,0,0,0,0,0,0,0,1,0,392246.3125
2,1.73425,-0.230958,1.399914,1.300174,0.900656,0.007664,,0,1,0,...,0,0,0,0,0,0,0,1,0,598190.375
3,1.726429,1.014022,1.399914,1.300174,0.538478,1.923568,,0,1,0,...,0,0,0,0,0,0,0,1,0,576586.3125
4,1.726429,1.456682,1.399914,1.300174,0.538478,1.923568,,0,1,0,...,0,0,0,0,0,0,0,1,0,580887.25


In [87]:
predicteddata.groupby('Video')[0].mean()

Video
0    342634.1250
1    667290.0625
Name: 0, dtype: float32

In [89]:
hold_out_set = pd.read_csv('holdout_set.csv', encoding = "ISO-8859-1")

In [92]:
rjwoj = hold_out_set.join(pd.DataFrame(bestmodel.predict(actual))).drop('Engagements', axis= 1)

In [93]:
rjwoj= rjwoj.rename(columns= {0: 'Engagements'})

In [98]:
rjwoj.to_csv('holdout_set_RJwithoutJ.csv')

871897.8