In [36]:
import pandas as pd
import datetime
import numpy as np
import sklearn as sk
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
import xgboost as xgb

In [2]:
data = pd.read_csv('Data_For_Modeling/esketit2.csv', encoding = "ISO-8859-1")

## Feature Function Definitions ##

In [3]:
def seasonTime(date): 
    if pd.Timestamp('2019-6-14') > date >= pd.Timestamp('2019-4-13'): 
        return 'Playoffs' 
    elif pd.Timestamp('2019-4-13')> date > pd.Timestamp('2019-2-17'):
        return 'PostAllStarBreak'
    elif pd.Timestamp('2019-2-17')>= date >= pd.Timestamp('2019-2-15'):
        return 'AllStarBreak'
    elif pd.Timestamp('2019-2-15')> date >= pd.Timestamp('2018-10-16'):
        return 'BeforeAllStarBreak'
    elif pd.Timestamp('2018-6-8') > date >= pd.Timestamp('2018-4-14'): 
        return 'Playoffs' 
    elif pd.Timestamp('2018-4-14')> date > pd.Timestamp('2018-2-18'):
        return 'PostAllStarBreak'
    elif pd.Timestamp('2018-2-18')>= date >= pd.Timestamp('2018-2-16'):
        return 'AllStarBreak'
    elif pd.Timestamp('2018-2-16')> date >= pd.Timestamp('2017-10-17'):
        return 'BeforeAllStarBreak'
    elif pd.Timestamp('2017-6-12') > date >= pd.Timestamp('2017-4-15'): 
        return 'Playoffs' 
    elif pd.Timestamp('2017-4-15')> date > pd.Timestamp('2017-2-19'):
        return 'PostAllStarBreak'
    elif pd.Timestamp('2017-2-19')>= date >= pd.Timestamp('2017-2-17'):
        return 'AllStarBreak'
    elif pd.Timestamp('2017-2-17')> date >= pd.Timestamp('2016-10-25'):
        return 'BeforeAllStarBreak'
    else: return 'Offseason'

In [4]:
def vidAllStar(row): 
    if row['SeasonTime'] == 'AllStarBreak' and row['Type'] == 'Video': 
        return 1
    else:
        return 0 

In [5]:
def nonvidOffseason(row): 
    if row['SeasonTime'] == 'Offseason' and row['Type'] != 'Video': 
        return 1
    else:
        return 0 

In [6]:
data['Created']= pd.to_datetime(data['Created'])
data['SeasonTime'] = data.Created.map(lambda a: seasonTime(a))

In [7]:
data['vidAllStar'] = data.apply(lambda row: vidAllStar(row), axis =1 )

In [8]:
data['nonvidOffseason'] = data.apply(lambda row: nonvidOffseason(row), axis =1 )

In [9]:
x= pd.get_dummies(data['Type'])
x['Not_Video'] = x['Album']+x['Photo']
data = data.join(x.drop(['Album','Photo'],axis = 1))
data = data.drop(['shoutouts','hashtags','monthDateYear','Created','Type','Description'],axis = 1)
data = data.join(pd.get_dummies(data['timeOfDay']))
data = data.join(pd.get_dummies(data['DayOfWeek']))
data = data.join(pd.get_dummies(data['month']))
data = data.join(pd.get_dummies(data['SeasonTime']))
data = data.drop(['DayOfWeek','timeOfDay','hour','month','SeasonTime'],axis = 1)

In [10]:
data.columns.tolist()

['Engagements',
 'NBA_Follower_Count',
 'Len_Desc',
 'Tagged_Count',
 'playOffDate',
 'propOfSpecialTexts',
 'teamTagged',
 'otherTagged',
 'top3Tagged',
 'bottom3Tagged',
 'hasHashtag',
 'postsInThatDay',
 'vidAllStar',
 'nonvidOffseason',
 'Video',
 'Not_Video',
 'Afternoon',
 'Evening',
 'Morning',
 'Night',
 'Friday',
 'Monday',
 'Saturday',
 'Sunday',
 'Thursday',
 'Tuesday',
 'Wednesday',
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 'AllStarBreak',
 'BeforeAllStarBreak',
 'Offseason',
 'Playoffs',
 'PostAllStarBreak']

In [11]:
data['Tagged_Count^2'] = data['Tagged_Count']*data['Tagged_Count']

In [12]:
standardize = data[['NBA_Follower_Count', 'Len_Desc', 'Tagged_Count','Tagged_Count^2',
         'propOfSpecialTexts','postsInThatDay']]
standardize

Unnamed: 0,NBA_Follower_Count,Len_Desc,Tagged_Count,Tagged_Count^2,propOfSpecialTexts,postsInThatDay
0,36984682,95,4,16,0.266667,5
1,36984682,64,2,4,0.166667,5
2,36984682,46,2,4,0.250000,5
3,36984682,43,0,0,0.000000,5
4,36984682,57,1,1,0.100000,5
5,36955156,104,2,4,0.133333,19
6,36955156,106,1,1,0.176471,19
7,36955156,88,2,4,0.176471,19
8,36955156,75,2,4,0.181818,19
9,36955156,121,2,4,0.176471,19


In [13]:
dummy = data[['Engagements','teamTagged',
 'otherTagged',
 'top3Tagged',
 'bottom3Tagged',
 'hasHashtag',
 'vidAllStar',
 'nonvidOffseason',
 'Video',
 'Not_Video',
 'Afternoon',
 'Evening',
 'Morning',
 'Night',
 'Friday',
 'Monday',
 'Saturday',
 'Sunday',
 'Thursday',
 'Tuesday',
 'Wednesday',
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 'AllStarBreak',
 'BeforeAllStarBreak',
 'Offseason',
 'Playoffs',
 'PostAllStarBreak']]

In [14]:
len(dummy.columns.tolist())

38

In [15]:
X = preprocessing.scale(standardize)
Y = pd.DataFrame(X).rename(columns={0:'NBA_Follower_Count', 1:'Len_Desc', 2:'Tagged_Count',3:'Tagged_Count^2',
       4:'propOfSpecialTexts',5:'postsInThatDay'})
data_post_standardize= Y.join(dummy)

In [16]:
data_post_standardize['hasHashtag'] = data_post_standardize['hasHashtag'].astype(int)
data_post_standardize

Unnamed: 0,NBA_Follower_Count,Len_Desc,Tagged_Count,Tagged_Count^2,propOfSpecialTexts,postsInThatDay,Engagements,teamTagged,otherTagged,top3Tagged,...,8,9,10,11,12,AllStarBreak,BeforeAllStarBreak,Offseason,Playoffs,PostAllStarBreak
0,1.749128,0.698583,2.216012,2.522685,0.003876,-1.651685,502093,1,1,0,...,0,0,0,0,0,0,0,0,1,0
1,1.749128,-0.136075,0.436787,0.092998,-0.543573,-1.651685,603380,1,1,0,...,0,0,0,0,0,0,0,0,1,0
2,1.749128,-0.620715,0.436787,0.092998,-0.087365,-1.651685,603380,1,1,0,...,0,0,0,0,0,0,0,0,1,0
3,1.749128,-0.701488,-1.342437,-0.716897,-1.455990,-1.651685,725100,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1.749128,-0.324546,-0.452825,-0.514423,-0.908540,-1.651685,661446,0,1,0,...,0,0,0,0,0,0,0,0,1,0
5,1.741252,0.940903,0.436787,0.092998,-0.726057,0.348201,322444,0,1,0,...,0,0,0,0,0,0,0,0,1,0
6,1.741252,0.994752,-0.452825,-0.514423,-0.489902,0.348201,722540,1,0,0,...,0,0,0,0,0,0,0,0,1,0
7,1.741252,0.510112,0.436787,0.092998,-0.489902,0.348201,339265,1,0,0,...,0,0,0,0,0,0,0,0,1,0
8,1.741252,0.160094,0.436787,0.092998,-0.460626,0.348201,443330,1,0,0,...,0,0,0,0,0,0,0,0,1,0
9,1.741252,1.398619,0.436787,0.092998,-0.489902,0.348201,652193,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [17]:
data_post_standardize.corr()['Engagements']

NBA_Follower_Count   -0.053031
Len_Desc             -0.274011
Tagged_Count         -0.152747
Tagged_Count^2       -0.216684
propOfSpecialTexts    0.050351
postsInThatDay        0.069409
Engagements           1.000000
teamTagged           -0.223341
otherTagged           0.194495
top3Tagged            0.308200
bottom3Tagged        -0.163549
hasHashtag           -0.148315
vidAllStar            0.073051
nonvidOffseason      -0.365660
Video                 0.861826
Not_Video            -0.861826
Afternoon            -0.173961
Evening              -0.058677
Morning              -0.081930
Night                 0.203917
Friday               -0.034605
Monday               -0.030319
Saturday              0.025240
Sunday                0.066819
Thursday             -0.001808
Tuesday              -0.017213
Wednesday            -0.017960
1                     0.042564
2                     0.056058
3                    -0.001950
4                    -0.091631
5                    -0.012298
6       

## END OF PREPROCESSING AND STUFF ##

In [18]:
target = data_post_standardize['Engagements']

In [19]:
features_from_old_tests = ['Video', 'top3Tagged', 'Len_Desc', 'teamTagged','Night' ,'otherTagged', 'Afternoon', 
            'bottom3Tagged','Tagged_Count^2','hasHashtag',7,4,'Morning',10,
            'postsInThatDay','Sunday',11,9,'Evening',2]

## Code for Determining Good Features ##

In [20]:
non_target = data_post_standardize.drop('Engagements',axis=1)
bestfeatures = SelectKBest(score_func=f_regression, k=7)
fit = bestfeatures.fit(non_target,target)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(non_target.columns)
#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(40,'Score'))  #print 10 best features

                 Specs         Score
14           Not_Video  22416.055961
13               Video  22416.055961
12     nonvidOffseason   1198.324382
8           top3Tagged    814.885384
1             Len_Desc    630.256329
6           teamTagged    407.609387
3       Tagged_Count^2    382.495354
18               Night    336.851600
7          otherTagged    305.245611
15           Afternoon    242.290632
9        bottom3Tagged    213.380925
2         Tagged_Count    185.473122
10          hasHashtag    174.628700
32                   7    146.294579
39  BeforeAllStarBreak    114.420645
40           Offseason     89.686689
29                   4     65.740342
17             Morning     52.468422
35                  10     45.877950
11          vidAllStar     41.654291
5       postsInThatDay     37.585108
22              Sunday     34.820148
41            Playoffs     30.962037
36                  11     29.887994
34                   9     29.138970
16             Evening     26.823376
2

In [21]:
allfeatures = non_target.columns.tolist()

In [22]:
allfeatures.remove('Not_Video')

In [23]:
'Not_Video' in allfeatures

False

In [24]:
print(featureScores.nlargest(21,'Score'))  #print 10 best features

                 Specs         Score
14           Not_Video  22416.055961
13               Video  22416.055961
12     nonvidOffseason   1198.324382
8           top3Tagged    814.885384
1             Len_Desc    630.256329
6           teamTagged    407.609387
3       Tagged_Count^2    382.495354
18               Night    336.851600
7          otherTagged    305.245611
15           Afternoon    242.290632
9        bottom3Tagged    213.380925
2         Tagged_Count    185.473122
10          hasHashtag    174.628700
32                   7    146.294579
39  BeforeAllStarBreak    114.420645
40           Offseason     89.686689
29                   4     65.740342
17             Morning     52.468422
35                  10     45.877950
11          vidAllStar     41.654291
5       postsInThatDay     37.585108


In [25]:
top20features = featureScores.nlargest(21,'Score')['Specs'].tolist()
top20features.remove('Not_Video')

In [26]:
top30features = featureScores.nlargest(31,'Score')['Specs'].tolist()
top30features.remove('Not_Video')

In [27]:
top20features

['Video',
 'nonvidOffseason',
 'top3Tagged',
 'Len_Desc',
 'teamTagged',
 'Tagged_Count^2',
 'Night',
 'otherTagged',
 'Afternoon',
 'bottom3Tagged',
 'Tagged_Count',
 'hasHashtag',
 7,
 'BeforeAllStarBreak',
 'Offseason',
 4,
 'Morning',
 10,
 'vidAllStar',
 'postsInThatDay']

In [28]:
top30features

['Video',
 'nonvidOffseason',
 'top3Tagged',
 'Len_Desc',
 'teamTagged',
 'Tagged_Count^2',
 'Night',
 'otherTagged',
 'Afternoon',
 'bottom3Tagged',
 'Tagged_Count',
 'hasHashtag',
 7,
 'BeforeAllStarBreak',
 'Offseason',
 4,
 'Morning',
 10,
 'vidAllStar',
 'postsInThatDay',
 'Sunday',
 'Playoffs',
 11,
 9,
 'Evening',
 2,
 'NBA_Follower_Count',
 'propOfSpecialTexts',
 6,
 1]

## Processing Function Def ##

In [29]:
# MAPE
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100


In [30]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import fbeta_score, make_scorer
def evaluate_model(estimator, data, train_features, target):
    return cross_validate(estimator,                     
                    X=data[train_features],
                    y=target,
                    scoring=make_scorer(mean_absolute_percentage_error, greater_is_better=False),
                    n_jobs=-1, cv=3,
                    return_train_score=True)

def display_results(results):
    results_df  = pd.DataFrame(results).T
    results_cols = results_df.columns
    for col in results_df:
        results_df[col] = results_df[col].apply(np.mean)
    return results_df

In [31]:
RESULTS = {}

## Modeling PRAY FOR MAPE UNDER 6 ## 
** ESKETIT **

In [32]:
RESULTS["forest_all"] = evaluate_model(RandomForestRegressor(), non_target, allfeatures,target)
RESULTS["tree_all"] = evaluate_model(DecisionTreeRegressor(), non_target, allfeatures,target)
RESULTS["xgb_all"] = evaluate_model(GradientBoostingRegressor(), non_target, allfeatures,target)

RESULTS["forest_top20"] = evaluate_model(RandomForestRegressor(), non_target, top20features,target)
RESULTS["tree_top20"] = evaluate_model(DecisionTreeRegressor(), non_target, top20features,target)
RESULTS["xgb_top20"] = evaluate_model(GradientBoostingRegressor(), non_target, top20features,target)

RESULTS["forest_top30"] = evaluate_model(RandomForestRegressor(), non_target, top30features,target)
RESULTS["tree_top30"] = evaluate_model(DecisionTreeRegressor(), non_target, top30features,target)
RESULTS["xgb_top30"] = evaluate_model(GradientBoostingRegressor(), non_target, top30features,target)

RESULTS["forest_old"] = evaluate_model(RandomForestRegressor(), non_target, features_from_old_tests,target)
RESULTS["tree_old"] = evaluate_model(DecisionTreeRegressor(), non_target, features_from_old_tests,target)
RESULTS["xgb_old"] = evaluate_model(GradientBoostingRegressor(), non_target, features_from_old_tests,target)


In [34]:
RESULTS["ada_all"] = evaluate_model(AdaBoostRegressor(), non_target, allfeatures,target)
RESULTS["ada_top20"] = evaluate_model(AdaBoostRegressor(), non_target, top20features,target)
RESULTS["ada_top30"] = evaluate_model(AdaBoostRegressor(), non_target, top30features,target)
RESULTS["ada_old"] = evaluate_model(AdaBoostRegressor(), non_target, features_from_old_tests,target)

In [35]:
display_results(RESULTS)

Unnamed: 0,fit_time,score_time,test_score,train_score
forest_all,0.296686,0.006564,-7.719794,-2.320969
tree_all,0.043788,0.001708,-10.21744,-0.014487
xgb_all,0.644759,0.005243,-7.146698,-5.736165
forest_top20,0.180889,0.007011,-8.838642,-2.863164
tree_top20,0.023225,0.001636,-10.711174,-0.207493
xgb_top20,0.334331,0.004097,-7.818173,-6.607887
forest_top30,0.268378,0.004867,-8.272505,-2.369808
tree_top30,0.037921,0.001502,-10.549838,-0.014487
xgb_top30,0.479448,0.004704,-7.35167,-5.997088
forest_old,0.151454,0.00618,-8.446872,-2.769672


## XGBOOST Hyperparameter Tuning

In [37]:
X_train, X_test, y_train, y_test = train_test_split(non_target[top30features], target, test_size=0.2)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

  if getattr(data, 'base', None) is not None and \


In [38]:
mean_train = np.mean(y_train)
# Get predictions on the test set
baseline_predictions = np.ones(y_test.shape) * mean_train
# Compute MAE
mae_baseline = mean_absolute_percentage_error(y_test, baseline_predictions)
print("Baseline MAPE is {:.2f}".format(mae_baseline))

Baseline MAPE is 31.81


In [39]:
params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective':'reg:squarederror',
}


In [41]:
params['eval_metric'] = 'mae'
num_boost_round = 999

In [42]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-mae:417359
Will train until Test-mae hasn't improved in 10 rounds.
[1]	Test-mae:292477
[2]	Test-mae:205475
[3]	Test-mae:144636
[4]	Test-mae:102316
[5]	Test-mae:74135.4
[6]	Test-mae:56463.7
[7]	Test-mae:46036.1
[8]	Test-mae:40423.2
[9]	Test-mae:37597.8
[10]	Test-mae:35403.5
[11]	Test-mae:34546.1
[12]	Test-mae:34121.2
[13]	Test-mae:33855.8
[14]	Test-mae:33824.1
[15]	Test-mae:33814.3
[16]	Test-mae:33620.8
[17]	Test-mae:33589.9
[18]	Test-mae:33485.6
[19]	Test-mae:33354.8
[20]	Test-mae:33258.4
[21]	Test-mae:33149.2
[22]	Test-mae:33169.5
[23]	Test-mae:33134.9
[24]	Test-mae:33056
[25]	Test-mae:33043.5
[26]	Test-mae:33093.5
[27]	Test-mae:33137.4
[28]	Test-mae:33148.6
[29]	Test-mae:33110.7
[30]	Test-mae:33121.7
[31]	Test-mae:33075.6
[32]	Test-mae:33113.1
[33]	Test-mae:33011.1
[34]	Test-mae:32973.2
[35]	Test-mae:32929.9
[36]	Test-mae:32910.9
[37]	Test-mae:32937
[38]	Test-mae:32933.8
[39]	Test-mae:32875.3
[40]	Test-mae:32870.1
[41]	Test-mae:32881.5
[42]	Test-mae:32833.7
[43]	Test-mae:327

In [43]:
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'mae'},
    early_stopping_rounds=10
)
cv_results

Unnamed: 0,train-mae-mean,train-mae-std,test-mae-mean,test-mae-std
0,414595.525,947.591773,414542.45,3774.303121
1,290646.7375,683.84204,290625.1875,2551.540963
2,203892.671875,470.86746,203922.2875,1861.158019
3,143214.225,334.856471,143430.60625,1383.754786
4,101116.189063,243.086484,101816.226562,870.561476
5,72853.242187,225.417918,74089.029688,702.563131
6,54537.173438,108.973403,56410.263281,607.405557
7,43623.76875,124.203161,46089.398437,562.036715
8,37541.699219,253.215244,40481.142969,548.115059
9,33839.210937,264.799028,37242.267188,528.831921


In [44]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(2,15)
    for min_child_weight in range(2,12)
]

In [45]:
min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with max_depth=2, min_child_weight=2


will be corrected to return the positional minimum in the future.
Use 'series.values.argmin' to get the position of the minimum now.


	MAE 34500.5234374 for 228 rounds
CV with max_depth=2, min_child_weight=3
	MAE 34564.042969 for 190 rounds
CV with max_depth=2, min_child_weight=4
	MAE 34552.1921876 for 204 rounds
CV with max_depth=2, min_child_weight=5
	MAE 34637.2523438 for 186 rounds
CV with max_depth=2, min_child_weight=6
	MAE 34633.0859374 for 191 rounds
CV with max_depth=2, min_child_weight=7
	MAE 34630.836718599996 for 174 rounds
CV with max_depth=2, min_child_weight=8
	MAE 34553.701562199996 for 183 rounds
CV with max_depth=2, min_child_weight=9
	MAE 34581.315625 for 172 rounds
CV with max_depth=2, min_child_weight=10
	MAE 34539.3328126 for 195 rounds
CV with max_depth=2, min_child_weight=11
	MAE 34591.3625 for 195 rounds
CV with max_depth=3, min_child_weight=2
	MAE 33879.8203126 for 121 rounds
CV with max_depth=3, min_child_weight=3
	MAE 33751.819531 for 119 rounds
CV with max_depth=3, min_child_weight=4
	MAE 33816.8624998 for 137 rounds
CV with max_depth=3, min_child_weight=5
	MAE 33928.2179688 for 104 round

	MAE 33739.8820312 for 13 rounds
CV with max_depth=13, min_child_weight=4
	MAE 33440.3816406 for 14 rounds
CV with max_depth=13, min_child_weight=5
	MAE 33472.9648438 for 13 rounds
CV with max_depth=13, min_child_weight=6
	MAE 33250.903515800004 for 13 rounds
CV with max_depth=13, min_child_weight=7
	MAE 33086.1484374 for 14 rounds
CV with max_depth=13, min_child_weight=8
	MAE 33313.6121094 for 13 rounds
CV with max_depth=13, min_child_weight=9
	MAE 33393.6589844 for 13 rounds
CV with max_depth=13, min_child_weight=10
	MAE 32945.162890399995 for 13 rounds
CV with max_depth=13, min_child_weight=11
	MAE 33040.6371094 for 13 rounds
CV with max_depth=14, min_child_weight=2
	MAE 34051.603125 for 13 rounds
CV with max_depth=14, min_child_weight=3
	MAE 33770.165625 for 13 rounds
CV with max_depth=14, min_child_weight=4
	MAE 33700.173437800004 for 13 rounds
CV with max_depth=14, min_child_weight=5
	MAE 33311.5804686 for 13 rounds
CV with max_depth=14, min_child_weight=6
	MAE 33331.2390626 for 

In [46]:
params['max_depth'] = 13
params['min_child_weight'] = 10

gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

In [47]:

min_mae = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with subsample=1.0, colsample=1.0


will be corrected to return the positional minimum in the future.
Use 'series.values.argmin' to get the position of the minimum now.


	MAE 32945.162890399995 for 13 rounds
CV with subsample=1.0, colsample=0.9
	MAE 33407.029687199996 for 13 rounds
CV with subsample=1.0, colsample=0.8
	MAE 34386.1898438 for 16 rounds
CV with subsample=1.0, colsample=0.7
	MAE 35378.5804688 for 16 rounds
CV with subsample=0.9, colsample=1.0
	MAE 33471.8144534 for 13 rounds
CV with subsample=0.9, colsample=0.9
	MAE 33722.149218599996 for 15 rounds
CV with subsample=0.9, colsample=0.8
	MAE 34235.8890624 for 17 rounds
CV with subsample=0.9, colsample=0.7
	MAE 34979.2351564 for 17 rounds
CV with subsample=0.8, colsample=1.0
	MAE 33269.2847656 for 13 rounds
CV with subsample=0.8, colsample=0.9
	MAE 33854.3816408 for 12 rounds
CV with subsample=0.8, colsample=0.8
	MAE 34690.0320312 for 15 rounds
CV with subsample=0.8, colsample=0.7
	MAE 35141.896875 for 18 rounds
CV with subsample=0.7, colsample=1.0
	MAE 33420.4421878 for 12 rounds
CV with subsample=0.7, colsample=0.9
	MAE 33741.245312600004 for 15 rounds
CV with subsample=0.7, colsample=0.8
	

In [48]:
params['subsample'] = 1
params['colsample_bytree'] = 1

In [49]:

%time
# This can take some time…
min_mae = float("Inf")
best_params = None

for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta
    print("Best params: {}, MAE: {}".format(best_params, min_mae))

CPU times: user 7 µs, sys: 3 µs, total: 10 µs
Wall time: 1.01 ms
CV with eta=0.3


will be corrected to return the positional minimum in the future.
Use 'series.values.argmin' to get the position of the minimum now.


	MAE 32945.162890399995 for 13 rounds

Best params: 0.3, MAE: 32945.162890399995
CV with eta=0.2
	MAE 32822.3359376 for 23 rounds

Best params: 0.2, MAE: 32822.3359376
CV with eta=0.1
	MAE 32337.221875 for 47 rounds

Best params: 0.1, MAE: 32337.221875
CV with eta=0.05
	MAE 32365.771093800002 for 100 rounds

Best params: 0.1, MAE: 32337.221875
CV with eta=0.01
	MAE 32278.7589844 for 517 rounds

Best params: 0.01, MAE: 32278.7589844
CV with eta=0.005
	MAE 32198.9828126 for 998 rounds

Best params: 0.005, MAE: 32198.9828126


In [52]:
params['eta'] = .005
bestmodel = xgb.train(
    params,
    dtrain,
    num_boost_round=1200,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)
mean_absolute_percentage_error(bestmodel.predict(dtest), y_test)


[0]	Test-mae:592368
Will train until Test-mae hasn't improved in 10 rounds.
[1]	Test-mae:589413
[2]	Test-mae:586473
[3]	Test-mae:583548
[4]	Test-mae:580637
[5]	Test-mae:577741
[6]	Test-mae:574858
[7]	Test-mae:571989
[8]	Test-mae:569135
[9]	Test-mae:566296
[10]	Test-mae:563469
[11]	Test-mae:560660
[12]	Test-mae:557860
[13]	Test-mae:555080
[14]	Test-mae:552311
[15]	Test-mae:549555
[16]	Test-mae:546814
[17]	Test-mae:544086
[18]	Test-mae:541372
[19]	Test-mae:538669
[20]	Test-mae:535982
[21]	Test-mae:533306
[22]	Test-mae:530646
[23]	Test-mae:528000
[24]	Test-mae:525363
[25]	Test-mae:522742
[26]	Test-mae:520133
[27]	Test-mae:517538
[28]	Test-mae:514954
[29]	Test-mae:512385
[30]	Test-mae:509829
[31]	Test-mae:507286
[32]	Test-mae:504754
[33]	Test-mae:502235
[34]	Test-mae:499729
[35]	Test-mae:497235
[36]	Test-mae:494753
[37]	Test-mae:492285
[38]	Test-mae:489828
[39]	Test-mae:487384
[40]	Test-mae:484952
[41]	Test-mae:482532
[42]	Test-mae:480124
[43]	Test-mae:477730
[44]	Test-mae:475346
[45]	Test

[375]	Test-mae:92957.8
[376]	Test-mae:92524.8
[377]	Test-mae:92093.8
[378]	Test-mae:91665.4
[379]	Test-mae:91235.8
[380]	Test-mae:90812
[381]	Test-mae:90389.7
[382]	Test-mae:89967.2
[383]	Test-mae:89552
[384]	Test-mae:89138.1
[385]	Test-mae:88726.6
[386]	Test-mae:88316.5
[387]	Test-mae:87908
[388]	Test-mae:87503.1
[389]	Test-mae:87100.9
[390]	Test-mae:86700.7
[391]	Test-mae:86302.3
[392]	Test-mae:85907.2
[393]	Test-mae:85509.3
[394]	Test-mae:85114.8
[395]	Test-mae:84725.4
[396]	Test-mae:84336.6
[397]	Test-mae:83949.1
[398]	Test-mae:83563.4
[399]	Test-mae:83179.3
[400]	Test-mae:82800
[401]	Test-mae:82421.4
[402]	Test-mae:82043.4
[403]	Test-mae:81670.5
[404]	Test-mae:81299.8
[405]	Test-mae:80930.6
[406]	Test-mae:80563.6
[407]	Test-mae:80196.6
[408]	Test-mae:79833.8
[409]	Test-mae:79473.1
[410]	Test-mae:79114.6
[411]	Test-mae:78759.2
[412]	Test-mae:78403.8
[413]	Test-mae:78050.7
[414]	Test-mae:77699.9
[415]	Test-mae:77347.6
[416]	Test-mae:77000
[417]	Test-mae:76656.2
[418]	Test-mae:76310.

[734]	Test-mae:33176.8
[735]	Test-mae:33155.6
[736]	Test-mae:33133.5
[737]	Test-mae:33109.2
[738]	Test-mae:33086.1
[739]	Test-mae:33064.9
[740]	Test-mae:33039.8
[741]	Test-mae:33018.6
[742]	Test-mae:32997.9
[743]	Test-mae:32976.9
[744]	Test-mae:32955.6
[745]	Test-mae:32935.2
[746]	Test-mae:32916.7
[747]	Test-mae:32898.2
[748]	Test-mae:32879.3
[749]	Test-mae:32859.9
[750]	Test-mae:32840.9
[751]	Test-mae:32824
[752]	Test-mae:32805.1
[753]	Test-mae:32786.5
[754]	Test-mae:32772.9
[755]	Test-mae:32754.2
[756]	Test-mae:32739.6
[757]	Test-mae:32721.4
[758]	Test-mae:32704.4
[759]	Test-mae:32688.8
[760]	Test-mae:32672.3
[761]	Test-mae:32659.5
[762]	Test-mae:32643.1
[763]	Test-mae:32628.4
[764]	Test-mae:32613.9
[765]	Test-mae:32600.7
[766]	Test-mae:32585.2
[767]	Test-mae:32571.4
[768]	Test-mae:32558
[769]	Test-mae:32544.3
[770]	Test-mae:32530.8
[771]	Test-mae:32517.1
[772]	Test-mae:32506
[773]	Test-mae:32493
[774]	Test-mae:32482.9
[775]	Test-mae:32473.1
[776]	Test-mae:32461.1
[777]	Test-mae:3244

5.7932715704097255