# Cleaned Notebook with XGBoost
- Hopefully this will have everything pipelined but not positive :)

In [1]:
import pandas as pd
import numpy as np
import os

os.chdir('/data/p_dsi/teams2023/team9/')

df = pd.read_csv('Asurion_clean_data_feature_set.csv')

df['weeks_monday'] = pd.to_datetime(df['weeks_monday'].str[:10])

### Trim Final Row

In [2]:
cut_date = "13-Feb-2023"

all_iphone = df.loc[df.weeks_monday < cut_date].copy()

### Set and Order Index for new DF

In [3]:
all_iphone.set_index('weeks_monday')
all_iphone['weeks_monday'] = pd.to_datetime(all_iphone['weeks_monday'], format='%Y-%m-%d')
all_iphone.sort_index()

Unnamed: 0,phone model,phone size,phone color,claim,weeks_monday,month,year,season,quarter,Holidays_Boolean,company,model_group
0,apple iphone 11,128gb,black,252,2022-01-24,January,2022,winter,Q1,False,apple,iphone 11
1,apple iphone 11,128gb,green,45,2022-01-24,January,2022,winter,Q1,False,apple,iphone 11
2,apple iphone 11,128gb,purple,92,2022-01-24,January,2022,winter,Q1,False,apple,iphone 11
3,apple iphone 11,128gb,red,69,2022-01-24,January,2022,winter,Q1,False,apple,iphone 11
4,apple iphone 11,128gb,white,91,2022-01-24,January,2022,winter,Q1,False,apple,iphone 11
...,...,...,...,...,...,...,...,...,...,...,...,...
11538,apple iphone xs max,512gb,gray,13,2023-02-06,February,2023,winter,Q1,False,apple,iphone x
11539,apple iphone xs max,512gb,silver,4,2023-02-06,February,2023,winter,Q1,False,apple,iphone x
11540,apple iphone xs max,64gb,gold,47,2023-02-06,February,2023,winter,Q1,False,apple,iphone x
11541,apple iphone xs max,64gb,gray,49,2023-02-06,February,2023,winter,Q1,False,apple,iphone x


In [4]:
# Transform columns into proper format
def col_transform(temp):
    
    temp["color"] = temp["phone color"]
    temp["model"] = temp["model_group"].str.split().str[-1]
    temp['size'] = temp['phone size'].str.replace('\D', '', regex=True)
    temp = temp[['color', 'claim', 'weeks_monday', 'model', 'size']]
    
    # Changing colors to only show gray, black, silver, or other
    temp["color"] = np.where(temp["color"] == "black", "black", 
                    np.where(temp["color"] == "gray", "gray",
                    np.where(temp["color"] == "silver", "silver", "other")))

    
    temp = pd.get_dummies(temp, columns=['color', 'model', 'size'])
    
    return temp

all_iphone = col_transform(all_iphone)
all_iphone.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp["color"] = np.where(temp["color"] == "black", "black",


Unnamed: 0,claim,weeks_monday,color_black,color_gray,color_other,color_silver,model_11,model_12,model_13,model_14,model_se,model_x,size_128,size_16,size_256,size_32,size_512,size_64
0,252,2022-01-24,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
1,45,2022-01-24,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0
2,92,2022-01-24,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0
3,69,2022-01-24,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0
4,91,2022-01-24,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0


### Function to create date related features

In [5]:
def create_features(temp):
    
    temp['quarter'] = temp['weeks_monday'].dt.quarter
    temp['month'] = temp['weeks_monday'].dt.month
    temp['year'] = temp['weeks_monday'].dt.year
    temp['dayofyear'] = temp['weeks_monday'].dt.dayofyear
    temp['dayofmonth'] = temp['weeks_monday'].dt.day
    temp['weekofyear'] = temp['weeks_monday'].dt.isocalendar().week.astype(int)
    temp['lag_1'] = temp['claim'].shift(1)
    
    return temp

all_iphone = create_features(all_iphone)

In [6]:
iphone13 = all_iphone[all_iphone["model_13"] == 1]
iphone14 = all_iphone[all_iphone["model_14"] == 1]

### Create train / test function

In [7]:
def create_split(split_ratio, temp):

    train = temp.iloc[:int(len(temp)*split_ratio)]
    test = temp.iloc[int(len(temp)*split_ratio):]

    features = temp.columns.to_list()
    features.remove('claim')
    features.remove('weeks_monday')
    target = 'claim'

    xtrain = train[features]
    ytrain = train[target]

    xtest = test[features]
    ytest = test[target]
    
    return train, test, xtrain, ytrain, xtest, ytest

## XGBoost with all iphones
- Note: Not hyper parameter tuned, yet

In [8]:
import xgboost as xgb
from xgboost import plot_importance, plot_tree
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [9]:
train, test, x_train, y_train, x_test, y_test = create_split(0.75, all_iphone)

In [10]:
# Build model
mod = xgb.XGBRegressor(base_score = 0.2, booster = 'gbtree', n_estimators = 1000,
                       objective = 'reg:linear', max_depth = 5, learning_rate = 0.01)

mod.fit(x_train, y_train, eval_set = [(x_train, y_train), (x_test, y_test)])

[0]	validation_0-rmse:100.22827	validation_1-rmse:91.59444
[1]	validation_0-rmse:99.55953	validation_1-rmse:91.00688
[2]	validation_0-rmse:98.89954	validation_1-rmse:90.42750
[3]	validation_0-rmse:98.24819	validation_1-rmse:89.85622
[4]	validation_0-rmse:97.60256	validation_1-rmse:89.27839
[5]	validation_0-rmse:96.96505	validation_1-rmse:88.71772
[6]	validation_0-rmse:96.33871	validation_1-rmse:88.17117
[7]	validation_0-rmse:95.71432	validation_1-rmse:87.61006
[8]	validation_0-rmse:95.10128	validation_1-rmse:87.07315
[9]	validation_0-rmse:94.48800	validation_1-rmse:86.53348
[10]	validation_0-rmse:93.89066	validation_1-rmse:86.01127
[11]	validation_0-rmse:93.29747	validation_1-rmse:85.48043
[12]	validation_0-rmse:92.71569	validation_1-rmse:84.97290
[13]	validation_0-rmse:92.13345	validation_1-rmse:84.46926
[14]	validation_0-rmse:91.56312	validation_1-rmse:83.96094
[15]	validation_0-rmse:91.00373	validation_1-rmse:83.47430
[16]	validation_0-rmse:90.44396	validation_1-rmse:82.99128
[17]	v

[137]	validation_0-rmse:58.41059	validation_1-rmse:56.59085
[138]	validation_0-rmse:58.32044	validation_1-rmse:56.52458
[139]	validation_0-rmse:58.22181	validation_1-rmse:56.44743
[140]	validation_0-rmse:58.13033	validation_1-rmse:56.38432
[141]	validation_0-rmse:58.04189	validation_1-rmse:56.32263
[142]	validation_0-rmse:57.94663	validation_1-rmse:56.24864
[143]	validation_0-rmse:57.85926	validation_1-rmse:56.19217
[144]	validation_0-rmse:57.77618	validation_1-rmse:56.13277
[145]	validation_0-rmse:57.69098	validation_1-rmse:56.07120
[146]	validation_0-rmse:57.60093	validation_1-rmse:55.99788
[147]	validation_0-rmse:57.51879	validation_1-rmse:55.94577
[148]	validation_0-rmse:57.43147	validation_1-rmse:55.87923
[149]	validation_0-rmse:57.35188	validation_1-rmse:55.82910
[150]	validation_0-rmse:57.26694	validation_1-rmse:55.76398
[151]	validation_0-rmse:57.18971	validation_1-rmse:55.71544
[152]	validation_0-rmse:57.09984	validation_1-rmse:55.64541
[153]	validation_0-rmse:57.02269	validat

[274]	validation_0-rmse:52.22459	validation_1-rmse:52.55589
[275]	validation_0-rmse:52.21460	validation_1-rmse:52.54981
[276]	validation_0-rmse:52.19885	validation_1-rmse:52.53802
[277]	validation_0-rmse:52.18418	validation_1-rmse:52.52981
[278]	validation_0-rmse:52.16828	validation_1-rmse:52.52212
[279]	validation_0-rmse:52.15886	validation_1-rmse:52.51666
[280]	validation_0-rmse:52.14956	validation_1-rmse:52.51102
[281]	validation_0-rmse:52.13319	validation_1-rmse:52.50388
[282]	validation_0-rmse:52.11836	validation_1-rmse:52.49317
[283]	validation_0-rmse:52.10947	validation_1-rmse:52.48797
[284]	validation_0-rmse:52.09305	validation_1-rmse:52.48014
[285]	validation_0-rmse:52.08439	validation_1-rmse:52.47512
[286]	validation_0-rmse:52.06144	validation_1-rmse:52.46158
[287]	validation_0-rmse:52.05280	validation_1-rmse:52.45637
[288]	validation_0-rmse:52.03025	validation_1-rmse:52.43896
[289]	validation_0-rmse:52.02185	validation_1-rmse:52.43401
[290]	validation_0-rmse:52.00370	validat

[411]	validation_0-rmse:50.82950	validation_1-rmse:51.81956
[412]	validation_0-rmse:50.82529	validation_1-rmse:51.81731
[413]	validation_0-rmse:50.81923	validation_1-rmse:51.81932
[414]	validation_0-rmse:50.81375	validation_1-rmse:51.81659
[415]	validation_0-rmse:50.80773	validation_1-rmse:51.81275
[416]	validation_0-rmse:50.80353	validation_1-rmse:51.81116
[417]	validation_0-rmse:50.80145	validation_1-rmse:51.81062
[418]	validation_0-rmse:50.79616	validation_1-rmse:51.80786
[419]	validation_0-rmse:50.78770	validation_1-rmse:51.80769
[420]	validation_0-rmse:50.78277	validation_1-rmse:51.80474
[421]	validation_0-rmse:50.77934	validation_1-rmse:51.80305
[422]	validation_0-rmse:50.77725	validation_1-rmse:51.80244
[423]	validation_0-rmse:50.77513	validation_1-rmse:51.80140
[424]	validation_0-rmse:50.76684	validation_1-rmse:51.80133
[425]	validation_0-rmse:50.76110	validation_1-rmse:51.79767
[426]	validation_0-rmse:50.75918	validation_1-rmse:51.79720
[427]	validation_0-rmse:50.75107	validat

[548]	validation_0-rmse:50.10800	validation_1-rmse:51.59695
[549]	validation_0-rmse:50.09936	validation_1-rmse:51.59513
[550]	validation_0-rmse:50.09056	validation_1-rmse:51.59381
[551]	validation_0-rmse:50.08828	validation_1-rmse:51.59294
[552]	validation_0-rmse:50.08287	validation_1-rmse:51.59073
[553]	validation_0-rmse:50.07900	validation_1-rmse:51.58923
[554]	validation_0-rmse:50.07516	validation_1-rmse:51.58749
[555]	validation_0-rmse:50.06879	validation_1-rmse:51.58471
[556]	validation_0-rmse:50.06656	validation_1-rmse:51.58387
[557]	validation_0-rmse:50.06135	validation_1-rmse:51.58115
[558]	validation_0-rmse:50.05758	validation_1-rmse:51.57951
[559]	validation_0-rmse:50.05513	validation_1-rmse:51.57814
[560]	validation_0-rmse:50.04692	validation_1-rmse:51.57449
[561]	validation_0-rmse:50.03998	validation_1-rmse:51.57302
[562]	validation_0-rmse:50.03382	validation_1-rmse:51.57030
[563]	validation_0-rmse:50.03013	validation_1-rmse:51.56866
[564]	validation_0-rmse:50.02398	validat

[685]	validation_0-rmse:49.41998	validation_1-rmse:51.34508
[686]	validation_0-rmse:49.41314	validation_1-rmse:51.33789
[687]	validation_0-rmse:49.40654	validation_1-rmse:51.33939
[688]	validation_0-rmse:49.40450	validation_1-rmse:51.33983
[689]	validation_0-rmse:49.40076	validation_1-rmse:51.33792
[690]	validation_0-rmse:49.39948	validation_1-rmse:51.33816
[691]	validation_0-rmse:49.39125	validation_1-rmse:51.33667
[692]	validation_0-rmse:49.38455	validation_1-rmse:51.32974
[693]	validation_0-rmse:49.38120	validation_1-rmse:51.33425
[694]	validation_0-rmse:49.37281	validation_1-rmse:51.33338
[695]	validation_0-rmse:49.36620	validation_1-rmse:51.32798
[696]	validation_0-rmse:49.36291	validation_1-rmse:51.33267
[697]	validation_0-rmse:49.36164	validation_1-rmse:51.33292
[698]	validation_0-rmse:49.35955	validation_1-rmse:51.33163
[699]	validation_0-rmse:49.35643	validation_1-rmse:51.33433
[700]	validation_0-rmse:49.35087	validation_1-rmse:51.33217
[701]	validation_0-rmse:49.34074	validat

[822]	validation_0-rmse:48.37033	validation_1-rmse:51.01048
[823]	validation_0-rmse:48.36489	validation_1-rmse:51.01046
[824]	validation_0-rmse:48.34874	validation_1-rmse:51.00107
[825]	validation_0-rmse:48.34523	validation_1-rmse:51.00555
[826]	validation_0-rmse:48.34370	validation_1-rmse:51.00565
[827]	validation_0-rmse:48.33897	validation_1-rmse:51.00495
[828]	validation_0-rmse:48.32555	validation_1-rmse:51.00024
[829]	validation_0-rmse:48.31034	validation_1-rmse:50.99148
[830]	validation_0-rmse:48.30602	validation_1-rmse:50.99068
[831]	validation_0-rmse:48.29064	validation_1-rmse:50.98332
[832]	validation_0-rmse:48.28530	validation_1-rmse:50.98331
[833]	validation_0-rmse:48.27278	validation_1-rmse:50.97891
[834]	validation_0-rmse:48.26976	validation_1-rmse:50.98300
[835]	validation_0-rmse:48.26383	validation_1-rmse:50.97819
[836]	validation_0-rmse:48.24573	validation_1-rmse:50.96788
[837]	validation_0-rmse:48.24068	validation_1-rmse:50.96796
[838]	validation_0-rmse:48.23771	validat

[959]	validation_0-rmse:47.46176	validation_1-rmse:50.75537
[960]	validation_0-rmse:47.46061	validation_1-rmse:50.75532
[961]	validation_0-rmse:47.45700	validation_1-rmse:50.75421
[962]	validation_0-rmse:47.44109	validation_1-rmse:50.75692
[963]	validation_0-rmse:47.43654	validation_1-rmse:50.75583
[964]	validation_0-rmse:47.43477	validation_1-rmse:50.75524
[965]	validation_0-rmse:47.43070	validation_1-rmse:50.75364
[966]	validation_0-rmse:47.42901	validation_1-rmse:50.75496
[967]	validation_0-rmse:47.42751	validation_1-rmse:50.75441
[968]	validation_0-rmse:47.42332	validation_1-rmse:50.75252
[969]	validation_0-rmse:47.42187	validation_1-rmse:50.75166
[970]	validation_0-rmse:47.41396	validation_1-rmse:50.75397
[971]	validation_0-rmse:47.40967	validation_1-rmse:50.75332
[972]	validation_0-rmse:47.40817	validation_1-rmse:50.75276
[973]	validation_0-rmse:47.40675	validation_1-rmse:50.75183
[974]	validation_0-rmse:47.39316	validation_1-rmse:50.75035
[975]	validation_0-rmse:47.39146	validat

XGBRegressor(base_score=0.2, booster='gbtree', callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.01, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=5, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=1000, n_jobs=None, num_parallel_tree=None,
             objective='reg:linear', predictor=None, ...)

In [11]:
### Score with WMAPE
def calc_wmape(actual, pred):

    abs_diff = abs(actual - pred)
    weight = (abs_diff / actual * 100) * actual
    total_weight = weight.sum()
    total_claims = actual.sum()
    
    return (total_weight/total_claims)

In [12]:
test['claim_pred'] = mod.predict(x_test)
calc_wmape(test['claim'], test['claim_pred'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['claim_pred'] = mod.predict(x_test)


52.49821260922686

### Use full model to predict iphone14 ? maybe ?

In [13]:
test14 = test.loc[test["model_14"] == 1]
x_test14 = x_test.loc[x_test["model_14"] == 1]
test14['claim_pred'] = mod.predict(x_test14)
calc_wmape(test14['claim'], test14['claim_pred'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test14['claim_pred'] = mod.predict(x_test14)


54.491486964304805

## XGBoost with just iPhone 13

In [14]:
train, test, x_train, y_train, x_test, y_test = create_split(0.75, iphone13)

# Build model
mod = xgb.XGBRegressor(base_score = 0.2, booster = 'gbtree', n_estimators = 1000,
                       objective = 'reg:linear', max_depth = 5, learning_rate = 0.01)

mod.fit(x_train, y_train, eval_set = [(x_train, y_train), (x_test, y_test)])

[0]	validation_0-rmse:103.73245	validation_1-rmse:114.72882
[1]	validation_0-rmse:103.14009	validation_1-rmse:114.10852
[2]	validation_0-rmse:102.55564	validation_1-rmse:113.49670
[3]	validation_0-rmse:101.97936	validation_1-rmse:112.89407
[4]	validation_0-rmse:101.40889	validation_1-rmse:112.29355
[5]	validation_0-rmse:100.84643	validation_1-rmse:111.70210
[6]	validation_0-rmse:100.29350	validation_1-rmse:111.12010
[7]	validation_0-rmse:99.74806	validation_1-rmse:110.54648
[8]	validation_0-rmse:99.21089	validation_1-rmse:109.97697
[9]	validation_0-rmse:98.67925	validation_1-rmse:109.41641
[10]	validation_0-rmse:98.15446	validation_1-rmse:108.86757
[11]	validation_0-rmse:97.63717	validation_1-rmse:108.31976
[12]	validation_0-rmse:97.12920	validation_1-rmse:107.78391
[13]	validation_0-rmse:96.62492	validation_1-rmse:107.24967
[14]	validation_0-rmse:96.12938	validation_1-rmse:106.73017
[15]	validation_0-rmse:95.63988	validation_1-rmse:106.21103
[16]	validation_0-rmse:95.15980	validation_

[135]	validation_0-rmse:67.10542	validation_1-rmse:80.11342
[136]	validation_0-rmse:67.02283	validation_1-rmse:80.04208
[137]	validation_0-rmse:66.91696	validation_1-rmse:79.95820
[138]	validation_0-rmse:66.81341	validation_1-rmse:79.89090
[139]	validation_0-rmse:66.74119	validation_1-rmse:79.82769
[140]	validation_0-rmse:66.64268	validation_1-rmse:79.75867
[141]	validation_0-rmse:66.54361	validation_1-rmse:79.70077
[142]	validation_0-rmse:66.46959	validation_1-rmse:79.64162
[143]	validation_0-rmse:66.37318	validation_1-rmse:79.58057
[144]	validation_0-rmse:66.27840	validation_1-rmse:79.52652
[145]	validation_0-rmse:66.20664	validation_1-rmse:79.47213
[146]	validation_0-rmse:66.11647	validation_1-rmse:79.40904
[147]	validation_0-rmse:66.04080	validation_1-rmse:79.34530
[148]	validation_0-rmse:65.95095	validation_1-rmse:79.29579
[149]	validation_0-rmse:65.88317	validation_1-rmse:79.24500
[150]	validation_0-rmse:65.80936	validation_1-rmse:79.18563
[151]	validation_0-rmse:65.73628	validat

[272]	validation_0-rmse:59.51286	validation_1-rmse:75.89305
[273]	validation_0-rmse:59.47617	validation_1-rmse:75.88046
[274]	validation_0-rmse:59.45374	validation_1-rmse:75.86932
[275]	validation_0-rmse:59.41845	validation_1-rmse:75.85470
[276]	validation_0-rmse:59.38376	validation_1-rmse:75.83773
[277]	validation_0-rmse:59.36134	validation_1-rmse:75.82660
[278]	validation_0-rmse:59.32683	validation_1-rmse:75.81240
[279]	validation_0-rmse:59.29347	validation_1-rmse:75.80260
[280]	validation_0-rmse:59.27182	validation_1-rmse:75.79201
[281]	validation_0-rmse:59.23919	validation_1-rmse:75.77942
[282]	validation_0-rmse:59.20651	validation_1-rmse:75.76525
[283]	validation_0-rmse:59.18646	validation_1-rmse:75.75697
[284]	validation_0-rmse:59.14876	validation_1-rmse:75.74617
[285]	validation_0-rmse:59.11779	validation_1-rmse:75.73808
[286]	validation_0-rmse:59.09773	validation_1-rmse:75.72852
[287]	validation_0-rmse:59.06748	validation_1-rmse:75.71837
[288]	validation_0-rmse:59.04479	validat

[409]	validation_0-rmse:57.07549	validation_1-rmse:75.33407
[410]	validation_0-rmse:57.06117	validation_1-rmse:75.34185
[411]	validation_0-rmse:57.05701	validation_1-rmse:75.33927
[412]	validation_0-rmse:57.05274	validation_1-rmse:75.33870
[413]	validation_0-rmse:57.04884	validation_1-rmse:75.33630
[414]	validation_0-rmse:57.03803	validation_1-rmse:75.33505
[415]	validation_0-rmse:57.03383	validation_1-rmse:75.33452
[416]	validation_0-rmse:57.00686	validation_1-rmse:75.33860
[417]	validation_0-rmse:56.99124	validation_1-rmse:75.33483
[418]	validation_0-rmse:56.97732	validation_1-rmse:75.34286
[419]	validation_0-rmse:56.97320	validation_1-rmse:75.34239
[420]	validation_0-rmse:56.95924	validation_1-rmse:75.35019
[421]	validation_0-rmse:56.95518	validation_1-rmse:75.34975
[422]	validation_0-rmse:56.94167	validation_1-rmse:75.35800
[423]	validation_0-rmse:56.92926	validation_1-rmse:75.36666
[424]	validation_0-rmse:56.92527	validation_1-rmse:75.36628
[425]	validation_0-rmse:56.92132	validat

[545]	validation_0-rmse:55.87654	validation_1-rmse:75.45847
[546]	validation_0-rmse:55.85672	validation_1-rmse:75.45774
[547]	validation_0-rmse:55.84775	validation_1-rmse:75.46191
[548]	validation_0-rmse:55.82499	validation_1-rmse:75.44353
[549]	validation_0-rmse:55.81615	validation_1-rmse:75.44769
[550]	validation_0-rmse:55.80372	validation_1-rmse:75.46390
[551]	validation_0-rmse:55.79366	validation_1-rmse:75.45732
[552]	validation_0-rmse:55.78638	validation_1-rmse:75.45554
[553]	validation_0-rmse:55.76441	validation_1-rmse:75.44625
[554]	validation_0-rmse:55.74682	validation_1-rmse:75.44355
[555]	validation_0-rmse:55.73692	validation_1-rmse:75.43706
[556]	validation_0-rmse:55.72164	validation_1-rmse:75.42304
[557]	validation_0-rmse:55.71436	validation_1-rmse:75.41715
[558]	validation_0-rmse:55.70589	validation_1-rmse:75.42197
[559]	validation_0-rmse:55.68807	validation_1-rmse:75.42699
[560]	validation_0-rmse:55.68063	validation_1-rmse:75.42570
[561]	validation_0-rmse:55.65887	validat

[682]	validation_0-rmse:54.52550	validation_1-rmse:75.20283
[683]	validation_0-rmse:54.51381	validation_1-rmse:75.19846
[684]	validation_0-rmse:54.50687	validation_1-rmse:75.19719
[685]	validation_0-rmse:54.50504	validation_1-rmse:75.19810
[686]	validation_0-rmse:54.49891	validation_1-rmse:75.19161
[687]	validation_0-rmse:54.49711	validation_1-rmse:75.19252
[688]	validation_0-rmse:54.48335	validation_1-rmse:75.19349
[689]	validation_0-rmse:54.47683	validation_1-rmse:75.19180
[690]	validation_0-rmse:54.47301	validation_1-rmse:75.19183
[691]	validation_0-rmse:54.44629	validation_1-rmse:75.19372
[692]	validation_0-rmse:54.44451	validation_1-rmse:75.19464
[693]	validation_0-rmse:54.43785	validation_1-rmse:75.19340
[694]	validation_0-rmse:54.42429	validation_1-rmse:75.19450
[695]	validation_0-rmse:54.41974	validation_1-rmse:75.18963
[696]	validation_0-rmse:54.39340	validation_1-rmse:75.19157
[697]	validation_0-rmse:54.39165	validation_1-rmse:75.19249
[698]	validation_0-rmse:54.38304	validat

[819]	validation_0-rmse:53.26178	validation_1-rmse:75.07741
[820]	validation_0-rmse:53.24493	validation_1-rmse:75.07828
[821]	validation_0-rmse:53.23584	validation_1-rmse:75.07726
[822]	validation_0-rmse:53.23125	validation_1-rmse:75.07201
[823]	validation_0-rmse:53.21554	validation_1-rmse:75.06672
[824]	validation_0-rmse:53.21098	validation_1-rmse:75.07082
[825]	validation_0-rmse:53.19186	validation_1-rmse:75.07100
[826]	validation_0-rmse:53.18104	validation_1-rmse:75.06493
[827]	validation_0-rmse:53.17185	validation_1-rmse:75.06658
[828]	validation_0-rmse:53.16916	validation_1-rmse:75.06665
[829]	validation_0-rmse:53.15186	validation_1-rmse:75.06140
[830]	validation_0-rmse:53.14265	validation_1-rmse:75.06733
[831]	validation_0-rmse:53.13707	validation_1-rmse:75.06723
[832]	validation_0-rmse:53.12796	validation_1-rmse:75.08242
[833]	validation_0-rmse:53.12357	validation_1-rmse:75.08227
[834]	validation_0-rmse:53.12096	validation_1-rmse:75.08235
[835]	validation_0-rmse:53.11661	validat

[955]	validation_0-rmse:52.26718	validation_1-rmse:75.19983
[956]	validation_0-rmse:52.26221	validation_1-rmse:75.19943
[957]	validation_0-rmse:52.24913	validation_1-rmse:75.21547
[958]	validation_0-rmse:52.23396	validation_1-rmse:75.21762
[959]	validation_0-rmse:52.22487	validation_1-rmse:75.21646
[960]	validation_0-rmse:52.21881	validation_1-rmse:75.21585
[961]	validation_0-rmse:52.21602	validation_1-rmse:75.21532
[962]	validation_0-rmse:52.21119	validation_1-rmse:75.21572
[963]	validation_0-rmse:52.20827	validation_1-rmse:75.21537
[964]	validation_0-rmse:52.20212	validation_1-rmse:75.21418
[965]	validation_0-rmse:52.19116	validation_1-rmse:75.21553
[966]	validation_0-rmse:52.17570	validation_1-rmse:75.21671
[967]	validation_0-rmse:52.16688	validation_1-rmse:75.22309
[968]	validation_0-rmse:52.15255	validation_1-rmse:75.22264
[969]	validation_0-rmse:52.13484	validation_1-rmse:75.22356
[970]	validation_0-rmse:52.12757	validation_1-rmse:75.22480
[971]	validation_0-rmse:52.12421	validat

XGBRegressor(base_score=0.2, booster='gbtree', callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.01, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=5, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=1000, n_jobs=None, num_parallel_tree=None,
             objective='reg:linear', predictor=None, ...)

In [15]:
test['claim_pred'] = mod.predict(x_test)
calc_wmape(test['claim'], test['claim_pred'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['claim_pred'] = mod.predict(x_test)


65.99663059001266

## Observations
- Model with all iphones was better based on WMAPE
- As a result, will move on with hyper tuning based on this model

## Notes
- Moving on with hyper parameter tuning

In [16]:
train, test, x_train, y_train, x_test, y_test = create_split(0.75, all_iphone)
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

In [17]:
params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective':'reg:squarederror',
}

params['eval_metric'] = "rmse"

num_boost_round = 999

model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

print("Best RMSE: {:.2f} with {} rounds".format(
                 model.best_score,
                 model.best_iteration+1))

[0]	Test-rmse:74.60087
[1]	Test-rmse:64.34980
[2]	Test-rmse:58.54927
[3]	Test-rmse:55.72713
[4]	Test-rmse:53.78382
[5]	Test-rmse:52.72393
[6]	Test-rmse:52.26309
[7]	Test-rmse:51.44342
[8]	Test-rmse:50.99673
[9]	Test-rmse:50.87669
[10]	Test-rmse:50.51841
[11]	Test-rmse:50.76766
[12]	Test-rmse:50.72743
[13]	Test-rmse:50.68014
[14]	Test-rmse:50.82759
[15]	Test-rmse:50.82093
[16]	Test-rmse:50.61865
[17]	Test-rmse:50.60377
[18]	Test-rmse:50.59922
[19]	Test-rmse:50.52322
Best RMSE: 50.52 with 11 rounds


In [18]:
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'rmse'},
    early_stopping_rounds=10
)

cv_results

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,80.893552,0.520073,81.213687,1.391057
1,68.37976,0.616291,68.911397,2.135143
2,60.634978,0.49279,61.467374,2.630432
3,56.040266,0.600223,57.12798,2.866665
4,53.207125,0.66834,54.589829,3.0073
5,51.473236,0.556826,53.034324,3.009354
6,50.477476,0.646944,52.247558,2.963308
7,49.684505,0.78109,51.645993,2.813811
8,49.19228,0.694722,51.289537,2.875007
9,48.685534,0.603051,50.953572,2.826304


In [19]:
cv_results['test-rmse-mean'].min()

49.785809323522116

In [20]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(9,12)
    for min_child_weight in range(5,8)
]

# Define initial best params and RMSE
min_rmse = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'rmse'},
        early_stopping_rounds=10
    )
    # Update best RMSE
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, RMSE: {}".format(best_params[0], best_params[1], min_rmse))

CV with max_depth=9, min_child_weight=5
	RMSE 49.181465910033005 for 8 rounds
CV with max_depth=9, min_child_weight=6
	RMSE 49.05221227627993 for 11 rounds
CV with max_depth=9, min_child_weight=7
	RMSE 48.97002381261633 for 9 rounds
CV with max_depth=10, min_child_weight=5
	RMSE 49.628229185310865 for 9 rounds
CV with max_depth=10, min_child_weight=6
	RMSE 49.31452197850703 for 9 rounds
CV with max_depth=10, min_child_weight=7
	RMSE 49.518099860555694 for 8 rounds
CV with max_depth=11, min_child_weight=5
	RMSE 49.5815338815481 for 7 rounds
CV with max_depth=11, min_child_weight=6
	RMSE 49.68006925889691 for 8 rounds
CV with max_depth=11, min_child_weight=7
	RMSE 49.65680906695663 for 7 rounds
Best params: 9, 7, RMSE: 48.97002381261633


In [21]:
params['max_depth'] = 9
params['min_child_weight'] = 7

In [22]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

min_rmse = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'rmse'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (subsample,colsample)
print("Best params: {}, {}, RMSE: {}".format(best_params[0], best_params[1], min_rmse))

CV with subsample=1.0, colsample=1.0
	RMSE 48.97002381261633 for 9 rounds
CV with subsample=1.0, colsample=0.9
	RMSE 49.081866054402454 for 11 rounds
CV with subsample=1.0, colsample=0.8
	RMSE 49.06105142223855 for 12 rounds
CV with subsample=1.0, colsample=0.7
	RMSE 49.24499985425458 for 10 rounds
CV with subsample=0.9, colsample=1.0
	RMSE 49.218781474785544 for 10 rounds
CV with subsample=0.9, colsample=0.9
	RMSE 49.269064418444586 for 9 rounds
CV with subsample=0.9, colsample=0.8
	RMSE 49.50949172691372 for 9 rounds
CV with subsample=0.9, colsample=0.7
	RMSE 49.73685868182289 for 10 rounds
CV with subsample=0.8, colsample=1.0
	RMSE 49.26510017911288 for 11 rounds
CV with subsample=0.8, colsample=0.9
	RMSE 49.43524235055243 for 10 rounds
CV with subsample=0.8, colsample=0.8
	RMSE 49.473597533682 for 10 rounds
CV with subsample=0.8, colsample=0.7
	RMSE 50.210048235429326 for 10 rounds
CV with subsample=0.7, colsample=1.0
	RMSE 49.496771240191705 for 10 rounds
CV with subsample=0.7, co

In [23]:
params['subsample'] = 1.0
params['colsample_bytree'] = 1.0

In [24]:
%time
# This can take some time…
min_rmse = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    %time cv_results = xgb.cv(params,dtrain,num_boost_round=num_boost_round,seed=42,nfold=5,metrics=['rmse'],early_stopping_rounds=10)
    # Update best score
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds\n".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = eta
print("Best params: {}, RMSE: {}".format(best_params, min_rmse))

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.96 µs
CV with eta=0.3
CPU times: user 3.56 s, sys: 50.9 ms, total: 3.61 s
Wall time: 462 ms
	RMSE 48.97002381261633 for 9 rounds

CV with eta=0.2
CPU times: user 4.47 s, sys: 49.8 ms, total: 4.52 s
Wall time: 566 ms
	RMSE 49.09801618947339 for 15 rounds

CV with eta=0.1
CPU times: user 8.46 s, sys: 124 ms, total: 8.59 s
Wall time: 1.08 s
	RMSE 48.85331278943821 for 38 rounds

CV with eta=0.05
CPU times: user 16.8 s, sys: 305 ms, total: 17.1 s
Wall time: 2.14 s
	RMSE 48.91823831710107 for 76 rounds

CV with eta=0.01
CPU times: user 1min 2s, sys: 768 ms, total: 1min 3s
Wall time: 7.9 s
	RMSE 48.93035304580089 for 351 rounds

CV with eta=0.005
CPU times: user 2min, sys: 1.5 s, total: 2min 2s
Wall time: 15.3 s
	RMSE 48.94370227341955 for 691 rounds

Best params: 0.1, RMSE: 48.85331278943821


In [25]:
params['eta'] = .1

In [26]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

print("Best RMSE: {:.2f} in {} rounds".format(model.best_score, model.best_iteration+1))

[0]	Test-rmse:85.86691
[1]	Test-rmse:80.51807
[2]	Test-rmse:75.89559
[3]	Test-rmse:71.97975
[4]	Test-rmse:68.45937
[5]	Test-rmse:65.47770
[6]	Test-rmse:62.79031
[7]	Test-rmse:60.65630
[8]	Test-rmse:58.93232
[9]	Test-rmse:57.22896
[10]	Test-rmse:55.85367
[11]	Test-rmse:54.65135
[12]	Test-rmse:53.62252
[13]	Test-rmse:52.77676
[14]	Test-rmse:52.15742
[15]	Test-rmse:51.51059
[16]	Test-rmse:51.08552
[17]	Test-rmse:50.73728
[18]	Test-rmse:50.46238
[19]	Test-rmse:50.17036
[20]	Test-rmse:49.89832
[21]	Test-rmse:49.71416
[22]	Test-rmse:49.55358
[23]	Test-rmse:49.32667
[24]	Test-rmse:49.15692
[25]	Test-rmse:49.03658
[26]	Test-rmse:48.83941
[27]	Test-rmse:48.70104
[28]	Test-rmse:48.62372
[29]	Test-rmse:48.51502
[30]	Test-rmse:48.46282
[31]	Test-rmse:48.44416
[32]	Test-rmse:48.38748
[33]	Test-rmse:48.41936
[34]	Test-rmse:48.41458
[35]	Test-rmse:48.38797
[36]	Test-rmse:48.37783
[37]	Test-rmse:48.39609
[38]	Test-rmse:48.38856
[39]	Test-rmse:48.43594
[40]	Test-rmse:48.40735
[41]	Test-rmse:48.49499
[4

In [27]:
test['claim_pred'] = model.predict(dtest)
calc_wmape(test['claim'], test['claim_pred'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['claim_pred'] = model.predict(dtest)


47.950827073231046