In [1]:
import pandas as pd
import numpy as np
import math
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from scipy import stats
from sklearn.metrics import explained_variance_score
import pickle

In [2]:
df_initial = pd.read_csv('7_dwarfs_train.csv')
print(df_initial)

             date         datetime  SPOSTMIN  SACTMIN DAYOFWEEK
0        6/4/2013    6/4/2013 9:00      30.0      NaN   Tuesday
1        6/4/2013    6/4/2013 9:30      30.0      NaN   Tuesday
2        6/4/2013   6/4/2013 10:00      60.0      NaN   Tuesday
3        6/4/2013   6/4/2013 10:30      60.0      NaN   Tuesday
4        6/4/2013   6/4/2013 11:00      60.0      NaN   Tuesday
5        6/4/2013   6/4/2013 11:30      90.0      NaN   Tuesday
6        6/4/2013   6/4/2013 12:00      90.0      NaN   Tuesday
7        6/4/2013   6/4/2013 13:00     120.0      NaN   Tuesday
8        6/4/2013   6/4/2013 13:30     120.0      NaN   Tuesday
9        6/4/2013   6/4/2013 14:00      90.0      NaN   Tuesday
10       6/4/2013   6/4/2013 15:00      90.0      NaN   Tuesday
11       6/4/2013   6/4/2013 15:30      90.0      NaN   Tuesday
12       6/4/2013   6/4/2013 16:00      90.0      NaN   Tuesday
13       6/4/2013   6/4/2013 16:00      90.0      NaN   Tuesday
14       6/4/2013   6/4/2013 16:30     1

In [3]:
print(df_initial.loc[0])

date              6/4/2013
datetime     6/4/2013 9:00
SPOSTMIN                30
SACTMIN                NaN
DAYOFWEEK          Tuesday
Name: 0, dtype: object


In [4]:
print(df_initial.loc[0]['SACTMIN'])

nan


In [5]:
print(type(df_initial.loc[0]['datetime']))

<class 'str'>


In [6]:
for index, row in df_initial.iterrows():
    if math.isnan(row['SPOSTMIN']):
        df_initial.loc[index,'SPOSTMIN'] = df_initial.loc[index, 'SACTMIN']
#     print(row['SPOSTMIN'], row['SACTMIN'])
df_initial = df_initial.drop(columns="SACTMIN")

In [7]:
df_initial = df_initial[df_initial['SPOSTMIN'] != -999]

In [8]:
print(df_initial)

             date         datetime  SPOSTMIN DAYOFWEEK
0        6/4/2013    6/4/2013 9:00      30.0   Tuesday
1        6/4/2013    6/4/2013 9:30      30.0   Tuesday
2        6/4/2013   6/4/2013 10:00      60.0   Tuesday
3        6/4/2013   6/4/2013 10:30      60.0   Tuesday
4        6/4/2013   6/4/2013 11:00      60.0   Tuesday
5        6/4/2013   6/4/2013 11:30      90.0   Tuesday
6        6/4/2013   6/4/2013 12:00      90.0   Tuesday
7        6/4/2013   6/4/2013 13:00     120.0   Tuesday
8        6/4/2013   6/4/2013 13:30     120.0   Tuesday
9        6/4/2013   6/4/2013 14:00      90.0   Tuesday
10       6/4/2013   6/4/2013 15:00      90.0   Tuesday
11       6/4/2013   6/4/2013 15:30      90.0   Tuesday
12       6/4/2013   6/4/2013 16:00      90.0   Tuesday
13       6/4/2013   6/4/2013 16:00      90.0   Tuesday
14       6/4/2013   6/4/2013 16:30     120.0   Tuesday
15       6/4/2013   6/4/2013 17:00      90.0   Tuesday
16       6/4/2013   6/4/2013 17:30     100.0   Tuesday
17       6

In [9]:
df_initial['Month'] = df_initial.date.str.split('/').str[0]
df_initial['Day'] = df_initial.date.str.split('/').str[1]
df_initial['Year'] = df_initial.date.str.rsplit('/', 1).str[1]
df_initial['Time_char'] = df_initial.datetime.str.split(' ').str[1]
df_initial['Hour'] = df_initial['Time_char'].str.split(':').str[0]
df_initial['Minute'] = df_initial['Time_char'].str.split(':').str[1]
df_initial = df_initial.drop(columns="Time_char")

In [10]:
print(df_initial)

             date         datetime  SPOSTMIN DAYOFWEEK Month Day  Year Hour  \
0        6/4/2013    6/4/2013 9:00      30.0   Tuesday     6   4  2013    9   
1        6/4/2013    6/4/2013 9:30      30.0   Tuesday     6   4  2013    9   
2        6/4/2013   6/4/2013 10:00      60.0   Tuesday     6   4  2013   10   
3        6/4/2013   6/4/2013 10:30      60.0   Tuesday     6   4  2013   10   
4        6/4/2013   6/4/2013 11:00      60.0   Tuesday     6   4  2013   11   
5        6/4/2013   6/4/2013 11:30      90.0   Tuesday     6   4  2013   11   
6        6/4/2013   6/4/2013 12:00      90.0   Tuesday     6   4  2013   12   
7        6/4/2013   6/4/2013 13:00     120.0   Tuesday     6   4  2013   13   
8        6/4/2013   6/4/2013 13:30     120.0   Tuesday     6   4  2013   13   
9        6/4/2013   6/4/2013 14:00      90.0   Tuesday     6   4  2013   14   
10       6/4/2013   6/4/2013 15:00      90.0   Tuesday     6   4  2013   15   
11       6/4/2013   6/4/2013 15:30      90.0   Tuesd

In [11]:
df_initial['Month'] = (df_initial['Month']).astype(int)
df_initial['Day'] = (df_initial['Day']).astype(int)
df_initial['Year'] = (df_initial['Year']).astype(int)
df_initial['Hour'] = (df_initial['Hour']).astype(int)
df_initial['Minute'] = (df_initial['Minute']).astype(int)

In [12]:
print(df_initial)

             date         datetime  SPOSTMIN DAYOFWEEK  Month  Day  Year  \
0        6/4/2013    6/4/2013 9:00      30.0   Tuesday      6    4  2013   
1        6/4/2013    6/4/2013 9:30      30.0   Tuesday      6    4  2013   
2        6/4/2013   6/4/2013 10:00      60.0   Tuesday      6    4  2013   
3        6/4/2013   6/4/2013 10:30      60.0   Tuesday      6    4  2013   
4        6/4/2013   6/4/2013 11:00      60.0   Tuesday      6    4  2013   
5        6/4/2013   6/4/2013 11:30      90.0   Tuesday      6    4  2013   
6        6/4/2013   6/4/2013 12:00      90.0   Tuesday      6    4  2013   
7        6/4/2013   6/4/2013 13:00     120.0   Tuesday      6    4  2013   
8        6/4/2013   6/4/2013 13:30     120.0   Tuesday      6    4  2013   
9        6/4/2013   6/4/2013 14:00      90.0   Tuesday      6    4  2013   
10       6/4/2013   6/4/2013 15:00      90.0   Tuesday      6    4  2013   
11       6/4/2013   6/4/2013 15:30      90.0   Tuesday      6    4  2013   
12       6/4

In [13]:
print(type(df_initial.loc[0]['SPOSTMIN']))

<class 'numpy.float64'>


In [14]:
df_y = df_initial['SPOSTMIN']

In [15]:
label_encoder_DOW = LabelEncoder()
DoW_feature = label_encoder_DOW.fit_transform(df_initial.DAYOFWEEK.iloc[:].values)

In [16]:
# new_col = pd.Series(DoW_feature)
df_initial['DayOfWeek'] = DoW_feature
df_initial = df_initial.drop(columns=["DAYOFWEEK", "date", "datetime", "SPOSTMIN"])

In [17]:
print(df_initial)

        Month  Day  Year  Hour  Minute  DayOfWeek
0           6    4  2013     9       0          5
1           6    4  2013     9      30          5
2           6    4  2013    10       0          5
3           6    4  2013    10      30          5
4           6    4  2013    11       0          5
5           6    4  2013    11      30          5
6           6    4  2013    12       0          5
7           6    4  2013    13       0          5
8           6    4  2013    13      30          5
9           6    4  2013    14       0          5
10          6    4  2013    15       0          5
11          6    4  2013    15      30          5
12          6    4  2013    16       0          5
13          6    4  2013    16       0          5
14          6    4  2013    16      30          5
15          6    4  2013    17       0          5
16          6    4  2013    17      30          5
17          6    4  2013    18       0          5
18          6    4  2013    18      30          5


In [18]:
print(max(df_initial.loc[:,'Month']))

12


In [19]:
random_seed = 5
t_s = .20
X_train, X_test, y_train, y_test = train_test_split(df_initial, df_y, test_size = t_s, random_state = random_seed)

In [20]:
param_grid = {'n_estimators': [100, 200, 300], #random int btwn 100 and 500 - removed
              'learning_rate': stats.uniform(0.01, 0.08), #.01 + loc, range of .01+/-.08
              'max_depth': [2, 4, 6, 8], #tree depths to check
              'colsample_bytree': stats.uniform(0.3, 0.7) #btwn .1 and 1.0    
}
kfold = KFold(n_splits=3, shuffle=True, random_state=random_seed)
model = XGBRegressor(tree_method='gpu_hist')

In [21]:
rand_search = RandomizedSearchCV(model, param_distributions = param_grid, scoring = 'explained_variance', n_iter = 3, verbose = 10, cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" % (rand_result.best_score_, rand_result.best_params_))
best_XGB_estimator = rand_result.best_estimator_
pickle.dump(best_XGB_estimator, open("xgb_dwarves.pkl", 'wb'))

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] colsample_bytree=0.87039834402735, learning_rate=0.036085559562499425, max_depth=8, n_estimators=300 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  colsample_bytree=0.87039834402735, learning_rate=0.036085559562499425, max_depth=8, n_estimators=300, score=0.006386082410054228, total=  16.4s
[CV] colsample_bytree=0.87039834402735, learning_rate=0.036085559562499425, max_depth=8, n_estimators=300 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   16.5s remaining:    0.0s


[CV]  colsample_bytree=0.87039834402735, learning_rate=0.036085559562499425, max_depth=8, n_estimators=300, score=-47.266328286227505, total=  15.6s
[CV] colsample_bytree=0.87039834402735, learning_rate=0.036085559562499425, max_depth=8, n_estimators=300 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   32.2s remaining:    0.0s


[CV]  colsample_bytree=0.87039834402735, learning_rate=0.036085559562499425, max_depth=8, n_estimators=300, score=-4.45785211494758, total=  14.8s
[CV] colsample_bytree=0.8027843486747859, learning_rate=0.05947744568872531, max_depth=8, n_estimators=100 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   47.1s remaining:    0.0s


[CV]  colsample_bytree=0.8027843486747859, learning_rate=0.05947744568872531, max_depth=8, n_estimators=100, score=0.005993892360712572, total=   5.0s
[CV] colsample_bytree=0.8027843486747859, learning_rate=0.05947744568872531, max_depth=8, n_estimators=100 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   52.3s remaining:    0.0s


[CV]  colsample_bytree=0.8027843486747859, learning_rate=0.05947744568872531, max_depth=8, n_estimators=100, score=-27.89695723684013, total=   5.0s
[CV] colsample_bytree=0.8027843486747859, learning_rate=0.05947744568872531, max_depth=8, n_estimators=100 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   57.4s remaining:    0.0s


[CV]  colsample_bytree=0.8027843486747859, learning_rate=0.05947744568872531, max_depth=8, n_estimators=100, score=-0.965691763953161, total=   5.0s
[CV] colsample_bytree=0.9596276141282687, learning_rate=0.08580040161762502, max_depth=8, n_estimators=100 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.0min remaining:    0.0s


[CV]  colsample_bytree=0.9596276141282687, learning_rate=0.08580040161762502, max_depth=8, n_estimators=100, score=0.006261661076658043, total=   4.9s
[CV] colsample_bytree=0.9596276141282687, learning_rate=0.08580040161762502, max_depth=8, n_estimators=100 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  1.1min remaining:    0.0s


[CV]  colsample_bytree=0.9596276141282687, learning_rate=0.08580040161762502, max_depth=8, n_estimators=100, score=-38.45960926387023, total=   4.9s
[CV] colsample_bytree=0.9596276141282687, learning_rate=0.08580040161762502, max_depth=8, n_estimators=100 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  1.2min remaining:    0.0s


[CV]  colsample_bytree=0.9596276141282687, learning_rate=0.08580040161762502, max_depth=8, n_estimators=100, score=-1.470753911618281, total=   4.8s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  1.3min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  1.3min finished


Best: -9.618832 using {'colsample_bytree': 0.8027843486747859, 'learning_rate': 0.05947744568872531, 'max_depth': 8, 'n_estimators': 100}


In [22]:
xgb = XGBRegressor(n_estimators=300, learning_rate=0.08, gamma=0, subsample=0.75, colsample_bytree=1, max_depth=7)
xgb.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.08, max_delta_step=0,
       max_depth=7, min_child_weight=1, missing=None, n_estimators=300,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.75)

In [23]:
predictions = xgb.predict(X_test)
print(explained_variance_score(predictions,y_test))

0.252192115333184


In [24]:
# pickle.dump(xgb, open('xgb_dwarves_nonsearch.pkl','wb'))

In [25]:
print(type(X_test))

<class 'pandas.core.frame.DataFrame'>


In [26]:
print(type(X_test.iloc[0]))

<class 'pandas.core.series.Series'>


In [27]:
print(X_test)

        Month  Day  Year  Hour  Minute  DayOfWeek
66977       8   12  2015    18      21          6
68450       8   25  2015     9      49          5
21056      11    4  2014     9      32          5
187242      3   14  2018    17      21          6
155766      8    6  2017    19      15          3
240577      3    9  2019    17      49          2
187936      3   19  2018    10      14          1
175135     12   17  2017     9      35          3
212631      9    5  2018    10      49          6
196929      5   24  2018    19       0          4
195993      5   17  2018    21      56          4
82778      12    8  2015    20       7          5
50334       5    6  2015     9      30          6
79642      11   16  2015    16      56          1
38044       2   21  2015    13      45          2
233616      1   18  2019    15      56          0
176558     12   25  2017    21      49          1
137223      3   10  2017    11      56          0
168562     11    4  2017    15      58          2


In [28]:
model = pickle.load(open('xgb_dwarves.pkl','rb'))
month = 6
day = 4
year = 2013
hour = 9
minute = 0
dayofweek = 5
data = [[month, day, year, hour, minute, dayofweek]]
print(data)

[[6, 4, 2013, 9, 0, 5]]


In [29]:
input_df = pd.DataFrame(data, columns =['Month','Day','Year','Hour','Minute','DayOfWeek'])
print(input_df)

   Month  Day  Year  Hour  Minute  DayOfWeek
0      6    4  2013     9       0          5


In [31]:
prediction = model.predict(input_df)

In [32]:
print(prediction)

[41.452232]


In [33]:
df_initial = pd.read_csv('alien_saucers.csv')
for index, row in df_initial.iterrows():
    if math.isnan(row['SPOSTMIN']):
        df_initial.loc[index,'SPOSTMIN'] = df_initial.loc[index, 'SACTMIN']
#     print(row['SPOSTMIN'], row['SACTMIN'])
df_initial = df_initial.drop(columns="SACTMIN")
df_initial = df_initial[df_initial['SPOSTMIN'] != -999]
df_initial['Month'] = df_initial.date.str.split('/').str[0]
df_initial['Day'] = df_initial.date.str.split('/').str[1]
df_initial['Year'] = df_initial.date.str.rsplit('/', 1).str[1]
df_initial['Time_char'] = df_initial.datetime.str.split(' ').str[1]
df_initial['Hour'] = df_initial['Time_char'].str.split(':').str[0]
df_initial['Minute'] = df_initial['Time_char'].str.split(':').str[1]
df_initial = df_initial.drop(columns="Time_char")
df_initial['Month'] = (df_initial['Month']).astype(int)
df_initial['Day'] = (df_initial['Day']).astype(int)
df_initial['Year'] = (df_initial['Year']).astype(int)
df_initial['Hour'] = (df_initial['Hour']).astype(int)
df_initial['Minute'] = (df_initial['Minute']).astype(int)
df_y = df_initial['SPOSTMIN']
label_encoder_DOW = LabelEncoder()
DoW_feature = label_encoder_DOW.fit_transform(df_initial.DAYOFWEEK.iloc[:].values)
# new_col = pd.Series(DoW_feature)
df_initial['DayOfWeek'] = DoW_feature
df_initial = df_initial.drop(columns=["DAYOFWEEK", "date", "datetime", "SPOSTMIN"])
random_seed = 5
t_s = .20
X_train, X_test, y_train, y_test = train_test_split(df_initial, df_y, test_size = t_s, random_state = random_seed)
param_grid = {'n_estimators': [100, 200, 300], #random int btwn 100 and 500 - removed
              'learning_rate': stats.uniform(0.01, 0.08), #.01 + loc, range of .01+/-.08
              'max_depth': [2, 4, 6, 8], #tree depths to check
              'colsample_bytree': stats.uniform(0.3, 0.7) #btwn .1 and 1.0    
}
kfold = KFold(n_splits=3, shuffle=True, random_state=random_seed)
model = XGBRegressor(tree_method='gpu_hist')
rand_search = RandomizedSearchCV(model, param_distributions = param_grid, scoring = 'explained_variance', n_iter = 3, verbose = 10, cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" % (rand_result.best_score_, rand_result.best_params_))
best_XGB_estimator = rand_result.best_estimator_
pickle.dump(best_XGB_estimator, open("xgb_alien_saucers.pkl", 'wb'))

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] colsample_bytree=0.577976915017408, learning_rate=0.06002926970353843, max_depth=2, n_estimators=200 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  colsample_bytree=0.577976915017408, learning_rate=0.06002926970353843, max_depth=2, n_estimators=200, score=0.4447262116945443, total=   0.2s
[CV] colsample_bytree=0.577976915017408, learning_rate=0.06002926970353843, max_depth=2, n_estimators=200 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV]  colsample_bytree=0.577976915017408, learning_rate=0.06002926970353843, max_depth=2, n_estimators=200, score=0.4567281211554609, total=   0.2s
[CV] colsample_bytree=0.577976915017408, learning_rate=0.06002926970353843, max_depth=2, n_estimators=200 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.6s remaining:    0.0s


[CV]  colsample_bytree=0.577976915017408, learning_rate=0.06002926970353843, max_depth=2, n_estimators=200, score=0.4220322500180581, total=   0.2s
[CV] colsample_bytree=0.5164940677309758, learning_rate=0.0423351957354838, max_depth=6, n_estimators=200 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.9s remaining:    0.0s


[CV]  colsample_bytree=0.5164940677309758, learning_rate=0.0423351957354838, max_depth=6, n_estimators=200, score=0.6860681757844085, total=   2.1s
[CV] colsample_bytree=0.5164940677309758, learning_rate=0.0423351957354838, max_depth=6, n_estimators=200 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    3.1s remaining:    0.0s


[CV]  colsample_bytree=0.5164940677309758, learning_rate=0.0423351957354838, max_depth=6, n_estimators=200, score=0.6844072313321747, total=   2.1s
[CV] colsample_bytree=0.5164940677309758, learning_rate=0.0423351957354838, max_depth=6, n_estimators=200 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    5.3s remaining:    0.0s


[CV]  colsample_bytree=0.5164940677309758, learning_rate=0.0423351957354838, max_depth=6, n_estimators=200, score=0.6927774349548144, total=   2.1s
[CV] colsample_bytree=0.8126016842908925, learning_rate=0.011670201080323723, max_depth=2, n_estimators=200 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    7.5s remaining:    0.0s


[CV]  colsample_bytree=0.8126016842908925, learning_rate=0.011670201080323723, max_depth=2, n_estimators=200, score=0.30511549560772655, total=   0.2s
[CV] colsample_bytree=0.8126016842908925, learning_rate=0.011670201080323723, max_depth=2, n_estimators=200 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    7.9s remaining:    0.0s


[CV]  colsample_bytree=0.8126016842908925, learning_rate=0.011670201080323723, max_depth=2, n_estimators=200, score=0.31701837859390325, total=   0.2s
[CV] colsample_bytree=0.8126016842908925, learning_rate=0.011670201080323723, max_depth=2, n_estimators=200 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    8.2s remaining:    0.0s


[CV]  colsample_bytree=0.8126016842908925, learning_rate=0.011670201080323723, max_depth=2, n_estimators=200, score=0.29188853694420935, total=   0.2s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    8.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    8.5s finished


Best: 0.687751 using {'colsample_bytree': 0.5164940677309758, 'learning_rate': 0.0423351957354838, 'max_depth': 6, 'n_estimators': 200}


In [34]:
df_initial = pd.read_csv('dinosaur.csv')
for index, row in df_initial.iterrows():
    if math.isnan(row['SPOSTMIN']):
        df_initial.loc[index,'SPOSTMIN'] = df_initial.loc[index, 'SACTMIN']
#     print(row['SPOSTMIN'], row['SACTMIN'])
df_initial = df_initial.drop(columns="SACTMIN")
df_initial = df_initial[df_initial['SPOSTMIN'] != -999]
df_initial['Month'] = df_initial.date.str.split('/').str[0]
df_initial['Day'] = df_initial.date.str.split('/').str[1]
df_initial['Year'] = df_initial.date.str.rsplit('/', 1).str[1]
df_initial['Time_char'] = df_initial.datetime.str.split(' ').str[1]
df_initial['Hour'] = df_initial['Time_char'].str.split(':').str[0]
df_initial['Minute'] = df_initial['Time_char'].str.split(':').str[1]
df_initial = df_initial.drop(columns="Time_char")
df_initial['Month'] = (df_initial['Month']).astype(int)
df_initial['Day'] = (df_initial['Day']).astype(int)
df_initial['Year'] = (df_initial['Year']).astype(int)
df_initial['Hour'] = (df_initial['Hour']).astype(int)
df_initial['Minute'] = (df_initial['Minute']).astype(int)
df_y = df_initial['SPOSTMIN']
label_encoder_DOW = LabelEncoder()
DoW_feature = label_encoder_DOW.fit_transform(df_initial.DAYOFWEEK.iloc[:].values)
# new_col = pd.Series(DoW_feature)
df_initial['DayOfWeek'] = DoW_feature
df_initial = df_initial.drop(columns=["DAYOFWEEK", "date", "datetime", "SPOSTMIN"])
random_seed = 5
t_s = .20
X_train, X_test, y_train, y_test = train_test_split(df_initial, df_y, test_size = t_s, random_state = random_seed)
param_grid = {'n_estimators': [100, 200, 300], #random int btwn 100 and 500 - removed
              'learning_rate': stats.uniform(0.01, 0.08), #.01 + loc, range of .01+/-.08
              'max_depth': [2, 4, 6, 8], #tree depths to check
              'colsample_bytree': stats.uniform(0.3, 0.7) #btwn .1 and 1.0    
}
kfold = KFold(n_splits=3, shuffle=True, random_state=random_seed)
model = XGBRegressor(tree_method='gpu_hist')
rand_search = RandomizedSearchCV(model, param_distributions = param_grid, scoring = 'explained_variance', n_iter = 3, verbose = 10, cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" % (rand_result.best_score_, rand_result.best_params_))
best_XGB_estimator = rand_result.best_estimator_
pickle.dump(best_XGB_estimator, open("xgb_dinosaur.pkl", 'wb'))

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] colsample_bytree=0.3788152731681116, learning_rate=0.03763450269760868, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  colsample_bytree=0.3788152731681116, learning_rate=0.03763450269760868, max_depth=4, n_estimators=100, score=0.3305039087984535, total=   0.5s
[CV] colsample_bytree=0.3788152731681116, learning_rate=0.03763450269760868, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV]  colsample_bytree=0.3788152731681116, learning_rate=0.03763450269760868, max_depth=4, n_estimators=100, score=0.32828419390423036, total=   0.5s
[CV] colsample_bytree=0.3788152731681116, learning_rate=0.03763450269760868, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.2s remaining:    0.0s


[CV]  colsample_bytree=0.3788152731681116, learning_rate=0.03763450269760868, max_depth=4, n_estimators=100, score=0.33014473016703194, total=   0.5s
[CV] colsample_bytree=0.32837068043255735, learning_rate=0.051036144348603554, max_depth=8, n_estimators=200 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.8s remaining:    0.0s


[CV]  colsample_bytree=0.32837068043255735, learning_rate=0.051036144348603554, max_depth=8, n_estimators=200, score=0.3401325824749507, total=   1.4s
[CV] colsample_bytree=0.32837068043255735, learning_rate=0.051036144348603554, max_depth=8, n_estimators=200 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    3.3s remaining:    0.0s


[CV]  colsample_bytree=0.32837068043255735, learning_rate=0.051036144348603554, max_depth=8, n_estimators=200, score=0.33824532697476795, total=   1.3s
[CV] colsample_bytree=0.32837068043255735, learning_rate=0.051036144348603554, max_depth=8, n_estimators=200 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    4.7s remaining:    0.0s


[CV]  colsample_bytree=0.32837068043255735, learning_rate=0.051036144348603554, max_depth=8, n_estimators=200, score=0.3422653129298958, total=   1.4s
[CV] colsample_bytree=0.6859201280124755, learning_rate=0.04250813727131032, max_depth=2, n_estimators=200 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    6.2s remaining:    0.0s


[CV]  colsample_bytree=0.6859201280124755, learning_rate=0.04250813727131032, max_depth=2, n_estimators=200, score=0.3756878805030033, total=   0.5s
[CV] colsample_bytree=0.6859201280124755, learning_rate=0.04250813727131032, max_depth=2, n_estimators=200 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    6.8s remaining:    0.0s


[CV]  colsample_bytree=0.6859201280124755, learning_rate=0.04250813727131032, max_depth=2, n_estimators=200, score=0.37636754996530175, total=   0.5s
[CV] colsample_bytree=0.6859201280124755, learning_rate=0.04250813727131032, max_depth=2, n_estimators=200 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    7.4s remaining:    0.0s


[CV]  colsample_bytree=0.6859201280124755, learning_rate=0.04250813727131032, max_depth=2, n_estimators=200, score=0.37522729181482106, total=   0.5s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    8.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    8.0s finished


Best: 0.375761 using {'colsample_bytree': 0.6859201280124755, 'learning_rate': 0.04250813727131032, 'max_depth': 2, 'n_estimators': 200}


In [35]:
df_initial = pd.read_csv('expedition_everest.csv')
for index, row in df_initial.iterrows():
    if math.isnan(row['SPOSTMIN']):
        df_initial.loc[index,'SPOSTMIN'] = df_initial.loc[index, 'SACTMIN']
#     print(row['SPOSTMIN'], row['SACTMIN'])
df_initial = df_initial.drop(columns="SACTMIN")
df_initial = df_initial[df_initial['SPOSTMIN'] != -999]
df_initial['Month'] = df_initial.date.str.split('/').str[0]
df_initial['Day'] = df_initial.date.str.split('/').str[1]
df_initial['Year'] = df_initial.date.str.rsplit('/', 1).str[1]
df_initial['Time_char'] = df_initial.datetime.str.split(' ').str[1]
df_initial['Hour'] = df_initial['Time_char'].str.split(':').str[0]
df_initial['Minute'] = df_initial['Time_char'].str.split(':').str[1]
df_initial = df_initial.drop(columns="Time_char")
df_initial['Month'] = (df_initial['Month']).astype(int)
df_initial['Day'] = (df_initial['Day']).astype(int)
df_initial['Year'] = (df_initial['Year']).astype(int)
df_initial['Hour'] = (df_initial['Hour']).astype(int)
df_initial['Minute'] = (df_initial['Minute']).astype(int)
df_y = df_initial['SPOSTMIN']
label_encoder_DOW = LabelEncoder()
DoW_feature = label_encoder_DOW.fit_transform(df_initial.DAYOFWEEK.iloc[:].values)
# new_col = pd.Series(DoW_feature)
df_initial['DayOfWeek'] = DoW_feature
df_initial = df_initial.drop(columns=["DAYOFWEEK", "date", "datetime", "SPOSTMIN"])
random_seed = 5
t_s = .20
X_train, X_test, y_train, y_test = train_test_split(df_initial, df_y, test_size = t_s, random_state = random_seed)
param_grid = {'n_estimators': [100, 200, 300], #random int btwn 100 and 500 - removed
              'learning_rate': stats.uniform(0.01, 0.08), #.01 + loc, range of .01+/-.08
              'max_depth': [2, 4, 6, 8], #tree depths to check
              'colsample_bytree': stats.uniform(0.3, 0.7) #btwn .1 and 1.0    
}
kfold = KFold(n_splits=3, shuffle=True, random_state=random_seed)
model = XGBRegressor(tree_method='gpu_hist')
rand_search = RandomizedSearchCV(model, param_distributions = param_grid, scoring = 'explained_variance', n_iter = 3, verbose = 10, cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" % (rand_result.best_score_, rand_result.best_params_))
best_XGB_estimator = rand_result.best_estimator_
pickle.dump(best_XGB_estimator, open("xgb_expedition_everest.pkl", 'wb'))

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] colsample_bytree=0.5520118345812709, learning_rate=0.04062364873027281, max_depth=2, n_estimators=300 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  colsample_bytree=0.5520118345812709, learning_rate=0.04062364873027281, max_depth=2, n_estimators=300, score=0.4722528198216195, total=   0.8s
[CV] colsample_bytree=0.5520118345812709, learning_rate=0.04062364873027281, max_depth=2, n_estimators=300 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s


[CV]  colsample_bytree=0.5520118345812709, learning_rate=0.04062364873027281, max_depth=2, n_estimators=300, score=0.47288870918773196, total=   0.8s
[CV] colsample_bytree=0.5520118345812709, learning_rate=0.04062364873027281, max_depth=2, n_estimators=300 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.7s remaining:    0.0s


[CV]  colsample_bytree=0.5520118345812709, learning_rate=0.04062364873027281, max_depth=2, n_estimators=300, score=0.468079306258227, total=   0.8s
[CV] colsample_bytree=0.385248419026763, learning_rate=0.0117244229397226, max_depth=6, n_estimators=100 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    2.6s remaining:    0.0s


[CV]  colsample_bytree=0.385248419026763, learning_rate=0.0117244229397226, max_depth=6, n_estimators=100, score=0.24573216204728143, total=   1.3s
[CV] colsample_bytree=0.385248419026763, learning_rate=0.0117244229397226, max_depth=6, n_estimators=100 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    4.1s remaining:    0.0s


[CV]  colsample_bytree=0.385248419026763, learning_rate=0.0117244229397226, max_depth=6, n_estimators=100, score=0.24689745232382665, total=   1.3s
[CV] colsample_bytree=0.385248419026763, learning_rate=0.0117244229397226, max_depth=6, n_estimators=100 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    5.5s remaining:    0.0s


[CV]  colsample_bytree=0.385248419026763, learning_rate=0.0117244229397226, max_depth=6, n_estimators=100, score=0.2463104096618527, total=   1.3s
[CV] colsample_bytree=0.7168642025772023, learning_rate=0.0762026436042897, max_depth=8, n_estimators=100 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    6.9s remaining:    0.0s


[CV]  colsample_bytree=0.7168642025772023, learning_rate=0.0762026436042897, max_depth=8, n_estimators=100, score=0.725835318362474, total=   5.0s
[CV] colsample_bytree=0.7168642025772023, learning_rate=0.0762026436042897, max_depth=8, n_estimators=100 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   12.1s remaining:    0.0s


[CV]  colsample_bytree=0.7168642025772023, learning_rate=0.0762026436042897, max_depth=8, n_estimators=100, score=0.7294919425571937, total=   5.1s
[CV] colsample_bytree=0.7168642025772023, learning_rate=0.0762026436042897, max_depth=8, n_estimators=100 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   17.4s remaining:    0.0s


[CV]  colsample_bytree=0.7168642025772023, learning_rate=0.0762026436042897, max_depth=8, n_estimators=100, score=0.7236383494320533, total=   5.0s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   22.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   22.5s finished


Best: 0.726322 using {'colsample_bytree': 0.7168642025772023, 'learning_rate': 0.0762026436042897, 'max_depth': 8, 'n_estimators': 100}


In [36]:
df_initial = pd.read_csv('flight_of_passage.csv')
for index, row in df_initial.iterrows():
    if math.isnan(row['SPOSTMIN']):
        df_initial.loc[index,'SPOSTMIN'] = df_initial.loc[index, 'SACTMIN']
#     print(row['SPOSTMIN'], row['SACTMIN'])
df_initial = df_initial.drop(columns="SACTMIN")
df_initial = df_initial[df_initial['SPOSTMIN'] != -999]
df_initial['Month'] = df_initial.date.str.split('/').str[0]
df_initial['Day'] = df_initial.date.str.split('/').str[1]
df_initial['Year'] = df_initial.date.str.rsplit('/', 1).str[1]
df_initial['Time_char'] = df_initial.datetime.str.split(' ').str[1]
df_initial['Hour'] = df_initial['Time_char'].str.split(':').str[0]
df_initial['Minute'] = df_initial['Time_char'].str.split(':').str[1]
df_initial = df_initial.drop(columns="Time_char")
df_initial['Month'] = (df_initial['Month']).astype(int)
df_initial['Day'] = (df_initial['Day']).astype(int)
df_initial['Year'] = (df_initial['Year']).astype(int)
df_initial['Hour'] = (df_initial['Hour']).astype(int)
df_initial['Minute'] = (df_initial['Minute']).astype(int)
df_y = df_initial['SPOSTMIN']
label_encoder_DOW = LabelEncoder()
DoW_feature = label_encoder_DOW.fit_transform(df_initial.DAYOFWEEK.iloc[:].values)
# new_col = pd.Series(DoW_feature)
df_initial['DayOfWeek'] = DoW_feature
df_initial = df_initial.drop(columns=["DAYOFWEEK", "date", "datetime", "SPOSTMIN"])
random_seed = 5
t_s = .20
X_train, X_test, y_train, y_test = train_test_split(df_initial, df_y, test_size = t_s, random_state = random_seed)
param_grid = {'n_estimators': [100, 200, 300], #random int btwn 100 and 500 - removed
              'learning_rate': stats.uniform(0.01, 0.08), #.01 + loc, range of .01+/-.08
              'max_depth': [2, 4, 6, 8], #tree depths to check
              'colsample_bytree': stats.uniform(0.3, 0.7) #btwn .1 and 1.0    
}
kfold = KFold(n_splits=3, shuffle=True, random_state=random_seed)
model = XGBRegressor(tree_method='gpu_hist')
rand_search = RandomizedSearchCV(model, param_distributions = param_grid, scoring = 'explained_variance', n_iter = 3, verbose = 10, cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" % (rand_result.best_score_, rand_result.best_params_))
best_XGB_estimator = rand_result.best_estimator_
pickle.dump(best_XGB_estimator, open("xgb_flight_of_passage.pkl", 'wb'))

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] colsample_bytree=0.3421465446336176, learning_rate=0.022001785127876525, max_depth=6, n_estimators=100 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  colsample_bytree=0.3421465446336176, learning_rate=0.022001785127876525, max_depth=6, n_estimators=100, score=0.003935369031655256, total=   0.9s
[CV] colsample_bytree=0.3421465446336176, learning_rate=0.022001785127876525, max_depth=6, n_estimators=100 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s remaining:    0.0s


[CV]  colsample_bytree=0.3421465446336176, learning_rate=0.022001785127876525, max_depth=6, n_estimators=100, score=-0.015520592779881826, total=   0.9s
[CV] colsample_bytree=0.3421465446336176, learning_rate=0.022001785127876525, max_depth=6, n_estimators=100 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.0s remaining:    0.0s


[CV]  colsample_bytree=0.3421465446336176, learning_rate=0.022001785127876525, max_depth=6, n_estimators=100, score=0.008135198269304311, total=   0.9s
[CV] colsample_bytree=0.34640610354852347, learning_rate=0.0647676420993673, max_depth=6, n_estimators=300 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    3.0s remaining:    0.0s


[CV]  colsample_bytree=0.34640610354852347, learning_rate=0.0647676420993673, max_depth=6, n_estimators=300, score=-2.646395910608601, total=   3.1s
[CV] colsample_bytree=0.34640610354852347, learning_rate=0.0647676420993673, max_depth=6, n_estimators=300 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    6.2s remaining:    0.0s


[CV]  colsample_bytree=0.34640610354852347, learning_rate=0.0647676420993673, max_depth=6, n_estimators=300, score=-2.9884744203433864, total=   3.0s
[CV] colsample_bytree=0.34640610354852347, learning_rate=0.0647676420993673, max_depth=6, n_estimators=300 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    9.3s remaining:    0.0s


[CV]  colsample_bytree=0.34640610354852347, learning_rate=0.0647676420993673, max_depth=6, n_estimators=300, score=0.014723403688300496, total=   3.0s
[CV] colsample_bytree=0.7345277052079023, learning_rate=0.027467861334061663, max_depth=8, n_estimators=100 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   12.4s remaining:    0.0s


[CV]  colsample_bytree=0.7345277052079023, learning_rate=0.027467861334061663, max_depth=8, n_estimators=100, score=-0.6614832822603196, total=   4.1s
[CV] colsample_bytree=0.7345277052079023, learning_rate=0.027467861334061663, max_depth=8, n_estimators=100 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   16.7s remaining:    0.0s


[CV]  colsample_bytree=0.7345277052079023, learning_rate=0.027467861334061663, max_depth=8, n_estimators=100, score=-0.11011598028759129, total=   4.1s
[CV] colsample_bytree=0.7345277052079023, learning_rate=0.027467861334061663, max_depth=8, n_estimators=100 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   20.9s remaining:    0.0s


[CV]  colsample_bytree=0.7345277052079023, learning_rate=0.027467861334061663, max_depth=8, n_estimators=100, score=0.014981626658876812, total=   4.0s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   25.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   25.1s finished


Best: -0.001150 using {'colsample_bytree': 0.3421465446336176, 'learning_rate': 0.022001785127876525, 'max_depth': 6, 'n_estimators': 100}


In [37]:
df_initial = pd.read_csv('kilimanjaro_safaris.csv')
for index, row in df_initial.iterrows():
    if math.isnan(row['SPOSTMIN']):
        df_initial.loc[index,'SPOSTMIN'] = df_initial.loc[index, 'SACTMIN']
#     print(row['SPOSTMIN'], row['SACTMIN'])
df_initial = df_initial.drop(columns="SACTMIN")
df_initial = df_initial[df_initial['SPOSTMIN'] != -999]
df_initial['Month'] = df_initial.date.str.split('/').str[0]
df_initial['Day'] = df_initial.date.str.split('/').str[1]
df_initial['Year'] = df_initial.date.str.rsplit('/', 1).str[1]
df_initial['Time_char'] = df_initial.datetime.str.split(' ').str[1]
df_initial['Hour'] = df_initial['Time_char'].str.split(':').str[0]
df_initial['Minute'] = df_initial['Time_char'].str.split(':').str[1]
df_initial = df_initial.drop(columns="Time_char")
df_initial['Month'] = (df_initial['Month']).astype(int)
df_initial['Day'] = (df_initial['Day']).astype(int)
df_initial['Year'] = (df_initial['Year']).astype(int)
df_initial['Hour'] = (df_initial['Hour']).astype(int)
df_initial['Minute'] = (df_initial['Minute']).astype(int)
df_y = df_initial['SPOSTMIN']
label_encoder_DOW = LabelEncoder()
DoW_feature = label_encoder_DOW.fit_transform(df_initial.DAYOFWEEK.iloc[:].values)
# new_col = pd.Series(DoW_feature)
df_initial['DayOfWeek'] = DoW_feature
df_initial = df_initial.drop(columns=["DAYOFWEEK", "date", "datetime", "SPOSTMIN"])
random_seed = 5
t_s = .20
X_train, X_test, y_train, y_test = train_test_split(df_initial, df_y, test_size = t_s, random_state = random_seed)
param_grid = {'n_estimators': [100, 200, 300], #random int btwn 100 and 500 - removed
              'learning_rate': stats.uniform(0.01, 0.08), #.01 + loc, range of .01+/-.08
              'max_depth': [2, 4, 6, 8], #tree depths to check
              'colsample_bytree': stats.uniform(0.3, 0.7) #btwn .1 and 1.0    
}
kfold = KFold(n_splits=3, shuffle=True, random_state=random_seed)
model = XGBRegressor(tree_method='gpu_hist')
rand_search = RandomizedSearchCV(model, param_distributions = param_grid, scoring = 'explained_variance', n_iter = 3, verbose = 10, cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" % (rand_result.best_score_, rand_result.best_params_))
best_XGB_estimator = rand_result.best_estimator_
pickle.dump(best_XGB_estimator, open("xgb_kilimanjaro_safaris.pkl", 'wb'))

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] colsample_bytree=0.5260751836091215, learning_rate=0.08035609113697822, max_depth=8, n_estimators=200 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  colsample_bytree=0.5260751836091215, learning_rate=0.08035609113697822, max_depth=8, n_estimators=200, score=0.6652307626218938, total=   8.7s
[CV] colsample_bytree=0.5260751836091215, learning_rate=0.08035609113697822, max_depth=8, n_estimators=200 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.7s remaining:    0.0s


[CV]  colsample_bytree=0.5260751836091215, learning_rate=0.08035609113697822, max_depth=8, n_estimators=200, score=0.6631760236850532, total=   8.6s
[CV] colsample_bytree=0.5260751836091215, learning_rate=0.08035609113697822, max_depth=8, n_estimators=200 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   17.4s remaining:    0.0s


[CV]  colsample_bytree=0.5260751836091215, learning_rate=0.08035609113697822, max_depth=8, n_estimators=200, score=0.667921820631203, total=   8.9s
[CV] colsample_bytree=0.941942775789034, learning_rate=0.018879294016387584, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   26.5s remaining:    0.0s


[CV]  colsample_bytree=0.941942775789034, learning_rate=0.018879294016387584, max_depth=4, n_estimators=100, score=0.3646020090173371, total=   0.5s
[CV] colsample_bytree=0.941942775789034, learning_rate=0.018879294016387584, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   27.1s remaining:    0.0s


[CV]  colsample_bytree=0.941942775789034, learning_rate=0.018879294016387584, max_depth=4, n_estimators=100, score=0.36398839774728264, total=   0.5s
[CV] colsample_bytree=0.941942775789034, learning_rate=0.018879294016387584, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   27.8s remaining:    0.0s


[CV]  colsample_bytree=0.941942775789034, learning_rate=0.018879294016387584, max_depth=4, n_estimators=100, score=0.367775136085541, total=   0.5s
[CV] colsample_bytree=0.6513094517022668, learning_rate=0.04047125455705542, max_depth=6, n_estimators=100 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   28.4s remaining:    0.0s


[CV]  colsample_bytree=0.6513094517022668, learning_rate=0.04047125455705542, max_depth=6, n_estimators=100, score=0.4890402482408641, total=   1.4s
[CV] colsample_bytree=0.6513094517022668, learning_rate=0.04047125455705542, max_depth=6, n_estimators=100 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   30.0s remaining:    0.0s


[CV]  colsample_bytree=0.6513094517022668, learning_rate=0.04047125455705542, max_depth=6, n_estimators=100, score=0.4865454462123918, total=   1.4s
[CV] colsample_bytree=0.6513094517022668, learning_rate=0.04047125455705542, max_depth=6, n_estimators=100 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   31.6s remaining:    0.0s


[CV]  colsample_bytree=0.6513094517022668, learning_rate=0.04047125455705542, max_depth=6, n_estimators=100, score=0.49248089001554074, total=   1.5s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   33.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   33.2s finished


Best: 0.665443 using {'colsample_bytree': 0.5260751836091215, 'learning_rate': 0.08035609113697822, 'max_depth': 8, 'n_estimators': 200}


In [38]:
df_initial = pd.read_csv('navi_river.csv')
for index, row in df_initial.iterrows():
    if math.isnan(row['SPOSTMIN']):
        df_initial.loc[index,'SPOSTMIN'] = df_initial.loc[index, 'SACTMIN']
#     print(row['SPOSTMIN'], row['SACTMIN'])
df_initial = df_initial.drop(columns="SACTMIN")
df_initial = df_initial[df_initial['SPOSTMIN'] != -999]
df_initial['Month'] = df_initial.date.str.split('/').str[0]
df_initial['Day'] = df_initial.date.str.split('/').str[1]
df_initial['Year'] = df_initial.date.str.rsplit('/', 1).str[1]
df_initial['Time_char'] = df_initial.datetime.str.split(' ').str[1]
df_initial['Hour'] = df_initial['Time_char'].str.split(':').str[0]
df_initial['Minute'] = df_initial['Time_char'].str.split(':').str[1]
df_initial = df_initial.drop(columns="Time_char")
df_initial['Month'] = (df_initial['Month']).astype(int)
df_initial['Day'] = (df_initial['Day']).astype(int)
df_initial['Year'] = (df_initial['Year']).astype(int)
df_initial['Hour'] = (df_initial['Hour']).astype(int)
df_initial['Minute'] = (df_initial['Minute']).astype(int)
df_y = df_initial['SPOSTMIN']
label_encoder_DOW = LabelEncoder()
DoW_feature = label_encoder_DOW.fit_transform(df_initial.DAYOFWEEK.iloc[:].values)
# new_col = pd.Series(DoW_feature)
df_initial['DayOfWeek'] = DoW_feature
df_initial = df_initial.drop(columns=["DAYOFWEEK", "date", "datetime", "SPOSTMIN"])
random_seed = 5
t_s = .20
X_train, X_test, y_train, y_test = train_test_split(df_initial, df_y, test_size = t_s, random_state = random_seed)
param_grid = {'n_estimators': [100, 200, 300], #random int btwn 100 and 500 - removed
              'learning_rate': stats.uniform(0.01, 0.08), #.01 + loc, range of .01+/-.08
              'max_depth': [2, 4, 6, 8], #tree depths to check
              'colsample_bytree': stats.uniform(0.3, 0.7) #btwn .1 and 1.0    
}
kfold = KFold(n_splits=3, shuffle=True, random_state=random_seed)
model = XGBRegressor(tree_method='gpu_hist')
rand_search = RandomizedSearchCV(model, param_distributions = param_grid, scoring = 'explained_variance', n_iter = 3, verbose = 10, cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" % (rand_result.best_score_, rand_result.best_params_))
best_XGB_estimator = rand_result.best_estimator_
pickle.dump(best_XGB_estimator, open("xgb_navi_river.pkl", 'wb'))

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] colsample_bytree=0.7107008216022356, learning_rate=0.08437304676552695, max_depth=2, n_estimators=300 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  colsample_bytree=0.7107008216022356, learning_rate=0.08437304676552695, max_depth=2, n_estimators=300, score=0.5923771663955524, total=   0.5s
[CV] colsample_bytree=0.7107008216022356, learning_rate=0.08437304676552695, max_depth=2, n_estimators=300 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV]  colsample_bytree=0.7107008216022356, learning_rate=0.08437304676552695, max_depth=2, n_estimators=300, score=0.5900590214070527, total=   0.5s
[CV] colsample_bytree=0.7107008216022356, learning_rate=0.08437304676552695, max_depth=2, n_estimators=300 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.1s remaining:    0.0s


[CV]  colsample_bytree=0.7107008216022356, learning_rate=0.08437304676552695, max_depth=2, n_estimators=300, score=0.5884690539923785, total=   0.4s
[CV] colsample_bytree=0.9008114290623426, learning_rate=0.05565181104802352, max_depth=6, n_estimators=100 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.6s remaining:    0.0s


[CV]  colsample_bytree=0.9008114290623426, learning_rate=0.05565181104802352, max_depth=6, n_estimators=100, score=0.6916038492156933, total=   1.3s
[CV] colsample_bytree=0.9008114290623426, learning_rate=0.05565181104802352, max_depth=6, n_estimators=100 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    3.0s remaining:    0.0s


[CV]  colsample_bytree=0.9008114290623426, learning_rate=0.05565181104802352, max_depth=6, n_estimators=100, score=0.6856765943094658, total=   1.3s
[CV] colsample_bytree=0.9008114290623426, learning_rate=0.05565181104802352, max_depth=6, n_estimators=100 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    4.4s remaining:    0.0s


[CV]  colsample_bytree=0.9008114290623426, learning_rate=0.05565181104802352, max_depth=6, n_estimators=100, score=0.6859732990840257, total=   1.3s
[CV] colsample_bytree=0.39253976183476985, learning_rate=0.05686053902869172, max_depth=2, n_estimators=200 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    5.8s remaining:    0.0s


[CV]  colsample_bytree=0.39253976183476985, learning_rate=0.05686053902869172, max_depth=2, n_estimators=200, score=0.5160457538203663, total=   0.3s
[CV] colsample_bytree=0.39253976183476985, learning_rate=0.05686053902869172, max_depth=2, n_estimators=200 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    6.2s remaining:    0.0s


[CV]  colsample_bytree=0.39253976183476985, learning_rate=0.05686053902869172, max_depth=2, n_estimators=200, score=0.5141764366877982, total=   0.3s
[CV] colsample_bytree=0.39253976183476985, learning_rate=0.05686053902869172, max_depth=2, n_estimators=200 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    6.7s remaining:    0.0s


[CV]  colsample_bytree=0.39253976183476985, learning_rate=0.05686053902869172, max_depth=2, n_estimators=200, score=0.508146430254097, total=   0.3s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    7.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    7.1s finished


Best: 0.687751 using {'colsample_bytree': 0.9008114290623426, 'learning_rate': 0.05565181104802352, 'max_depth': 6, 'n_estimators': 100}


In [39]:
df_initial = pd.read_csv('pirates_of_caribbean.csv')
for index, row in df_initial.iterrows():
    if math.isnan(row['SPOSTMIN']):
        df_initial.loc[index,'SPOSTMIN'] = df_initial.loc[index, 'SACTMIN']
#     print(row['SPOSTMIN'], row['SACTMIN'])
df_initial = df_initial.drop(columns="SACTMIN")
df_initial = df_initial[df_initial['SPOSTMIN'] != -999]
df_initial['Month'] = df_initial.date.str.split('/').str[0]
df_initial['Day'] = df_initial.date.str.split('/').str[1]
df_initial['Year'] = df_initial.date.str.rsplit('/', 1).str[1]
df_initial['Time_char'] = df_initial.datetime.str.split(' ').str[1]
df_initial['Hour'] = df_initial['Time_char'].str.split(':').str[0]
df_initial['Minute'] = df_initial['Time_char'].str.split(':').str[1]
df_initial = df_initial.drop(columns="Time_char")
df_initial['Month'] = (df_initial['Month']).astype(int)
df_initial['Day'] = (df_initial['Day']).astype(int)
df_initial['Year'] = (df_initial['Year']).astype(int)
df_initial['Hour'] = (df_initial['Hour']).astype(int)
df_initial['Minute'] = (df_initial['Minute']).astype(int)
df_y = df_initial['SPOSTMIN']
label_encoder_DOW = LabelEncoder()
DoW_feature = label_encoder_DOW.fit_transform(df_initial.DAYOFWEEK.iloc[:].values)
# new_col = pd.Series(DoW_feature)
df_initial['DayOfWeek'] = DoW_feature
df_initial = df_initial.drop(columns=["DAYOFWEEK", "date", "datetime", "SPOSTMIN"])
random_seed = 5
t_s = .20
X_train, X_test, y_train, y_test = train_test_split(df_initial, df_y, test_size = t_s, random_state = random_seed)
param_grid = {'n_estimators': [100, 200, 300], #random int btwn 100 and 500 - removed
              'learning_rate': stats.uniform(0.01, 0.08), #.01 + loc, range of .01+/-.08
              'max_depth': [2, 4, 6, 8], #tree depths to check
              'colsample_bytree': stats.uniform(0.3, 0.7) #btwn .1 and 1.0    
}
kfold = KFold(n_splits=3, shuffle=True, random_state=random_seed)
model = XGBRegressor(tree_method='gpu_hist')
rand_search = RandomizedSearchCV(model, param_distributions = param_grid, scoring = 'explained_variance', n_iter = 3, verbose = 10, cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" % (rand_result.best_score_, rand_result.best_params_))
best_XGB_estimator = rand_result.best_estimator_
pickle.dump(best_XGB_estimator, open("xgb_pirates_of_caribbean.pkl", 'wb'))

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] colsample_bytree=0.9538436675685811, learning_rate=0.05505176812345752, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  colsample_bytree=0.9538436675685811, learning_rate=0.05505176812345752, max_depth=4, n_estimators=100, score=0.497392743039882, total=   0.7s
[CV] colsample_bytree=0.9538436675685811, learning_rate=0.05505176812345752, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s


[CV]  colsample_bytree=0.9538436675685811, learning_rate=0.05505176812345752, max_depth=4, n_estimators=100, score=0.5528348675411966, total=   0.6s
[CV] colsample_bytree=0.9538436675685811, learning_rate=0.05505176812345752, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.5s remaining:    0.0s


[CV]  colsample_bytree=0.9538436675685811, learning_rate=0.05505176812345752, max_depth=4, n_estimators=100, score=0.5593724440451604, total=   0.6s
[CV] colsample_bytree=0.9168987340468555, learning_rate=0.08488943382129595, max_depth=2, n_estimators=200 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    2.2s remaining:    0.0s


[CV]  colsample_bytree=0.9168987340468555, learning_rate=0.08488943382129595, max_depth=2, n_estimators=200, score=0.4685970095190827, total=   0.7s
[CV] colsample_bytree=0.9168987340468555, learning_rate=0.08488943382129595, max_depth=2, n_estimators=200 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    3.1s remaining:    0.0s


[CV]  colsample_bytree=0.9168987340468555, learning_rate=0.08488943382129595, max_depth=2, n_estimators=200, score=0.522386871294065, total=   0.7s
[CV] colsample_bytree=0.9168987340468555, learning_rate=0.08488943382129595, max_depth=2, n_estimators=200 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.9s remaining:    0.0s


[CV]  colsample_bytree=0.9168987340468555, learning_rate=0.08488943382129595, max_depth=2, n_estimators=200, score=0.5256114218157519, total=   0.7s
[CV] colsample_bytree=0.31990641183346374, learning_rate=0.04224378104873386, max_depth=4, n_estimators=200 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    4.8s remaining:    0.0s


[CV]  colsample_bytree=0.31990641183346374, learning_rate=0.04224378104873386, max_depth=4, n_estimators=200, score=0.3883380401001336, total=   1.0s
[CV] colsample_bytree=0.31990641183346374, learning_rate=0.04224378104873386, max_depth=4, n_estimators=200 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    5.9s remaining:    0.0s


[CV]  colsample_bytree=0.31990641183346374, learning_rate=0.04224378104873386, max_depth=4, n_estimators=200, score=0.431159289308372, total=   1.0s
[CV] colsample_bytree=0.31990641183346374, learning_rate=0.04224378104873386, max_depth=4, n_estimators=200 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    7.1s remaining:    0.0s


[CV]  colsample_bytree=0.31990641183346374, learning_rate=0.04224378104873386, max_depth=4, n_estimators=200, score=0.4343214979405511, total=   1.0s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    8.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    8.3s finished


Best: 0.536533 using {'colsample_bytree': 0.9538436675685811, 'learning_rate': 0.05505176812345752, 'max_depth': 4, 'n_estimators': 100}


In [40]:
df_initial = pd.read_csv('rock_n_rollercoaster.csv')
for index, row in df_initial.iterrows():
    if math.isnan(row['SPOSTMIN']):
        df_initial.loc[index,'SPOSTMIN'] = df_initial.loc[index, 'SACTMIN']
#     print(row['SPOSTMIN'], row['SACTMIN'])
df_initial = df_initial.drop(columns="SACTMIN")
df_initial = df_initial[df_initial['SPOSTMIN'] != -999]
df_initial['Month'] = df_initial.date.str.split('/').str[0]
df_initial['Day'] = df_initial.date.str.split('/').str[1]
df_initial['Year'] = df_initial.date.str.rsplit('/', 1).str[1]
df_initial['Time_char'] = df_initial.datetime.str.split(' ').str[1]
df_initial['Hour'] = df_initial['Time_char'].str.split(':').str[0]
df_initial['Minute'] = df_initial['Time_char'].str.split(':').str[1]
df_initial = df_initial.drop(columns="Time_char")
df_initial['Month'] = (df_initial['Month']).astype(int)
df_initial['Day'] = (df_initial['Day']).astype(int)
df_initial['Year'] = (df_initial['Year']).astype(int)
df_initial['Hour'] = (df_initial['Hour']).astype(int)
df_initial['Minute'] = (df_initial['Minute']).astype(int)
df_y = df_initial['SPOSTMIN']
label_encoder_DOW = LabelEncoder()
DoW_feature = label_encoder_DOW.fit_transform(df_initial.DAYOFWEEK.iloc[:].values)
# new_col = pd.Series(DoW_feature)
df_initial['DayOfWeek'] = DoW_feature
df_initial = df_initial.drop(columns=["DAYOFWEEK", "date", "datetime", "SPOSTMIN"])
random_seed = 5
t_s = .20
X_train, X_test, y_train, y_test = train_test_split(df_initial, df_y, test_size = t_s, random_state = random_seed)
param_grid = {'n_estimators': [100, 200, 300], #random int btwn 100 and 500 - removed
              'learning_rate': stats.uniform(0.01, 0.08), #.01 + loc, range of .01+/-.08
              'max_depth': [2, 4, 6, 8], #tree depths to check
              'colsample_bytree': stats.uniform(0.3, 0.7) #btwn .1 and 1.0    
}
kfold = KFold(n_splits=3, shuffle=True, random_state=random_seed)
model = XGBRegressor(tree_method='gpu_hist')
rand_search = RandomizedSearchCV(model, param_distributions = param_grid, scoring = 'explained_variance', n_iter = 3, verbose = 10, cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" % (rand_result.best_score_, rand_result.best_params_))
best_XGB_estimator = rand_result.best_estimator_
pickle.dump(best_XGB_estimator, open("xgb_rock_n_rollercoaster.pkl", 'wb'))

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] colsample_bytree=0.7229412082413382, learning_rate=0.017631665954581488, max_depth=6, n_estimators=100 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  colsample_bytree=0.7229412082413382, learning_rate=0.017631665954581488, max_depth=6, n_estimators=100, score=0.37021405669197505, total=   1.6s
[CV] colsample_bytree=0.7229412082413382, learning_rate=0.017631665954581488, max_depth=6, n_estimators=100 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.6s remaining:    0.0s


[CV]  colsample_bytree=0.7229412082413382, learning_rate=0.017631665954581488, max_depth=6, n_estimators=100, score=0.3668389652453722, total=   1.6s
[CV] colsample_bytree=0.7229412082413382, learning_rate=0.017631665954581488, max_depth=6, n_estimators=100 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.3s remaining:    0.0s


[CV]  colsample_bytree=0.7229412082413382, learning_rate=0.017631665954581488, max_depth=6, n_estimators=100, score=0.36789752594988256, total=   1.6s
[CV] colsample_bytree=0.443549253455096, learning_rate=0.04388167052079101, max_depth=8, n_estimators=300 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    5.0s remaining:    0.0s


[CV]  colsample_bytree=0.443549253455096, learning_rate=0.04388167052079101, max_depth=8, n_estimators=300, score=0.4816051387700847, total=   8.7s
[CV] colsample_bytree=0.443549253455096, learning_rate=0.04388167052079101, max_depth=8, n_estimators=300 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   13.9s remaining:    0.0s


[CV]  colsample_bytree=0.443549253455096, learning_rate=0.04388167052079101, max_depth=8, n_estimators=300, score=0.48250434010467247, total=   8.7s
[CV] colsample_bytree=0.443549253455096, learning_rate=0.04388167052079101, max_depth=8, n_estimators=300 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   22.7s remaining:    0.0s


[CV]  colsample_bytree=0.443549253455096, learning_rate=0.04388167052079101, max_depth=8, n_estimators=300, score=0.48125146620199244, total=   8.8s
[CV] colsample_bytree=0.7866774812377872, learning_rate=0.04462529486861672, max_depth=8, n_estimators=100 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   31.7s remaining:    0.0s


[CV]  colsample_bytree=0.7866774812377872, learning_rate=0.04462529486861672, max_depth=8, n_estimators=100, score=0.5500395960703122, total=   5.0s
[CV] colsample_bytree=0.7866774812377872, learning_rate=0.04462529486861672, max_depth=8, n_estimators=100 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   36.9s remaining:    0.0s


[CV]  colsample_bytree=0.7866774812377872, learning_rate=0.04462529486861672, max_depth=8, n_estimators=100, score=0.5502636426450886, total=   5.0s
[CV] colsample_bytree=0.7866774812377872, learning_rate=0.04462529486861672, max_depth=8, n_estimators=100 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   42.0s remaining:    0.0s


[CV]  colsample_bytree=0.7866774812377872, learning_rate=0.04462529486861672, max_depth=8, n_estimators=100, score=0.5537426824066967, total=   5.0s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   47.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   47.2s finished


Best: 0.551349 using {'colsample_bytree': 0.7866774812377872, 'learning_rate': 0.04462529486861672, 'max_depth': 8, 'n_estimators': 100}


In [41]:
df_initial = pd.read_csv('slinky_dog.csv')
for index, row in df_initial.iterrows():
    if math.isnan(row['SPOSTMIN']):
        df_initial.loc[index,'SPOSTMIN'] = df_initial.loc[index, 'SACTMIN']
#     print(row['SPOSTMIN'], row['SACTMIN'])
df_initial = df_initial.drop(columns="SACTMIN")
df_initial = df_initial[df_initial['SPOSTMIN'] != -999]
df_initial['Month'] = df_initial.date.str.split('/').str[0]
df_initial['Day'] = df_initial.date.str.split('/').str[1]
df_initial['Year'] = df_initial.date.str.rsplit('/', 1).str[1]
df_initial['Time_char'] = df_initial.datetime.str.split(' ').str[1]
df_initial['Hour'] = df_initial['Time_char'].str.split(':').str[0]
df_initial['Minute'] = df_initial['Time_char'].str.split(':').str[1]
df_initial = df_initial.drop(columns="Time_char")
df_initial['Month'] = (df_initial['Month']).astype(int)
df_initial['Day'] = (df_initial['Day']).astype(int)
df_initial['Year'] = (df_initial['Year']).astype(int)
df_initial['Hour'] = (df_initial['Hour']).astype(int)
df_initial['Minute'] = (df_initial['Minute']).astype(int)
df_y = df_initial['SPOSTMIN']
label_encoder_DOW = LabelEncoder()
DoW_feature = label_encoder_DOW.fit_transform(df_initial.DAYOFWEEK.iloc[:].values)
# new_col = pd.Series(DoW_feature)
df_initial['DayOfWeek'] = DoW_feature
df_initial = df_initial.drop(columns=["DAYOFWEEK", "date", "datetime", "SPOSTMIN"])
random_seed = 5
t_s = .20
X_train, X_test, y_train, y_test = train_test_split(df_initial, df_y, test_size = t_s, random_state = random_seed)
param_grid = {'n_estimators': [100, 200, 300], #random int btwn 100 and 500 - removed
              'learning_rate': stats.uniform(0.01, 0.08), #.01 + loc, range of .01+/-.08
              'max_depth': [2, 4, 6, 8], #tree depths to check
              'colsample_bytree': stats.uniform(0.3, 0.7) #btwn .1 and 1.0    
}
kfold = KFold(n_splits=3, shuffle=True, random_state=random_seed)
model = XGBRegressor(tree_method='gpu_hist')
rand_search = RandomizedSearchCV(model, param_distributions = param_grid, scoring = 'explained_variance', n_iter = 3, verbose = 10, cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" % (rand_result.best_score_, rand_result.best_params_))
best_XGB_estimator = rand_result.best_estimator_
pickle.dump(best_XGB_estimator, open("xgb_slinky_dog.pkl", 'wb'))

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] colsample_bytree=0.300776927113, learning_rate=0.0472120868990769, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  colsample_bytree=0.300776927113, learning_rate=0.0472120868990769, max_depth=4, n_estimators=100, score=0.3117412597703185, total=   0.2s
[CV] colsample_bytree=0.300776927113, learning_rate=0.0472120868990769, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV]  colsample_bytree=0.300776927113, learning_rate=0.0472120868990769, max_depth=4, n_estimators=100, score=0.3166569543566983, total=   0.2s
[CV] colsample_bytree=0.300776927113, learning_rate=0.0472120868990769, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


[CV]  colsample_bytree=0.300776927113, learning_rate=0.0472120868990769, max_depth=4, n_estimators=100, score=0.31723508813090084, total=   0.2s
[CV] colsample_bytree=0.6569678654644897, learning_rate=0.08011830417509848, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.8s remaining:    0.0s


[CV]  colsample_bytree=0.6569678654644897, learning_rate=0.08011830417509848, max_depth=4, n_estimators=100, score=0.651795078200759, total=   0.3s
[CV] colsample_bytree=0.6569678654644897, learning_rate=0.08011830417509848, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.2s remaining:    0.0s


[CV]  colsample_bytree=0.6569678654644897, learning_rate=0.08011830417509848, max_depth=4, n_estimators=100, score=0.64593146420809, total=   0.3s
[CV] colsample_bytree=0.6569678654644897, learning_rate=0.08011830417509848, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.7s remaining:    0.0s


[CV]  colsample_bytree=0.6569678654644897, learning_rate=0.08011830417509848, max_depth=4, n_estimators=100, score=0.6556578554016566, total=   0.3s
[CV] colsample_bytree=0.3416092135935267, learning_rate=0.027342075160993948, max_depth=8, n_estimators=300 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    2.1s remaining:    0.0s


[CV]  colsample_bytree=0.3416092135935267, learning_rate=0.027342075160993948, max_depth=8, n_estimators=300, score=0.6759867061162864, total=   5.2s
[CV] colsample_bytree=0.3416092135935267, learning_rate=0.027342075160993948, max_depth=8, n_estimators=300 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    7.4s remaining:    0.0s


[CV]  colsample_bytree=0.3416092135935267, learning_rate=0.027342075160993948, max_depth=8, n_estimators=300, score=0.6946859587648129, total=   5.1s
[CV] colsample_bytree=0.3416092135935267, learning_rate=0.027342075160993948, max_depth=8, n_estimators=300 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   12.6s remaining:    0.0s


[CV]  colsample_bytree=0.3416092135935267, learning_rate=0.027342075160993948, max_depth=8, n_estimators=300, score=0.695679051131108, total=   5.1s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   17.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   17.8s finished


Best: 0.688783 using {'colsample_bytree': 0.3416092135935267, 'learning_rate': 0.027342075160993948, 'max_depth': 8, 'n_estimators': 300}


In [42]:
df_initial = pd.read_csv('soarin.csv')
for index, row in df_initial.iterrows():
    if math.isnan(row['SPOSTMIN']):
        df_initial.loc[index,'SPOSTMIN'] = df_initial.loc[index, 'SACTMIN']
#     print(row['SPOSTMIN'], row['SACTMIN'])
df_initial = df_initial.drop(columns="SACTMIN")
df_initial = df_initial[df_initial['SPOSTMIN'] != -999]
df_initial['Month'] = df_initial.date.str.split('/').str[0]
df_initial['Day'] = df_initial.date.str.split('/').str[1]
df_initial['Year'] = df_initial.date.str.rsplit('/', 1).str[1]
df_initial['Time_char'] = df_initial.datetime.str.split(' ').str[1]
df_initial['Hour'] = df_initial['Time_char'].str.split(':').str[0]
df_initial['Minute'] = df_initial['Time_char'].str.split(':').str[1]
df_initial = df_initial.drop(columns="Time_char")
df_initial['Month'] = (df_initial['Month']).astype(int)
df_initial['Day'] = (df_initial['Day']).astype(int)
df_initial['Year'] = (df_initial['Year']).astype(int)
df_initial['Hour'] = (df_initial['Hour']).astype(int)
df_initial['Minute'] = (df_initial['Minute']).astype(int)
df_y = df_initial['SPOSTMIN']
label_encoder_DOW = LabelEncoder()
DoW_feature = label_encoder_DOW.fit_transform(df_initial.DAYOFWEEK.iloc[:].values)
# new_col = pd.Series(DoW_feature)
df_initial['DayOfWeek'] = DoW_feature
df_initial = df_initial.drop(columns=["DAYOFWEEK", "date", "datetime", "SPOSTMIN"])
random_seed = 5
t_s = .20
X_train, X_test, y_train, y_test = train_test_split(df_initial, df_y, test_size = t_s, random_state = random_seed)
param_grid = {'n_estimators': [100, 200, 300], #random int btwn 100 and 500 - removed
              'learning_rate': stats.uniform(0.01, 0.08), #.01 + loc, range of .01+/-.08
              'max_depth': [2, 4, 6, 8], #tree depths to check
              'colsample_bytree': stats.uniform(0.3, 0.7) #btwn .1 and 1.0    
}
kfold = KFold(n_splits=3, shuffle=True, random_state=random_seed)
model = XGBRegressor(tree_method='gpu_hist')
rand_search = RandomizedSearchCV(model, param_distributions = param_grid, scoring = 'explained_variance', n_iter = 3, verbose = 10, cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" % (rand_result.best_score_, rand_result.best_params_))
best_XGB_estimator = rand_result.best_estimator_
pickle.dump(best_XGB_estimator, open("xgb_soarin.pkl", 'wb'))

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] colsample_bytree=0.8867718295025033, learning_rate=0.014657561370748669, max_depth=2, n_estimators=300 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  colsample_bytree=0.8867718295025033, learning_rate=0.014657561370748669, max_depth=2, n_estimators=300, score=0.36183954723485956, total=   1.0s
[CV] colsample_bytree=0.8867718295025033, learning_rate=0.014657561370748669, max_depth=2, n_estimators=300 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s remaining:    0.0s


[CV]  colsample_bytree=0.8867718295025033, learning_rate=0.014657561370748669, max_depth=2, n_estimators=300, score=0.37332048783083926, total=   1.0s
[CV] colsample_bytree=0.8867718295025033, learning_rate=0.014657561370748669, max_depth=2, n_estimators=300 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.2s remaining:    0.0s


[CV]  colsample_bytree=0.8867718295025033, learning_rate=0.014657561370748669, max_depth=2, n_estimators=300, score=0.37756722570053125, total=   1.1s
[CV] colsample_bytree=0.9008746559435541, learning_rate=0.05551380174124687, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    3.4s remaining:    0.0s


[CV]  colsample_bytree=0.9008746559435541, learning_rate=0.05551380174124687, max_depth=4, n_estimators=100, score=0.4972547989717965, total=   0.6s
[CV] colsample_bytree=0.9008746559435541, learning_rate=0.05551380174124687, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    4.2s remaining:    0.0s


[CV]  colsample_bytree=0.9008746559435541, learning_rate=0.05551380174124687, max_depth=4, n_estimators=100, score=0.5121032457558534, total=   0.6s
[CV] colsample_bytree=0.9008746559435541, learning_rate=0.05551380174124687, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    4.9s remaining:    0.0s


[CV]  colsample_bytree=0.9008746559435541, learning_rate=0.05551380174124687, max_depth=4, n_estimators=100, score=0.5163734723389105, total=   0.6s
[CV] colsample_bytree=0.7773943404320777, learning_rate=0.03182031225216541, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    5.7s remaining:    0.0s


[CV]  colsample_bytree=0.7773943404320777, learning_rate=0.03182031225216541, max_depth=4, n_estimators=100, score=0.43418297098666936, total=   0.6s
[CV] colsample_bytree=0.7773943404320777, learning_rate=0.03182031225216541, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    6.4s remaining:    0.0s


[CV]  colsample_bytree=0.7773943404320777, learning_rate=0.03182031225216541, max_depth=4, n_estimators=100, score=0.44528924780548884, total=   0.6s
[CV] colsample_bytree=0.7773943404320777, learning_rate=0.03182031225216541, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    7.2s remaining:    0.0s


[CV]  colsample_bytree=0.7773943404320777, learning_rate=0.03182031225216541, max_depth=4, n_estimators=100, score=0.45151921358614655, total=   0.6s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    8.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    8.0s finished


Best: 0.508577 using {'colsample_bytree': 0.9008746559435541, 'learning_rate': 0.05551380174124687, 'max_depth': 4, 'n_estimators': 100}


In [43]:
df_initial = pd.read_csv('spaceship_earth.csv')
for index, row in df_initial.iterrows():
    if math.isnan(row['SPOSTMIN']):
        df_initial.loc[index,'SPOSTMIN'] = df_initial.loc[index, 'SACTMIN']
#     print(row['SPOSTMIN'], row['SACTMIN'])
df_initial = df_initial.drop(columns="SACTMIN")
df_initial = df_initial[df_initial['SPOSTMIN'] != -999]
df_initial['Month'] = df_initial.date.str.split('/').str[0]
df_initial['Day'] = df_initial.date.str.split('/').str[1]
df_initial['Year'] = df_initial.date.str.rsplit('/', 1).str[1]
df_initial['Time_char'] = df_initial.datetime.str.split(' ').str[1]
df_initial['Hour'] = df_initial['Time_char'].str.split(':').str[0]
df_initial['Minute'] = df_initial['Time_char'].str.split(':').str[1]
df_initial = df_initial.drop(columns="Time_char")
df_initial['Month'] = (df_initial['Month']).astype(int)
df_initial['Day'] = (df_initial['Day']).astype(int)
df_initial['Year'] = (df_initial['Year']).astype(int)
df_initial['Hour'] = (df_initial['Hour']).astype(int)
df_initial['Minute'] = (df_initial['Minute']).astype(int)
df_y = df_initial['SPOSTMIN']
label_encoder_DOW = LabelEncoder()
DoW_feature = label_encoder_DOW.fit_transform(df_initial.DAYOFWEEK.iloc[:].values)
# new_col = pd.Series(DoW_feature)
df_initial['DayOfWeek'] = DoW_feature
df_initial = df_initial.drop(columns=["DAYOFWEEK", "date", "datetime", "SPOSTMIN"])
random_seed = 5
t_s = .20
X_train, X_test, y_train, y_test = train_test_split(df_initial, df_y, test_size = t_s, random_state = random_seed)
param_grid = {'n_estimators': [100, 200, 300], #random int btwn 100 and 500 - removed
              'learning_rate': stats.uniform(0.01, 0.08), #.01 + loc, range of .01+/-.08
              'max_depth': [2, 4, 6, 8], #tree depths to check
              'colsample_bytree': stats.uniform(0.3, 0.7) #btwn .1 and 1.0    
}
kfold = KFold(n_splits=3, shuffle=True, random_state=random_seed)
model = XGBRegressor(tree_method='gpu_hist')
rand_search = RandomizedSearchCV(model, param_distributions = param_grid, scoring = 'explained_variance', n_iter = 3, verbose = 10, cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" % (rand_result.best_score_, rand_result.best_params_))
best_XGB_estimator = rand_result.best_estimator_
pickle.dump(best_XGB_estimator, open("xgb_spaceship_earth.pkl", 'wb'))

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] colsample_bytree=0.8276186193730097, learning_rate=0.010345523035463442, max_depth=4, n_estimators=200 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  colsample_bytree=0.8276186193730097, learning_rate=0.010345523035463442, max_depth=4, n_estimators=200, score=0.4654945554105976, total=   1.2s
[CV] colsample_bytree=0.8276186193730097, learning_rate=0.010345523035463442, max_depth=4, n_estimators=200 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s


[CV]  colsample_bytree=0.8276186193730097, learning_rate=0.010345523035463442, max_depth=4, n_estimators=200, score=0.4715192586499225, total=   1.2s
[CV] colsample_bytree=0.8276186193730097, learning_rate=0.010345523035463442, max_depth=4, n_estimators=200 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.7s remaining:    0.0s


[CV]  colsample_bytree=0.8276186193730097, learning_rate=0.010345523035463442, max_depth=4, n_estimators=200, score=0.4700572542453536, total=   1.2s
[CV] colsample_bytree=0.9474689470015636, learning_rate=0.031893486261045406, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.0s remaining:    0.0s


[CV]  colsample_bytree=0.9474689470015636, learning_rate=0.031893486261045406, max_depth=4, n_estimators=100, score=0.5338982757681885, total=   0.6s
[CV] colsample_bytree=0.9474689470015636, learning_rate=0.031893486261045406, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    4.8s remaining:    0.0s


[CV]  colsample_bytree=0.9474689470015636, learning_rate=0.031893486261045406, max_depth=4, n_estimators=100, score=0.5424877033901747, total=   0.6s
[CV] colsample_bytree=0.9474689470015636, learning_rate=0.031893486261045406, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    5.5s remaining:    0.0s


[CV]  colsample_bytree=0.9474689470015636, learning_rate=0.031893486261045406, max_depth=4, n_estimators=100, score=0.5401822015287709, total=   0.6s
[CV] colsample_bytree=0.7951321565675904, learning_rate=0.07110810933243497, max_depth=6, n_estimators=200 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    6.3s remaining:    0.0s


[CV]  colsample_bytree=0.7951321565675904, learning_rate=0.07110810933243497, max_depth=6, n_estimators=200, score=0.7000382387126445, total=   3.1s
[CV] colsample_bytree=0.7951321565675904, learning_rate=0.07110810933243497, max_depth=6, n_estimators=200 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    9.5s remaining:    0.0s


[CV]  colsample_bytree=0.7951321565675904, learning_rate=0.07110810933243497, max_depth=6, n_estimators=200, score=0.7009169192189914, total=   3.1s
[CV] colsample_bytree=0.7951321565675904, learning_rate=0.07110810933243497, max_depth=6, n_estimators=200 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   12.7s remaining:    0.0s


[CV]  colsample_bytree=0.7951321565675904, learning_rate=0.07110810933243497, max_depth=6, n_estimators=200, score=0.7003149359680536, total=   3.1s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   16.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   16.0s finished


Best: 0.700423 using {'colsample_bytree': 0.7951321565675904, 'learning_rate': 0.07110810933243497, 'max_depth': 6, 'n_estimators': 200}


In [3]:
df_initial = pd.read_csv('splash_mountain.csv')
for index, row in df_initial.iterrows():
    if math.isnan(row['SPOSTMIN']):
        df_initial.loc[index,'SPOSTMIN'] = df_initial.loc[index, 'SACTMIN']
#     print(row['SPOSTMIN'], row['SACTMIN'])
df_initial = df_initial.drop(columns="SACTMIN")
df_initial = df_initial[df_initial['SPOSTMIN'] != -999]
df_initial['Month'] = df_initial.date.str.split('/').str[0]
df_initial['Day'] = df_initial.date.str.split('/').str[1]
df_initial['Year'] = df_initial.date.str.rsplit('/', 1).str[1]
df_initial['Time_char'] = df_initial.datetime.str.split(' ').str[1]
df_initial['Hour'] = df_initial['Time_char'].str.split(':').str[0]
df_initial['Minute'] = df_initial['Time_char'].str.split(':').str[1]
df_initial = df_initial.drop(columns="Time_char")
df_initial['Month'] = (df_initial['Month']).astype(int)
df_initial['Day'] = (df_initial['Day']).astype(int)
df_initial['Year'] = (df_initial['Year']).astype(int)
df_initial['Hour'] = (df_initial['Hour']).astype(int)
df_initial['Minute'] = (df_initial['Minute']).astype(int)
df_y = df_initial['SPOSTMIN']
label_encoder_DOW = LabelEncoder()
DoW_feature = label_encoder_DOW.fit_transform(df_initial.DAYOFWEEK.iloc[:].values)
# new_col = pd.Series(DoW_feature)
df_initial['DayOfWeek'] = DoW_feature
df_initial = df_initial.drop(columns=["DAYOFWEEK", "date", "datetime", "SPOSTMIN"])
random_seed = 5
t_s = .20
X_train, X_test, y_train, y_test = train_test_split(df_initial, df_y, test_size = t_s, random_state = random_seed)
param_grid = {'n_estimators': [100, 200, 300], #random int btwn 100 and 500 - removed
              'learning_rate': stats.uniform(0.01, 0.08), #.01 + loc, range of .01+/-.08
              'max_depth': [2, 4, 6, 8], #tree depths to check
              'colsample_bytree': stats.uniform(0.3, 0.7) #btwn .1 and 1.0    
}
kfold = KFold(n_splits=3, shuffle=True, random_state=random_seed)
model = XGBRegressor(tree_method='gpu_hist')
rand_search = RandomizedSearchCV(model, param_distributions = param_grid, scoring = 'explained_variance', n_iter = 3, verbose = 10, cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" % (rand_result.best_score_, rand_result.best_params_))
best_XGB_estimator = rand_result.best_estimator_
pickle.dump(best_XGB_estimator, open("xgb_splash_mountain.pkl", 'wb'))

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] colsample_bytree=0.9710119106581161, learning_rate=0.07147565132457223, max_depth=8, n_estimators=200 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  colsample_bytree=0.9710119106581161, learning_rate=0.07147565132457223, max_depth=8, n_estimators=200, score=0.8069952723154302, total=  10.3s
[CV] colsample_bytree=0.9710119106581161, learning_rate=0.07147565132457223, max_depth=8, n_estimators=200 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.3s remaining:    0.0s


[CV]  colsample_bytree=0.9710119106581161, learning_rate=0.07147565132457223, max_depth=8, n_estimators=200, score=0.8071799015668484, total=  10.3s
[CV] colsample_bytree=0.9710119106581161, learning_rate=0.07147565132457223, max_depth=8, n_estimators=200 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   20.7s remaining:    0.0s


[CV]  colsample_bytree=0.9710119106581161, learning_rate=0.07147565132457223, max_depth=8, n_estimators=200, score=0.8048558029553516, total=  10.5s
[CV] colsample_bytree=0.7839141070761564, learning_rate=0.06806845573905287, max_depth=6, n_estimators=200 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   31.4s remaining:    0.0s


[CV]  colsample_bytree=0.7839141070761564, learning_rate=0.06806845573905287, max_depth=6, n_estimators=200, score=0.7274864684533215, total=   3.1s
[CV] colsample_bytree=0.7839141070761564, learning_rate=0.06806845573905287, max_depth=6, n_estimators=200 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   34.7s remaining:    0.0s


[CV]  colsample_bytree=0.7839141070761564, learning_rate=0.06806845573905287, max_depth=6, n_estimators=200, score=0.7284047783417895, total=   3.1s
[CV] colsample_bytree=0.7839141070761564, learning_rate=0.06806845573905287, max_depth=6, n_estimators=200 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   38.0s remaining:    0.0s


[CV]  colsample_bytree=0.7839141070761564, learning_rate=0.06806845573905287, max_depth=6, n_estimators=200, score=0.7296086014427452, total=   3.1s
[CV] colsample_bytree=0.560361210484058, learning_rate=0.04254563113176944, max_depth=4, n_estimators=200 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   41.2s remaining:    0.0s


[CV]  colsample_bytree=0.560361210484058, learning_rate=0.04254563113176944, max_depth=4, n_estimators=200, score=0.5941058672259782, total=   1.2s
[CV] colsample_bytree=0.560361210484058, learning_rate=0.04254563113176944, max_depth=4, n_estimators=200 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   42.6s remaining:    0.0s


[CV]  colsample_bytree=0.560361210484058, learning_rate=0.04254563113176944, max_depth=4, n_estimators=200, score=0.5903252284031726, total=   1.2s
[CV] colsample_bytree=0.560361210484058, learning_rate=0.04254563113176944, max_depth=4, n_estimators=200 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   44.0s remaining:    0.0s


[CV]  colsample_bytree=0.560361210484058, learning_rate=0.04254563113176944, max_depth=4, n_estimators=200, score=0.5921290088796246, total=   1.3s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   45.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   45.4s finished


Best: 0.806344 using {'colsample_bytree': 0.9710119106581161, 'learning_rate': 0.07147565132457223, 'max_depth': 8, 'n_estimators': 200}


In [2]:
df_initial = pd.read_csv('toy_story_mania.csv')
for index, row in df_initial.iterrows():
    if math.isnan(row['SPOSTMIN']):
        df_initial.loc[index,'SPOSTMIN'] = df_initial.loc[index, 'SACTMIN']
#     print(row['SPOSTMIN'], row['SACTMIN'])
df_initial = df_initial.drop(columns="SACTMIN")
df_initial = df_initial[df_initial['SPOSTMIN'] != -999]
df_initial['Month'] = df_initial.date.str.split('/').str[0]
df_initial['Day'] = df_initial.date.str.split('/').str[1]
df_initial['Year'] = df_initial.date.str.rsplit('/', 1).str[1]
df_initial['Time_char'] = df_initial.datetime.str.split(' ').str[1]
df_initial['Hour'] = df_initial['Time_char'].str.split(':').str[0]
df_initial['Minute'] = df_initial['Time_char'].str.split(':').str[1]
df_initial = df_initial.drop(columns="Time_char")
df_initial['Month'] = (df_initial['Month']).astype(int)
df_initial['Day'] = (df_initial['Day']).astype(int)
df_initial['Year'] = (df_initial['Year']).astype(int)
df_initial['Hour'] = (df_initial['Hour']).astype(int)
df_initial['Minute'] = (df_initial['Minute']).astype(int)
df_y = df_initial['SPOSTMIN']
label_encoder_DOW = LabelEncoder()
DoW_feature = label_encoder_DOW.fit_transform(df_initial.DAYOFWEEK.iloc[:].values)
# new_col = pd.Series(DoW_feature)
df_initial['DayOfWeek'] = DoW_feature
df_initial = df_initial.drop(columns=["DAYOFWEEK", "date", "datetime", "SPOSTMIN"])
random_seed = 5
t_s = .20
X_train, X_test, y_train, y_test = train_test_split(df_initial, df_y, test_size = t_s, random_state = random_seed)
param_grid = {'n_estimators': [100, 200, 300], #random int btwn 100 and 500 - removed
              'learning_rate': stats.uniform(0.01, 0.08), #.01 + loc, range of .01+/-.08
              'max_depth': [2, 4, 6, 8], #tree depths to check
              'colsample_bytree': stats.uniform(0.3, 0.7) #btwn .1 and 1.0    
}
kfold = KFold(n_splits=3, shuffle=True, random_state=random_seed)
model = XGBRegressor(tree_method='gpu_hist')
rand_search = RandomizedSearchCV(model, param_distributions = param_grid, scoring = 'explained_variance', n_iter = 3, verbose = 10, cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" % (rand_result.best_score_, rand_result.best_params_))
best_XGB_estimator = rand_result.best_estimator_
pickle.dump(best_XGB_estimator, open("xgb_toy_story_mania.pkl", 'wb'))

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] colsample_bytree=0.5941031901820354, learning_rate=0.0545130983127397, max_depth=2, n_estimators=200 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  colsample_bytree=0.5941031901820354, learning_rate=0.0545130983127397, max_depth=2, n_estimators=200, score=0.3805751168607082, total=   1.2s
[CV] colsample_bytree=0.5941031901820354, learning_rate=0.0545130983127397, max_depth=2, n_estimators=200 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s


[CV]  colsample_bytree=0.5941031901820354, learning_rate=0.0545130983127397, max_depth=2, n_estimators=200, score=0.38468119179617455, total=   0.7s
[CV] colsample_bytree=0.5941031901820354, learning_rate=0.0545130983127397, max_depth=2, n_estimators=200 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.0s remaining:    0.0s


[CV]  colsample_bytree=0.5941031901820354, learning_rate=0.0545130983127397, max_depth=2, n_estimators=200, score=0.040859381778227566, total=   0.7s
[CV] colsample_bytree=0.5184183872046262, learning_rate=0.025471527483897943, max_depth=2, n_estimators=200 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    2.9s remaining:    0.0s


[CV]  colsample_bytree=0.5184183872046262, learning_rate=0.025471527483897943, max_depth=2, n_estimators=200, score=0.32291078729568123, total=   0.7s
[CV] colsample_bytree=0.5184183872046262, learning_rate=0.025471527483897943, max_depth=2, n_estimators=200 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    3.7s remaining:    0.0s


[CV]  colsample_bytree=0.5184183872046262, learning_rate=0.025471527483897943, max_depth=2, n_estimators=200, score=0.32471393749165733, total=   0.7s
[CV] colsample_bytree=0.5184183872046262, learning_rate=0.025471527483897943, max_depth=2, n_estimators=200 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    4.5s remaining:    0.0s


[CV]  colsample_bytree=0.5184183872046262, learning_rate=0.025471527483897943, max_depth=2, n_estimators=200, score=0.034864724329228935, total=   0.7s
[CV] colsample_bytree=0.9748802637447009, learning_rate=0.05592198956104434, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    5.3s remaining:    0.0s


[CV]  colsample_bytree=0.9748802637447009, learning_rate=0.05592198956104434, max_depth=4, n_estimators=100, score=0.49648408350135853, total=   0.6s
[CV] colsample_bytree=0.9748802637447009, learning_rate=0.05592198956104434, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    6.1s remaining:    0.0s


[CV]  colsample_bytree=0.9748802637447009, learning_rate=0.05592198956104434, max_depth=4, n_estimators=100, score=0.5012972881772639, total=   0.6s
[CV] colsample_bytree=0.9748802637447009, learning_rate=0.05592198956104434, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    6.9s remaining:    0.0s


[CV]  colsample_bytree=0.9748802637447009, learning_rate=0.05592198956104434, max_depth=4, n_estimators=100, score=0.053884468239179584, total=   0.6s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    7.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    7.6s finished


Best: 0.350555 using {'colsample_bytree': 0.9748802637447009, 'learning_rate': 0.05592198956104434, 'max_depth': 4, 'n_estimators': 100}
