## H2O AutoML

    https://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html

In [2]:
import pandas as pd
import h2o
from h2o.automl import H2OAutoML

In [3]:
# Connect to a local H2O cluster, launching a new one if none is already running.
# The sizing arguments below are used when a new server is launched.
h2o.init(
    nthreads=-1,     # number of threads when launching a new H2O server (-1 = use all available cores)
    max_mem_size=12  # in gigabytes
)

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,44 mins 43 secs
H2O_cluster_timezone:,Asia/Kolkata
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.34.0.3
H2O_cluster_version_age:,16 days
H2O_cluster_name:,H2O_from_python_SESA475934_zkwyo9
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,10.35 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [4]:
#h2o.init()

In [50]:
# Load the competition train/test CSV files into the H2O cluster as H2OFrames

train_csv_path = "C:\\Data_Science\\Competitions\\MachineHack-2021\\train.csv"
test_csv_path = "C:\\Data_Science\\Competitions\\MachineHack-2021\\test.csv"

train = h2o.import_file(train_csv_path)
test = h2o.import_file(test_csv_path)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [51]:
# Preview the first five rows of the raw training frame
train.head(5)

Item_ID,Item_W,Item_Type,Item_MRP,Outlet_ID,Outlet_Year,Outlet_Size,Outlet_Location_Type,Sales
FDU32,21.0275,Baking Goods,197.352,OUT046,2004,Small,Tier 2,2689.46
NCT54,21.1024,Meat,148.25,OUT035,1987,Small,Tier 1,3437.35
FDW08,20.8823,Hard Drinks,205.465,OUT035,1999,Small,Tier 3,3129.97
FDJ22,21.0504,Starchy Foods,253.418,OUT046,1996,Small,Tier 1,1306.51
FDF47,21.2479,Baking Goods,240.871,OUT035,1988,Small,Tier 3,1739.77




In [52]:
# Preview the first five rows of the raw test frame (same columns as train, minus Sales)
test.head(5)

Item_ID,Item_W,Item_Type,Item_MRP,Outlet_ID,Outlet_Year,Outlet_Size,Outlet_Location_Type
DRM23,10.1567,Snack Foods,119.319,OUT013,1999,High,Tier 3
FDG47,10.4344,Household,263.6,OUT018,1987,High,Tier 3
FDN21,11.9536,Health and Hygiene,246.287,OUT018,1997,High,Tier 1
FDZ23,14.9485,Dairy,136.201,OUT046,2004,Medium,Tier 2
FDC16,11.08,Canned,173.146,OUT018,2009,Medium,Tier 3




In [53]:
# Convert both H2O frames to pandas DataFrames so the feature engineering
# below can be done with the pandas API.

train_as_df, test_as_df = (
    h2o.as_list(h2o_frame, use_pandas=True) for h2o_frame in (train, test)
)

In [54]:
#Drop the ID columns

#train_as_df=train_as_df.drop(['Item_ID','Outlet_ID'], axis = 1)
#test_as_df=test_as_df.drop(['Item_ID','Outlet_ID'], axis = 1)

# Dropping only the item id column (Outlet_ID is kept)
#train_as_df=train_as_df.drop(['Item_ID'], axis = 1)
#test_as_df=test_as_df.drop(['Item_ID'], axis = 1)

In [55]:
# Derive the outlet age column: Outlet_Age = 2021 - Outlet_Year.
# Outlet_Year is dropped once the age has been derived from it.

train_as_df = (
    train_as_df
    .assign(Outlet_Age=2021 - train_as_df['Outlet_Year'])
    .drop(columns=['Outlet_Year'])
)

test_as_df = (
    test_as_df
    .assign(Outlet_Age=2021 - test_as_df['Outlet_Year'])
    .drop(columns=['Outlet_Year'])
)

In [56]:
# Percentage of missing values per column in the training data, largest first

train_missing_counts = train_as_df.isnull().sum()
train_missing_pct = round(train_missing_counts * 100 / len(train_as_df), 2)
train_missing_pct.sort_values(ascending=False)

Outlet_Age              0.0
Sales                   0.0
Outlet_Location_Type    0.0
Outlet_Size             0.0
Outlet_ID               0.0
Item_MRP                0.0
Item_Type               0.0
Item_W                  0.0
Item_ID                 0.0
dtype: float64

In [57]:
# Percentage of missing values per column in the test data, largest first

test_missing_counts = test_as_df.isnull().sum()
test_missing_pct = round(test_missing_counts * 100 / len(test_as_df), 2)
test_missing_pct.sort_values(ascending=False)

Outlet_Age              0.0
Outlet_Location_Type    0.0
Outlet_Size             0.0
Outlet_ID               0.0
Item_MRP                0.0
Item_Type               0.0
Item_W                  0.0
Item_ID                 0.0
dtype: float64

In [58]:
# Break the item id into 2 columns: item code and item number

def split_item_id(frame):
    """Replace ``Item_ID`` with ``Item_Code`` and ``Item_Number`` columns.

    ``Item_Code`` is the first three characters of ``Item_ID`` and
    ``Item_Number`` is its last two characters converted to a number.
    Both new columns are appended at the end of the frame (code first, then
    number) and ``Item_ID`` is removed.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing a string ``Item_ID`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    item_id = frame['Item_ID']
    return (
        frame
        .assign(
            Item_Code=item_id.str[:3],
            Item_Number=pd.to_numeric(item_id.str[-2:]),
        )
        .drop(columns=['Item_ID'])
    )


# Apply the exact same transformation to both datasets so they cannot drift apart
train_as_df = split_item_id(train_as_df)
test_as_df = split_item_id(test_as_df)

In [59]:
# One HOT Encoding

def encode_and_bind(original_dataframe, feature_to_encode):
    """One-hot encode one column and swap it for its indicator columns.

    Parameters
    ----------
    original_dataframe : pandas.DataFrame
        Frame that contains ``feature_to_encode``.
    feature_to_encode : str
        Name of the column to one-hot encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with the indicator columns (named
        ``<feature_to_encode>_<value>``) appended on the right and the
        original column removed. The input frame is left untouched.
    """
    indicator_columns = pd.get_dummies(original_dataframe[[feature_to_encode]])
    with_indicators = pd.concat([original_dataframe, indicator_columns], axis=1)
    return with_indicators.drop(columns=[feature_to_encode])

In [60]:
# One-hot encode every categorical column of the training data, one column at a time
for categorical_column in ['Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_ID', 'Item_Code']:
    train_as_df = encode_and_bind(train_as_df, categorical_column)

In [61]:
# Preview the encoded training data (categorical columns are now 0/1 indicator columns)
train_as_df.head(5)

Unnamed: 0,Item_W,Item_MRP,Sales,Outlet_Age,Item_Number,Item_Type_Baking Goods,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,Item_Type_Dairy,...,Item_Code_NCQ,Item_Code_NCR,Item_Code_NCS,Item_Code_NCT,Item_Code_NCU,Item_Code_NCV,Item_Code_NCW,Item_Code_NCX,Item_Code_NCY,Item_Code_NCZ
0,21.027499,197.352319,2689.457781,17,32,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,21.102371,148.250214,3437.350375,34,54,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,20.882263,205.46501,3129.967268,22,8,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,21.050435,253.417583,1306.514376,25,22,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,21.247876,240.871039,1739.769829,33,47,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [62]:
# One-hot encode the same categorical columns on the test data
for categorical_column in ['Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_ID', 'Item_Code']:
    test_as_df = encode_and_bind(test_as_df, categorical_column)

# Train and test are encoded independently, so a category that occurs in only one
# of them would leave the two frames with different dummy columns. Align the test
# frame to the (already encoded) training features: dummies missing from test are
# added as all-zero columns, and dummies never seen in training are dropped.
test_as_df = test_as_df.reindex(columns=train_as_df.columns.drop('Sales'), fill_value=0)

In [63]:
# Preview the encoded test data
test_as_df.head(5)

Unnamed: 0,Item_W,Item_MRP,Outlet_Age,Item_Number,Item_Type_Baking Goods,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,Item_Type_Dairy,Item_Type_Frozen Foods,...,Item_Code_NCQ,Item_Code_NCR,Item_Code_NCS,Item_Code_NCT,Item_Code_NCU,Item_Code_NCV,Item_Code_NCW,Item_Code_NCX,Item_Code_NCY,Item_Code_NCZ
0,10.156725,119.319482,22,23,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,10.434425,263.600449,34,47,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,11.953589,246.28699,24,21,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,14.948546,136.200508,17,23,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,11.080042,173.145664,12,16,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [64]:
# Convert the engineered pandas dataframes back to H2O frames.
# Note: this rebinds `train` and `test`, replacing the raw frames imported earlier.
train = h2o.H2OFrame(train_as_df)
test = h2o.H2OFrame(test_as_df) 

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [65]:
# Identify predictors and response
x = train.columns  # start from all column names of the training frame
y = "Sales"        # response column
x.remove(y)        # predictors = every column except the response

In [None]:
# Run AutoML for up to 20 base models (max_models=20), with a fixed seed.
# NOTE(review): per the H2O AutoML docs the 1-hour default runtime cap applies only when
# neither max_models nor max_runtime_secs is given, so this run is presumably not
# time-limited -- confirm, and set max_runtime_secs explicitly if a cap is wanted.

aml = H2OAutoML(max_models=20, seed=1)
aml.train(x=x, y=y, training_frame=train)

AutoML progress: |█
19:17:03.458: AutoML: XGBoost is not available; skipping it.
19:17:03.458: Step 'best_of_family_xgboost' not defined in provider 'StackedEnsemble': skipping it.
19:17:03.458: Step 'all_xgboost' not defined in provider 'StackedEnsemble': skipping it.

████████████

In [None]:
# View the AutoML Leaderboard

lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Print all rows (lb.nrows) instead of the default (10 rows)

In [None]:
# The leader model is stored on the AutoML object as `aml.leader`; display it

aml.leader

In [None]:
# Generate predictions on the test set. Predictions can be made either
# directly on the `H2OAutoML` object or on its leader model; the leader
# model is used here.
#preds = aml.predict(test)
# or
preds = aml.leader.predict(test)

In [None]:
# Preview the first five predictions
preds.head(5)

In [None]:
# Column-bind the predictions onto the test features and preview the combined frame
df = test.cbind(preds)
df.head(5)

In [None]:
# Slice cols by vector of names: keep all rows, but only the "predict" column
res = df[:, ["predict"]]
res.head(5)

In [None]:
# Rename the single prediction column to 'Sales'.
# The return value is not assigned: the export cell that follows relies on
# set_names having renamed the column on `res` itself.
res.set_names(['Sales']) 

#### Save results

In [None]:
# Export the predictions as the submission CSV file.
# (The leader model itself is saved in the cells that follow. There are two ways to save it --
# binary format and MOJO format. If you're taking your leader model to production,
# then we'd suggest the MOJO format since it's optimized for production use.)

# Alternative: export straight from H2O without going through pandas
#h2o.export_file(res, path = "C:\\Data_Science\\Competitions\\MachineHack-2021\\my_submission.csv", force = True)

# Convert to Pandas dataframe
# Save as .CSV file (index=False keeps the row index out of the file)
res_as_df = h2o.as_list(res, use_pandas=True)
res_as_df.to_csv('C:\\Data_Science\\Competitions\\MachineHack-2021\\my_submission.csv', index=False)

In [None]:
# Save the leader model to disk with h2o.save_model (the "binary format" option)
h2o.save_model(aml.leader, path = "C:\\Data_Science\\Competitions\\MachineHack-2021\\h20_model_bin")

In [None]:
# Also save the leader model in MOJO format (the option suggested for production use)
aml.leader.download_mojo(path = "C:\\Data_Science\\Competitions\\MachineHack-2021")

#### Ensemble Exploration

    To understand how the ensemble works, let's take a peek inside the Stacked Ensemble "All Models" model. The "All Models" ensemble is an ensemble of all of the individual models in the AutoML run. This is often the top performing model on the leaderboard.

In [None]:
# Get model ids for all models in the AutoML Leaderboard
model_ids = list(aml.leaderboard['model_id'].as_data_frame().iloc[:,0])

# Get the "All Models" Stacked Ensemble model.
# The leaderboard may not contain such an ensemble, so fail with an explicit
# message instead of an opaque IndexError from indexing an empty list.
all_models_ensemble_ids = [mid for mid in model_ids if "StackedEnsemble_AllModels" in mid]
if not all_models_ensemble_ids:
    raise ValueError(
        "No 'StackedEnsemble_AllModels' model found in the AutoML leaderboard; "
        "cannot inspect the ensemble metalearner."
    )
se = h2o.get_model(all_models_ensemble_ids[0])

# Get the Stacked Ensemble metalearner model
metalearner = h2o.get_model(se.metalearner()['name'])

Examine the variable importance of the metalearner (combiner) algorithm in the ensemble. This shows us how much each base learner is contributing to the ensemble. The AutoML Stacked Ensembles use the default metalearner algorithm (GLM with non-negative weights), so the variable importance of the metalearner is actually the standardized coefficient magnitudes of the GLM.

In [None]:
# Standardized coefficients of the metalearner (how much each base learner contributes)
metalearner.coef_norm()

In [None]:
# We can also plot the base learner contributions to the ensemble
# (the metalearner's standardized coefficients).
%matplotlib inline
metalearner.std_coef_plot()

#### XGBoost Regressor

In [None]:
import xgboost
from math import sqrt
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [None]:
# Split the engineered training data into the feature matrix and the target.
# Index.difference returns the remaining labels in sorted order, so the column
# order of X_train may differ from the column order of train_as_df.
feature_columns = train_as_df.columns.difference(['Sales'])
X_train = train_as_df[feature_columns]
y_train = train_as_df[['Sales']]  # kept as a single-column DataFrame

In [None]:
# Hypertune XGBoost
# Every hyper-parameter below currently has a single candidate value, so the grid
# search evaluates exactly one configuration (with 2-fold cross-validation).
# Add more values to the lists to actually search.
# Author's note: To improve, reduce max depth to 3, increase estimators to 1600 = 89.13%

parameters = {
    'nthread': [4],
    'objective': ['reg:squarederror'],
    'learning_rate': [0.01],
    'max_depth': [5],
    'min_child_weight': [3],
    'subsample': [1],
    'colsample_bytree': [1],
    'booster': ['gbtree'],
    'n_estimators': [1500],
}

model = GridSearchCV(
    estimator=xgboost.XGBRegressor(),
    param_grid=parameters,
    cv=2,
    n_jobs=5,
    verbose=False,
)
model.fit(X_train, y_train)

In [None]:
# Predict training data using model.
# Note: these are the same rows the model was fitted on, so the RMSE printed below
# is an in-sample (training) error, not an estimate of performance on unseen data.

y_pred=model.predict(X_train)
print(y_pred) 

mse = mean_squared_error(y_train, y_pred)
rmse = sqrt(mse)  # root of the mean squared error
print(rmse)

In [None]:
# Predict test data using model.
# The model was fitted on X_train, whose columns were selected with Index.difference
# (sorted order) from the one-hot encoded training data. Reindex the test features to
# exactly those columns, in that order, so that prediction is not broken by a different
# column order or by dummy columns that exist in only one of the two datasets
# (columns absent from the test data are filled with 0).

X_test = test_as_df.reindex(columns=X_train.columns, fill_value=0)

y_pred=model.predict(X_test)
print(y_pred)

In [None]:
# Create a dataframe with predicted values
# Give the column name as 'Sales'
# Note: this rebinds `res`, which earlier held the H2O AutoML predictions.

columns = ['Sales']
res = pd.DataFrame(y_pred,columns=columns)

In [None]:
# Save the final predictions file (XGBoost predictions; index=False keeps the row index out of the CSV)

res.to_csv('C:\\Data_Science\\Competitions\\MachineHack-2021\\my_submission1.csv', index=False)