# First xgb Baseline Model w/o Preprocessing

*Notes:*
- rmsle error is not included in the xgboost 0.9 release. A nightly wheel with v1.0 can be installed to use it.
- My machine operates with 16GB RAM and it's already getting maxed out. GC before starting to train is helpful
- Issues while training occur. eval only shows NaN predictions after 3 iterations. Not to sure why. Could be the experimental xgb build, default parameters or just faulty/crappy data

In [14]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
import gc

### Load data from csv

In [2]:
building_df = pd.read_csv("../data/raw/building_metadata.csv")
weather_train = pd.read_csv("../data/raw/weather_train.csv")
train = pd.read_csv("../data/raw/train.csv")

### Join the single dataframes

In [3]:
train = train.merge(building_df, left_on = "building_id", right_on = "building_id", how = "left")
train = train.merge(weather_train, left_on = ["site_id", "timestamp"], right_on = ["site_id", "timestamp"])
del weather_train

### Introduce function for reducing in-memory size of data set

In [8]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [5]:
train = reduce_mem_usage(train)

Mem. usage decreased to 1036.44 Mb (60.3% reduction)


#### Which columns are in the set?

In [6]:
train.columns

Index(['building_id', 'meter', 'timestamp', 'meter_reading', 'site_id',
       'primary_use', 'square_feet', 'year_built', 'floor_count',
       'air_temperature', 'cloud_coverage', 'dew_temperature',
       'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
       'wind_speed'],
      dtype='object')

#### Isolate timestamp elements

*Note: I didn't add year as single feature as it is not recurrent and not useful in future implementations*

In [7]:
train["timestamp"] = pd.to_datetime(train["timestamp"])
train["hour"] = train["timestamp"].dt.hour
train["weekday"] = train["timestamp"].dt.weekday
train["month"] = train["timestamp"].dt.month

del train["timestamp"]

#### Encode categorical data

*Note: nominal features should probably be one-hot-encoded. I got out of memory errors on the first try so maybe we are looking for better code/approach*

In [8]:
nominal_cols = ["building_id", "meter", "site_id", "primary_use"]
ordinal_cols = ["hour", "weekday", "month"]

#one_hot = OneHotEncoder(categorical_features=nominal_cols)
#train = one_hot.fit_transform(train)
train[ordinal_cols + nominal_cols] = train[ordinal_cols + nominal_cols].apply(LabelEncoder().fit_transform)

#### Substitute NaNs with 0

In [9]:
train = train.fillna(0)
train.isnull().sum().sum()

0

In [10]:
train.head

<bound method NDFrame.head of           building_id  meter  meter_reading  site_id  primary_use  \
0                   0      0       0.000000        0            0   
1                   1      0       0.000000        0            0   
2                   2      0       0.000000        0            0   
3                   3      0       0.000000        0            0   
4                   4      0       0.000000        0            0   
...               ...    ...            ...      ...          ...   
20125600         1444      0       8.750000       15            1   
20125601         1445      0       4.825000       15            0   
20125602         1446      0       0.000000       15            1   
20125603         1447      0     159.574997       15            4   
20125604         1448      0       2.850000       15            6   

          square_feet  year_built  floor_count  air_temperature  \
0                7432      2008.0          0.0        25.000000   
1      

### Build Model

In [11]:
y_train = train["meter_reading"]
del train["meter_reading"]

In [12]:
X_train, X_test, y_train, y_test = train_test_split(train, y_train, test_size=0.2, random_state=420)

train_dmatrix = xgb.DMatrix(data=X_train,label=y_train)
test_dmatrix = xgb.DMatrix(data=X_test,label=y_test)

del train, X_train, X_test, y_train, y_test

In [13]:
params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "booster": "gbtree",
    "verbosity": "1",
}
num_boost_round = 100
evals = [(test_dmatrix,'eval'), (train_dmatrix,'train')]

In [14]:
xgb_model = xgb.train(params=params,
                      dtrain=train_dmatrix,
                      num_boost_round=num_boost_round,
                      evals=evals,
                      verbose_eval=True,
                      early_stopping_rounds=10)

[0]	eval-rmse:128318.83594	train-rmse:132541.01562
Multiple eval metrics have been passed: 'train-rmse' will be used for early stopping.

Will train until train-rmse hasn't improved in 10 rounds.
[1]	eval-rmse:116805.26562	train-rmse:120230.08594
[2]	eval-rmse:110322.70312	train-rmse:113510.90625
[3]	eval-rmse:103635.53125	train-rmse:106681.41406
[4]	eval-rmse:98526.79688	train-rmse:101670.88281
[5]	eval-rmse:95082.10938	train-rmse:98099.51562
[6]	eval-rmse:92400.90625	train-rmse:95546.78125
[7]	eval-rmse:90657.03125	train-rmse:93773.64844
[8]	eval-rmse:89482.22656	train-rmse:92328.28906
[9]	eval-rmse:88398.79688	train-rmse:91257.91406
[10]	eval-rmse:85888.03906	train-rmse:88818.44531
[11]	eval-rmse:84572.00000	train-rmse:87542.23438
[12]	eval-rmse:83244.69531	train-rmse:86275.74219
[13]	eval-rmse:82830.67188	train-rmse:85889.92188
[14]	eval-rmse:82727.47656	train-rmse:85782.89062
[15]	eval-rmse:82394.21875	train-rmse:85407.80469
[16]	eval-rmse:82073.42188	train-rmse:85116.96094
[17]	e

In [15]:
xgb_model.save_model("../models/0001.model")

## Test model on submission data

In [1]:
del train_dmatrix, test_dmatrix

NameError: name 'train_dmatrix' is not defined

In [5]:
xgb_model = xgb.Booster()
xgb_model.load_model("0001.model")

In [10]:
building_df = pd.read_csv("../data/raw/building_metadata.csv")
weather_test = pd.read_csv("../data/raw/weather_test.csv")
test = pd.read_csv("../data/raw/test.csv")

test = test.merge(building_df, left_on = "building_id", right_on = "building_id", how = "left")
test = test.merge(weather_test, left_on = ["site_id", "timestamp"], right_on = ["site_id", "timestamp"])
del weather_test

test = reduce_mem_usage(test)

test["timestamp"] = pd.to_datetime(test["timestamp"])
test["hour"] = test["timestamp"].dt.hour
test["weekday"] = test["timestamp"].dt.weekday
test["month"] = test["timestamp"].dt.month

del test["timestamp"]
nominal_cols = ["building_id", "meter", "site_id", "primary_use"]
ordinal_cols = ["hour", "weekday", "month"]

test[ordinal_cols + nominal_cols] = test[ordinal_cols + nominal_cols].apply(LabelEncoder().fit_transform)

row_ids = test["row_id"]
del test["row_id"]

Mem. usage decreased to 2137.11 Mb (60.3% reduction)


In [18]:
gc.collect()

67

In [19]:
test_dmatrix = xgb.DMatrix(test)
del test
y_test = xgb_model.predict(test_dmatrix)

submission = pd.DataFrame({"row_id": row_ids, "meter_reading": y_test})
submission.to_csv("submission.csv", index=False)

MemoryError: Unable to allocate array with shape (7, 13832857) and data type int64