In this kernel we compare the time taken to fit a model on the provided data with and without downcasting the column data types (which reduces the size of the input dataframes by more than half). We run an LGBM classifier in both settings and compare the K-fold OOF AUC scores and the total run time to see whether there is any visible advantage.

In [None]:
import numpy as np
import pandas as pd
import gc
import time
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
# Number of stratified CV folds used by every experiment in this notebook.
num_folds = 10
# Apply the FiveThirtyEight look to all matplotlib figures drawn below.
plt.style.use('fivethirtyeight')

## Training on data **without** reducing memory

In [None]:
# Read both competition files, leaving pandas to choose the column dtypes.
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')

# Detach the label as a NumPy array, then keep only the feature columns.
train_labels = np.array(train_df.pop('target'))
train_df = train_df.drop(columns=['ID_code'])
test_df = test_df.drop(columns=['ID_code'])

# Row positions handed to the splitter, plus zero-initialised buffers for the
# out-of-fold and test-set predictions.
train_df_ids = np.array(train_df.index)
oof_preds = np.zeros(len(train_df))
y_test_preds = np.zeros(len(test_df))

# Fixed seed so that a later run can reproduce the same fold assignment.
folds = StratifiedKFold(
    n_splits=num_folds,
    random_state=42,
    shuffle=True,
)

In [None]:
# Per-fold wall-clock time and validation AUC for the run on the original
# (non-downcast) dataframe.
old_train_times = []
old_val_auc_scores = []

for fold, (train_idx, val_idx) in enumerate(folds.split(train_df_ids, train_labels)):
    # The timed interval covers slicing, fitting and both prediction calls.
    start_time = time.time()
    print('\nFold {}'.format(fold))
    X_train, y_train = train_df.values[train_idx], train_labels[train_idx]
    X_val, y_val = train_df.values[val_idx], train_labels[val_idx]

    model = LGBMClassifier(metric='auc', objective='binary', n_jobs=-1, n_estimators=1000)
    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              verbose=0,
              early_stopping_rounds=100)

    # Positive-class probability for the held-out rows of this fold.
    fold_preds = model.predict_proba(X_val)[:, 1]
    oof_preds[val_idx] = fold_preds
    # Add this fold's equal share so that y_test_preds ends up as the mean
    # predicted probability over all folds rather than their sum.
    y_test_preds += model.predict_proba(test_df)[:, 1] / num_folds

    fold_auc = roc_auc_score(y_val, fold_preds)
    old_val_auc_scores.append(fold_auc)
    old_train_times.append(time.time() - start_time)

In [None]:
# AUC over the whole training set, scoring the out-of-fold predictions that
# were written into oof_preds against the training labels.
old_oof_auc = roc_auc_score(train_labels, oof_preds)

## Training on data **with** reduced memory

In [None]:
# Taken from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage (Home Credit Default Risk competition)
def reduce_mem_usage(df, use_float16=True):
    """Downcast the numeric columns of a dataframe to smaller dtypes.

    Every NumPy-typed signed-integer, unsigned-integer and float column is
    converted to the narrowest dtype of the same family whose representable
    range contains the column's minimum and maximum (bounds inclusive).
    Columns of any other dtype (object, bool, datetime, categorical, pandas
    extension types, ...) are left untouched, and no column is ever widened.

    The range check only guarantees that values do not overflow. Float
    downcasts still round: half precision keeps roughly three significant
    decimal digits, single precision roughly seven.

    Args:
        df: DataFrame to shrink. It is modified in place.
        use_float16: When True (the default), float columns whose range fits
            are stored as float16. Pass False to stop at float32 and keep
            more precision.

    Returns:
        The same ``df`` object, after conversion.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes per NumPy kind code, ordered narrowest first.
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)
    candidates_by_kind = {
        'i': (np.iinfo, (np.int8, np.int16, np.int32)),
        'u': (np.iinfo, (np.uint8, np.uint16, np.uint32)),
        'f': (np.finfo, float_candidates),
    }

    for col in df.columns:
        col_type = df[col].dtype

        # Skip anything that is not a plain NumPy int/uint/float column;
        # min/max range checks are not meaningful for the other dtypes.
        if not isinstance(col_type, np.dtype) or col_type.kind not in candidates_by_kind:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        limits_of, candidates = candidates_by_kind[col_type.kind]

        for candidate in candidates:
            # Candidates only get wider from here, so stop once they are no
            # smaller than the dtype the column already has.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            # Inclusive bounds: the extreme values of a dtype are valid.
            # NaN min/max (empty or all-NaN columns) fail both comparisons,
            # which leaves such columns unchanged.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero starting size so the percentage is always finite.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [None]:
# Re-read the same CSV files, this time passing each dataframe through
# reduce_mem_usage before anything else touches it.
train_df = pd.read_csv('../input/train.csv')
train_df = reduce_mem_usage(train_df)
test_df = pd.read_csv('../input/test.csv')
test_df = reduce_mem_usage(test_df)

# Detach the label as a NumPy array, then keep only the feature columns.
train_labels = np.array(train_df.pop('target'))
train_df = train_df.drop(columns=['ID_code'])
test_df = test_df.drop(columns=['ID_code'])

# Fresh row positions and zeroed prediction buffers for the second run.
train_df_ids = np.array(train_df.index)
oof_preds = np.zeros(len(train_df))
y_test_preds = np.zeros(len(test_df))

# Same splitter configuration and seed as the baseline run.
folds = StratifiedKFold(
    n_splits=num_folds,
    random_state=42,
    shuffle=True,
)

### Dataframe sizes are reduced by >70%!

In [None]:
# Per-fold wall-clock time and validation AUC for the run on the
# memory-reduced dataframe.
new_train_times = []
new_val_auc_scores = []

for fold, (train_idx, val_idx) in enumerate(folds.split(train_df_ids, train_labels)):
    # The timed interval covers slicing, fitting and both prediction calls.
    start_time = time.time()
    print('\nFold {}'.format(fold))
    X_train, y_train = train_df.values[train_idx], train_labels[train_idx]
    X_val, y_val = train_df.values[val_idx], train_labels[val_idx]

    model = LGBMClassifier(metric='auc', objective='binary', n_jobs=-1, n_estimators=1000)
    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              verbose=0,
              early_stopping_rounds=100)

    # Positive-class probability for the held-out rows of this fold.
    fold_preds = model.predict_proba(X_val)[:, 1]
    oof_preds[val_idx] = fold_preds
    # Add this fold's equal share so that y_test_preds ends up as the mean
    # predicted probability over all folds rather than their sum.
    y_test_preds += model.predict_proba(test_df)[:, 1] / num_folds

    fold_auc = roc_auc_score(y_val, fold_preds)
    new_val_auc_scores.append(fold_auc)
    new_train_times.append(time.time() - start_time)

In [None]:
# AUC over the whole training set, scoring the out-of-fold predictions that
# were written into oof_preds against the training labels.
new_oof_auc = roc_auc_score(train_labels, oof_preds)

In [None]:
# One row per fold; the columns hold the timing and validation AUC lists
# collected for the original ("Old") and memory-reduced ("New") runs.
fold_metrics = {
    'Old time': old_train_times,
    'New time': new_train_times,
    'Old val AUC': old_val_auc_scores,
    'New val AUC': new_val_auc_scores,
}
results_df = pd.DataFrame(fold_metrics)
results_df.head()

### Validation AUC score analysis

In [None]:
# Line plot of the per-fold validation AUC columns, one line per run.
auc_columns = ['Old val AUC', 'New val AUC']
ax = results_df[auc_columns].plot(fontsize=20, figsize=(12, 6))
ax.set_title("AUC for OOF predictions", fontsize=20)
ax.set_ylabel('AUC Score')
ax.set_xlabel('Fold Number')
# One tick per fold index.
ax.set_xticks(range(num_folds))
sns.despine(left=True, bottom=True)

In [None]:
# Each OOF AUC is already a single score computed over the full training set,
# so it is printed directly instead of being wrapped in np.mean.
print('OOF AUC score for original dataframe: {}'.format(old_oof_auc))
print('OOF AUC score for new dataframe: {}'.format(new_oof_auc))

### Time analysis for model fitting

In [None]:
# Line plot of the per-fold timing columns, one line per run.
time_columns = ['Old time', 'New time']
ax = results_df[time_columns].plot(fontsize=20, figsize=(12, 6))
ax.set_title("Time taken for individual folds", fontsize=20)
ax.set_ylabel('Time taken in seconds')
ax.set_xlabel('Fold Number')
# One tick per fold index.
ax.set_xticks(range(num_folds))
sns.despine(left=True, bottom=True)

In [None]:
# Mean per-fold time for each run, computed first and then reported.
old_mean_time = np.mean(old_train_times)
new_mean_time = np.mean(new_train_times)
print('Average run time over original dataframe: {} seconds'.format(old_mean_time))
print('Average run time over new dataframe: {} seconds'.format(new_mean_time))

### There seems to be a slight drop in the AUC score with no tangible decrease in model execution time. I haven't tested this across different models and seeds, so it might not hold everywhere.