In [2]:
import pandas as pd

data = pd.read_csv('london_merged.csv')

In [3]:
data.head()

Unnamed: 0,timestamp,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0
3,2015-01-04 03:00:00,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0
4,2015-01-04 04:00:00,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0


In [48]:
import pandas as pd
def eda():
    data = pd.read_csv('./london_merged.csv')
    data['year'] = data['timestamp'].apply(lambda row: row[:4])
    data['month'] = data['timestamp'].apply(lambda row: row.split('-')[2][:2] )
    data['hour'] = data['timestamp'].apply(lambda row: row.split(':')[0][-2:] )

    # data.drop('timestamp', axis=1, inplace=True)

    # data['cnt'].describe()
    # data.corr().style.background_gradient(cmap='RdPu')

    #correlations between our target and other features
    corr_target = round(data.corr().iloc[0].sort_values(ascending=False), 3)
    # print(corr_target)

    return data


In [49]:
import numpy    as np
from numpy.testing._private.utils import decorate_methods
import pandas   as pd
import seaborn  as sb
import matplotlib.pyplot as plt
import sklearn  as skl
import time

from sklearn import pipeline      # Pipeline
from sklearn import preprocessing # OrdinalEncoder, LabelEncoder
from sklearn import impute
from sklearn import compose
from sklearn import model_selection # train_test_split
from sklearn import metrics         # accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import set_config


data = eda()

np.random.seed(0)

data['year'] = data['timestamp'].apply(lambda row: row[:4])
data['month'] = data['timestamp'].apply(lambda row: row.split('-')[2][:2] )
data['hour'] = data['timestamp'].apply(lambda row: row.split(':')[0][-2:] )

data.drop('timestamp', axis=1, inplace=True)


def data_enhancement():

    data = eda()
    data['t1_t2'] = data.apply(lambda row: (row['t1']+row['t2'])/2, axis=1)
    # data['hum_win'] = data.apply(lambda row: row['hum']*row['wind_speed'])
    data = data.drop(['t1'], axis=1)
    data = data.drop(['t2'], axis=1)
    
    gen_data = data
    
    return gen_data


In [51]:
def new_col():

    data = data_enhancement()
   
    y = data['cnt']
    x = data.drop(['cnt'], axis=1)

    cat_vars = ['season','is_weekend','is_holiday','year','month','weather_code']
    num_vars = ['t1_t2','hum','wind_speed'] # multicollinearity problem

    x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y,
                                        test_size=0.2,
                                        random_state=0  # Recommended for reproducibility
                                    )

    transformer = preprocessing.PowerTransformer()
    y_train = transformer.fit_transform(y_train.values.reshape(-1,1))
    y_test = transformer.transform(y_test.values.reshape(-1,1))

    num_4_treeModels = pipeline.Pipeline(steps=[
        ('imputer', impute.SimpleImputer(strategy='constant', fill_value=-9999)),
    ])

    cat_4_treeModels = pipeline.Pipeline(steps=[
        ('imputer', impute.SimpleImputer(strategy='constant', fill_value='missing')),
        ('ordinal', preprocessing.OrdinalEncoder()) # handle_unknown='ignore' ONLY IN VERSION 0.24
    ])

    tree_prepro = compose.ColumnTransformer(transformers=[
        ('num', num_4_treeModels, num_vars),
        ('cat', cat_4_treeModels, cat_vars),
    ], remainder='drop') # Drop other vars not specified in num_vars or cat_vars

    return x_train, x_test, y_train, y_test, tree_prepro

In [53]:
import numpy    as np
from numpy.testing._private.utils import decorate_methods
import pandas   as pd
import seaborn  as sb
import matplotlib.pyplot as plt
import sklearn  as skl
import time

from sklearn import pipeline      # Pipeline
from sklearn import preprocessing # OrdinalEncoder, LabelEncoder
from sklearn import impute
from sklearn import compose
from sklearn import model_selection # train_test_split
from sklearn import metrics         # accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import set_config

from sklearn.tree          import DecisionTreeRegressor
from sklearn.ensemble      import RandomForestRegressor
from sklearn.ensemble      import ExtraTreesRegressor
from sklearn.ensemble      import AdaBoostRegressor
from sklearn.ensemble      import GradientBoostingRegressor
from xgboost               import XGBRegressor
from lightgbm              import LGBMRegressor
from catboost              import CatBoostRegressor


def main():

    x_train, x_test, y_train, y_test, tree_prepro = new_col()

    tree_classifiers = {
  "Decision Tree": DecisionTreeRegressor(),
  "Extra Trees":   ExtraTreesRegressor(n_estimators=100),
  "Random Forest": RandomForestRegressor(n_estimators=100),
  "AdaBoost":      AdaBoostRegressor(n_estimators=100),
  "Skl GBM":       GradientBoostingRegressor(n_estimators=100),
  "XGBoost":       XGBRegressor(n_estimators=100),
  "LightGBM":      LGBMRegressor(n_estimators=100),
  "CatBoost":      CatBoostRegressor(n_estimators=100),
    }
    
    tree_classifiers = {name: pipeline.make_pipeline(tree_prepro, model) for name, model in tree_classifiers.items()}

    rang = abs(y_train.max()) + abs(y_train.min())

    results = pd.DataFrame({'Model': [], 'MSE': [], 'MAB': [], " % error": [], 'Time': []})

    for model_name, model in tree_classifiers.items():
        
        start_time = time.time()
        model.fit(x_train, y_train)
        total_time = time.time() - start_time
            
        pred = model.predict(x_test)
        
        results = results.append({"Model":    model_name,
                                "MSE": metrics.mean_squared_error(y_test, pred),
                                "MAB": metrics.mean_absolute_error(y_test, pred),
                                " % error": metrics.mean_squared_error(y_test, pred) / rang,
                                "Time":     total_time},
                                ignore_index=True)
    return results
main()

  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


Learning rate set to 0.403579
0:	learn: 0.9063881	total: 5.22ms	remaining: 517ms
1:	learn: 0.8506612	total: 10ms	remaining: 491ms
2:	learn: 0.8269483	total: 14.7ms	remaining: 475ms
3:	learn: 0.8103574	total: 18.9ms	remaining: 453ms
4:	learn: 0.8022581	total: 23ms	remaining: 438ms
5:	learn: 0.7956701	total: 27.5ms	remaining: 430ms
6:	learn: 0.7928462	total: 33.6ms	remaining: 446ms
7:	learn: 0.7898856	total: 38.1ms	remaining: 438ms
8:	learn: 0.7870869	total: 43.1ms	remaining: 435ms
9:	learn: 0.7825252	total: 52.3ms	remaining: 470ms
10:	learn: 0.7804358	total: 56.9ms	remaining: 460ms
11:	learn: 0.7783388	total: 62.2ms	remaining: 456ms
12:	learn: 0.7770159	total: 66.5ms	remaining: 445ms
13:	learn: 0.7751194	total: 70.9ms	remaining: 436ms
14:	learn: 0.7739744	total: 75.5ms	remaining: 428ms
15:	learn: 0.7721130	total: 79.9ms	remaining: 420ms
16:	learn: 0.7703520	total: 84.2ms	remaining: 411ms
17:	learn: 0.7684088	total: 88.4ms	remaining: 403ms
18:	learn: 0.7664659	total: 93.4ms	remaining: 39

Unnamed: 0,Model,MSE,MAB,% error,Time
0,Decision Tree,1.000974,0.725698,0.181615,0.154258
1,Extra Trees,0.566613,0.544645,0.102805,4.304508
2,Random Forest,0.556641,0.555369,0.100996,5.773753
3,AdaBoost,0.692954,0.662069,0.125728,0.479224
4,Skl GBM,0.623368,0.612744,0.113103,2.054904
5,XGBoost,0.590199,0.580939,0.107084,1.167431
6,LightGBM,0.582789,0.587469,0.10574,0.308609
7,CatBoost,0.590951,0.589399,0.107221,0.70204


In [None]:
data = eda()

def data_enhancement(data):
    
    gen_data = data
    
    for season in data['season'].unique():
        seasonal_data = gen_data[gen_data['season'] == season]
        hum_std = seasonal_data['hum'].mean()
        wind_speed_std = seasonal_data['wind_speed'].mean()
        t1_std = seasonal_data['t1'].mean()
        t2_std = seasonal_data['t2'].mean()
        
        for i in gen_data[gen_data['season'] == season].index:
            if np.random.randint(2) == 1:
                gen_data['hum'].values[i] += hum_std/10
            else:
                gen_data['hum'].values[i] -= hum_std/10
                
            if np.random.randint(2) == 1:
                gen_data['wind_speed'].values[i] += wind_speed_std/10
            else:
                gen_data['wind_speed'].values[i] -= wind_speed_std/10
                
            if np.random.randint(2) == 1:
                gen_data['t1'].values[i] += t1_std/10
            else:
                gen_data['t1'].values[i] -= t1_std/10
                
            if np.random.randint(2) == 1:
                gen_data['t2'].values[i] += t2_std/10
            else:
                gen_data['t2'].values[i] -= t2_std/10

    return gen_data
def mean():
    y = data['cnt']
    x = data.drop(['cnt'], axis=1)


    cat_vars = ['season','is_weekend','is_holiday','year','month','weather_code']
    num_vars = ['t1','t2','hum','wind_speed']


    x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y,
                                        test_size=0.2,
                                        random_state=0) # Recommended for reproducibility
                                    
    gen = data_enhancement(data)
    extra_sample = gen.sample(gen.shape[0] // 3)
    x_train = pd.concat([x_train, extra_sample.drop(['cnt'], axis=1 ) ])
    y_train = pd.concat([y_train, extra_sample['cnt'] ])


    transformer = preprocessing.PowerTransformer()
    y_train = transformer.fit_transform(y_train.values.reshape(-1,1))
    y_test = transformer.transform(y_test.values.reshape(-1,1))

    num_4_treeModels = pipeline.Pipeline(steps=[
        ('imputer', impute.SimpleImputer(strategy='constant', fill_value=-9999)),
    ])

    cat_4_treeModels = pipeline.Pipeline(steps=[
        ('imputer', impute.SimpleImputer(strategy='constant', fill_value='missing')),
        ('ordinal', preprocessing.OrdinalEncoder()) # handle_unknown='ignore' ONLY IN VERSION 0.24
    ])

    tree_prepro = compose.ColumnTransformer(transformers=[
        ('num', num_4_treeModels, num_vars),
        ('cat', cat_4_treeModels, cat_vars),
    ], remainder='drop') # Drop other vars not specified in num_vars or cat_vars

    return x_train, x_test, y_train, y_test, tree_prepro

In [58]:

def main():

    x_train, x_test, y_train, y_test, tree_prepro = mean()

    tree_classifiers = {
  "Decision Tree": DecisionTreeRegressor(),
  "Extra Trees":   ExtraTreesRegressor(n_estimators=100),
  "Random Forest": RandomForestRegressor(n_estimators=100),
  "AdaBoost":      AdaBoostRegressor(n_estimators=100),
  "Skl GBM":       GradientBoostingRegressor(n_estimators=100),
  "XGBoost":       XGBRegressor(n_estimators=100),
  "LightGBM":      LGBMRegressor(n_estimators=100),
  "CatBoost":      CatBoostRegressor(n_estimators=100),
    }
    
    tree_classifiers = {name: pipeline.make_pipeline(tree_prepro, model) for name, model in tree_classifiers.items()}

    rang = abs(y_train.max()) + abs(y_train.min())

    results = pd.DataFrame({'Model': [], 'MSE': [], 'MAB': [], " % error": [], 'Time': []})

    for model_name, model in tree_classifiers.items():
        
        start_time = time.time()
        model.fit(x_train, y_train)
        total_time = time.time() - start_time
            
        pred = model.predict(x_test)
        
        results = results.append({"Model":    model_name,
                                "MSE": metrics.mean_squared_error(y_test, pred),
                                "MAB": metrics.mean_absolute_error(y_test, pred),
                                " % error": metrics.mean_squared_error(y_test, pred) / rang,
                                "Time":     total_time},
                                ignore_index=True)
    return results
main()

  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


Learning rate set to 0.42641
0:	learn: 0.8959402	total: 9.21ms	remaining: 912ms
1:	learn: 0.8544429	total: 20.8ms	remaining: 1.02s
2:	learn: 0.8305189	total: 29.2ms	remaining: 944ms
3:	learn: 0.8161688	total: 36.3ms	remaining: 872ms
4:	learn: 0.8085484	total: 42ms	remaining: 798ms
5:	learn: 0.8023113	total: 49.7ms	remaining: 779ms
6:	learn: 0.7987743	total: 56ms	remaining: 744ms
7:	learn: 0.7962099	total: 64.3ms	remaining: 740ms
8:	learn: 0.7927090	total: 70.2ms	remaining: 710ms
9:	learn: 0.7905620	total: 77.8ms	remaining: 700ms
10:	learn: 0.7891601	total: 83.6ms	remaining: 677ms
11:	learn: 0.7871446	total: 89ms	remaining: 653ms
12:	learn: 0.7854028	total: 97.3ms	remaining: 651ms
13:	learn: 0.7833379	total: 103ms	remaining: 634ms
14:	learn: 0.7816341	total: 112ms	remaining: 636ms
15:	learn: 0.7800866	total: 118ms	remaining: 619ms
16:	learn: 0.7794007	total: 126ms	remaining: 615ms
17:	learn: 0.7784678	total: 132ms	remaining: 601ms
18:	learn: 0.7774372	total: 137ms	remaining: 584ms
19:	l

Unnamed: 0,Model,MSE,MAB,% error,Time
0,Decision Tree,0.951126,0.70145,0.17234,0.250999
1,Extra Trees,0.542326,0.530034,0.098267,6.88113
2,Random Forest,0.538235,0.546057,0.097526,9.371135
3,AdaBoost,0.694285,0.663337,0.125802,0.695199
4,Skl GBM,0.623868,0.614214,0.113042,2.864668
5,XGBoost,0.572888,0.578116,0.103805,1.644849
6,LightGBM,0.572599,0.583833,0.103752,0.416423
7,CatBoost,0.585487,0.591034,0.106088,1.160964


In [55]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler as scaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import PowerTransformer

In [64]:
def pca_():
    data = eda()
    np.random.seed(0)

    data['year'] = data['timestamp'].apply(lambda row: row[:4])
    data['month'] = data['timestamp'].apply(lambda row: row.split('-')[2][:2] )
    data['hour'] = data['timestamp'].apply(lambda row: row.split(':')[0][-2:] )

    data.drop('timestamp', axis=1, inplace=True)
    y = data['cnt']
    x = data.drop(['cnt'], axis=1)

    pca = PCA(n_components=3)
    x_red_dim = pca.fit_transform(x)

    x_train, x_test, y_train, y_test = train_test_split(x_red_dim, y,
                                        test_size=0.2,
                                        random_state=0)  

    transformer = PowerTransformer()
    y_train = transformer.fit_transform(y_train.values.reshape(-1,1))
    y_test = transformer.transform(y_test.values.reshape(-1,1))

    return x_train, x_test, y_train, y_test

In [65]:

def main():

    x_train, x_test, y_train, y_test = pca_()

    tree_classifiers = {
  "Decision Tree": DecisionTreeRegressor(),
  "Extra Trees":   ExtraTreesRegressor(n_estimators=100),
  "Random Forest": RandomForestRegressor(n_estimators=100),
  "AdaBoost":      AdaBoostRegressor(n_estimators=100),
  "Skl GBM":       GradientBoostingRegressor(n_estimators=100),
  "XGBoost":       XGBRegressor(n_estimators=100),
  "LightGBM":      LGBMRegressor(n_estimators=100),
  "CatBoost":      CatBoostRegressor(n_estimators=100),
    }
    
    # tree_classifiers = {name: pipeline.make_pipeline(tree_prepro, model) for name, model in tree_classifiers.items()}

    rang = abs(y_train.max()) + abs(y_train.min())

    results = pd.DataFrame({'Model': [], 'MSE': [], 'MAB': [], " % error": [], 'Time': []})

    for model_name, model in tree_classifiers.items():
        
        start_time = time.time()
        model.fit(x_train, y_train)
        total_time = time.time() - start_time
            
        pred = model.predict(x_test)
        
        results = results.append({"Model":    model_name,
                                "MSE": metrics.mean_squared_error(y_test, pred),
                                "MAB": metrics.mean_absolute_error(y_test, pred),
                                " % error": metrics.mean_squared_error(y_test, pred) / rang,
                                "Time":     total_time},
                                ignore_index=True)
    return results
main()

  model.fit(x_train, y_train)
  model.fit(x_train, y_train)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


Learning rate set to 0.403579
0:	learn: 0.8980442	total: 7.73ms	remaining: 766ms
1:	learn: 0.8536649	total: 15.7ms	remaining: 772ms
2:	learn: 0.8369116	total: 22.1ms	remaining: 714ms
3:	learn: 0.8287218	total: 27.3ms	remaining: 655ms
4:	learn: 0.8243486	total: 33.3ms	remaining: 634ms
5:	learn: 0.8219111	total: 38.7ms	remaining: 606ms
6:	learn: 0.8200639	total: 43.7ms	remaining: 580ms
7:	learn: 0.8190432	total: 54.1ms	remaining: 622ms
8:	learn: 0.8178497	total: 60.2ms	remaining: 609ms
9:	learn: 0.8168945	total: 68ms	remaining: 612ms
10:	learn: 0.8158675	total: 72.9ms	remaining: 590ms
11:	learn: 0.8148374	total: 79.2ms	remaining: 581ms
12:	learn: 0.8138711	total: 83.5ms	remaining: 558ms
13:	learn: 0.8127710	total: 87.7ms	remaining: 539ms
14:	learn: 0.8122630	total: 92.6ms	remaining: 525ms
15:	learn: 0.8118789	total: 98.4ms	remaining: 516ms
16:	learn: 0.8113141	total: 103ms	remaining: 501ms
17:	learn: 0.8109341	total: 107ms	remaining: 487ms
18:	learn: 0.8104854	total: 114ms	remaining: 485

Unnamed: 0,Model,MSE,MAB,% error,Time
0,Decision Tree,1.218573,0.83366,0.221095,0.121069
1,Extra Trees,0.665176,0.630812,0.120688,2.802218
2,Random Forest,0.644323,0.621851,0.116905,5.50692
3,AdaBoost,0.709598,0.676454,0.128748,0.369066
4,Skl GBM,0.671385,0.647842,0.121815,1.765698
5,XGBoost,0.659533,0.638493,0.119664,1.764719
6,LightGBM,0.644024,0.630831,0.11685,0.161974
7,CatBoost,0.653337,0.6356,0.11854,0.721887
