In [1]:
import pandas as pd
import numpy as np
from tsfresh.feature_extraction.feature_calculators import autocorrelation, linear_trend
from seglearn.feature_functions import mean_diff
import lightgbm as lgb
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.linear_model import LinearRegression

In [2]:
df=pd.read_csv(r'C:\Users\asus\Downloads\municipality_bus_utilization.csv')

In [3]:
df['timestamp']=pd.to_datetime(df['timestamp'])
df['date']=df['timestamp'].dt.date
df['hour']=df['timestamp'].dt.hour

In [4]:
a=df.groupby(['date','hour','municipality_id']).usage.max()
a=a.reset_index()
df=pd.merge(a,df,how='left',on=['date','hour','municipality_id','usage'])
df.drop_duplicates(subset=['date','hour','municipality_id','usage'],inplace=True)


In [5]:
df['date'] = pd.to_datetime(df.date)
#feature extraction
df['month'] = df['date'].dt.month
df['week_day'] = df['date'].dt.weekday
# hour of day converted sinus, cosines to avoid size problem for machine learning models 
df['hr_sin'] = np.sin(df['hour'] * (2. * np.pi / 24))
df['hr_cos'] = np.cos(df['hour']* (2. * np.pi / 24))

df.date = pd.to_datetime(df.date)
df.hour = df.hour.astype('str').str.pad(2, 'left', '0')
df.timestamp = df.date.astype('str') + ' ' + df.hour + ':00:00'
df.timestamp = pd.to_datetime(df.timestamp)

df.drop(['date','total_capacity'],axis=1, inplace=True)


In [6]:
def create_shift_col(df, col_name, n_shift, window=None, func=None, pct=False):

    new_col_name = col_name+'_shift_' + str(n_shift)

    df_unique = df[['timestamp', 'municipality_id', col_name]]
    df_unique = df_unique.groupby(['timestamp', 'municipality_id']).mean()

    pivot = df_unique.unstack('municipality_id').resample('H').asfreq()
    pivot.dropna(axis=0,inplace=True)
    
    if pct:
        pivot = pivot.pct_change(1)
        new_col_name = col_name +'_pct_shift_' + str(n_shift)
    if window:
        
        pivot = pivot.rolling(window).apply(func, raw=True)
        new_col_name = '{}_{}_{}'.format(col_name, func.__name__, window)
        
    pivot = pivot.shift(n_shift)
    pivot.fillna(method='ffill', limit=50, inplace=True)

    stacked = pivot.stack('municipality_id')[col_name].rename(new_col_name)
    df = df.join(stacked, on=['timestamp', 'municipality_id'])

    df[new_col_name] = df.groupby(['municipality_id'])[new_col_name].transform(lambda x: x.fillna(x.mean()))
    df[new_col_name].fillna(inplace=True, method='ffill')

    return df

def linear_trend_func(x):
    return linear_trend(x, [{'attr': 'slope'}])[0][1]


def autocorrelation_func(x):
    return np.nan_to_num(autocorrelation(x, 1))

def mean_diff_func(x):
    return mean_diff([x])[0]


In [7]:
pivot=df.groupby(['timestamp','municipality_id']).usage.mean()

In [8]:
# df = create_shift_col(df, 'usage', 1, pct=True)
df = create_shift_col(df, 'usage', 1)
df = create_shift_col(df, 'usage', 2)
df = create_shift_col(df, 'usage', 1, 2, linear_trend_func)
df = create_shift_col(df, 'usage', 2, 3, linear_trend_func)
df = create_shift_col(df, 'usage', 1, 2, mean_diff_func)
df = create_shift_col(df, 'usage', 1, 2, autocorrelation_func)


In [9]:
train_1 = df[df.timestamp<pd.Timestamp(2017,8,5)]
test_1 = df[(df.timestamp>=pd.Timestamp(2017,8,5))&(df.timestamp<pd.Timestamp(2017,8,13))]

# Transformations
y = train_1['usage']
x = train_1.drop(columns=["timestamp",'usage'])


ohe_cols = ['hour', 'municipality_id', 'month', 'week_day']
numeric_cols = list(set(x.columns) - set(ohe_cols))
col_transformer = ColumnTransformer(transformers=[('mm', MinMaxScaler(), numeric_cols),
                                                  ('ohe', OneHotEncoder(drop='first'), ohe_cols)],
                                    remainder='passthrough')
x = col_transformer.fit_transform(x)

mmscaler = MinMaxScaler()
y = mmscaler.fit_transform(y.values.reshape(-1, 1))


# Model Fit
mdl_basic = LinearRegression()

fit_model = mdl_basic.fit(x, y)
pred_past = mmscaler.inverse_transform((fit_model.predict(x)).reshape(-1, 1))

# # Train Accuracy First Week
y_train_1 = train_1['usage']
y_train_1 = pd.DataFrame(y_train_1)
y_train_1['prediction'] = pred_past
mae=mean_absolute_error(y_train_1['usage'],y_train_1['prediction'])
mse=mean_squared_error(y_train_1['usage'],y_train_1['prediction'])
print('------------------------------------------------------------')
print('train_basic mae: ',mae)
print('train_basic mse: ',mse)


# Test transformations and Prediction
y_test_1 = test_1['usage']
x_test_1 = test_1.drop(columns=["timestamp", 'usage'])
x_test_1 = col_transformer.transform(x_test_1)
pred_test = mmscaler.inverse_transform((fit_model.predict(x_test_1)).reshape(-1, 1))

# # Test Accuracy First Week
y_test_1 = pd.DataFrame(y_test_1)
y_test_1['prediction'] = pred_test

# ----------------------------------------------------------
train_2 = df[df.timestamp<pd.Timestamp(2017,8,13)]
test_2 = df[df.timestamp>=pd.Timestamp(2017,8,13)]

# Transformations
y = train_2['usage']
x = train_2.drop(columns=["timestamp",'usage'])


ohe_cols = ['hour', 'municipality_id', 'month', 'week_day']
numeric_cols = list(set(x.columns) - set(ohe_cols))
col_transformer = ColumnTransformer(transformers=[('mm', MinMaxScaler(), numeric_cols),
                                                  ('ohe', OneHotEncoder(drop='first'), ohe_cols)],
                                    remainder='passthrough')
x = col_transformer.fit_transform(x)

mmscaler = MinMaxScaler()
y = mmscaler.fit_transform(y.values.reshape(-1, 1))

# Model Fit
mdl_basic = LinearRegression()

fit_model = mdl_basic.fit(x, y)
pred_past = mmscaler.inverse_transform((fit_model.predict(x)).reshape(-1, 1))

# # Train Accuracy First Week
y_train_2 = train_2['usage']
y_train_2 = pd.DataFrame(y_train_2)
y_train_2['prediction'] = pred_past


# Test transformations and Prediction
y_test_2 = test_2['usage']
x_test_2 = test_2.drop(columns=["timestamp", 'usage'])
x_test_2 = col_transformer.transform(x_test_2)
pred_test = mmscaler.inverse_transform((fit_model.predict(x_test_2)).reshape(-1, 1))

# # Test Accuracy First Week
y_test_2 = pd.DataFrame(y_test_2)
y_test_2['prediction'] = pred_test


y_test=pd.concat([y_test_1,y_test_2],axis=0)

mae=mean_absolute_error(y_test['usage'],y_test['prediction'])
mse=mean_squared_error(y_test['usage'],y_test['prediction'])
print('------------------------------------------------------------')
print('test_basic mae: ',mae)
print('test_basic mse: ',mse)

------------------------------------------------------------
train_basic mae:  91.50155702748692
train_basic mse:  25024.064128801245
------------------------------------------------------------
test_basic mae:  98.0218141475194
test_basic mse:  24417.753251953927


In [10]:
train_1 = df[df.timestamp<pd.Timestamp(2017,8,5)]
test_1 = df[(df.timestamp>=pd.Timestamp(2017,8,5))&(df.timestamp<pd.Timestamp(2017,8,13))]

# Transformations
y = train_1['usage']
x = train_1.drop(columns=["timestamp",'usage'])


ohe_cols = ['hour', 'municipality_id', 'month', 'week_day']
numeric_cols = list(set(x.columns) - set(ohe_cols))
col_transformer = ColumnTransformer(transformers=[('mm', MinMaxScaler(), numeric_cols),
                                                  ('ohe', OneHotEncoder(drop='first'), ohe_cols)],
                                    remainder='passthrough')
x = col_transformer.fit_transform(x)

mmscaler = MinMaxScaler()
y = mmscaler.fit_transform(y.values.reshape(-1, 1))

# Model Fit
mdl = lgb.LGBMRegressor(colsample_bytree=0.5, max_depth=2, n_estimators=400,
                        objective='mae', random_state=501, reg_alpha=0.01, reg_lambda=1,
                        subsample=0.5)
fit_model = mdl.fit(x, y)
pred_past = mmscaler.inverse_transform((fit_model.predict(x)).reshape(-1, 1))

# # Train Accuracy First Week
y_train_1 = train_1['usage']
y_train_1 = pd.DataFrame(y_train_1)
y_train_1['prediction'] = pred_past
mae=mean_absolute_error(y_train_1['usage'],y_train_1['prediction'])
mse=mean_squared_error(y_train_1['usage'],y_train_1['prediction'])
print('------------------------------------------------------------')
print('train_lgb mae: ',mae)
print('train_lgb mse: ',mse)


# Test transformations and Prediction
y_test_1 = test_1['usage']
x_test_1 = test_1.drop(columns=["timestamp", 'usage'])
x_test_1 = col_transformer.transform(x_test_1)
pred_test = mmscaler.inverse_transform((fit_model.predict(x_test_1)).reshape(-1, 1))

# # Test Accuracy First Week
y_test_1 = pd.DataFrame(y_test_1)
y_test_1['prediction'] = pred_test

# ----------------------------------------------------------
train_2 = df[df.timestamp<pd.Timestamp(2017,8,13)]
test_2 = df[df.timestamp>=pd.Timestamp(2017,8,13)]

# Transformations
y = train_2['usage']
x = train_2.drop(columns=["timestamp",'usage'])


ohe_cols = ['hour', 'municipality_id', 'month', 'week_day']
numeric_cols = list(set(x.columns) - set(ohe_cols))
col_transformer = ColumnTransformer(transformers=[('mm', MinMaxScaler(), numeric_cols),
                                                  ('ohe', OneHotEncoder(drop='first'), ohe_cols)],
                                    remainder='passthrough')
x = col_transformer.fit_transform(x)

mmscaler = MinMaxScaler()
y = mmscaler.fit_transform(y.values.reshape(-1, 1))

# Model Fit

mdl = lgb.LGBMRegressor(colsample_bytree=0.5, max_depth=2, n_estimators=800,
                        objective='mae', random_state=501, reg_alpha=0.01, reg_lambda=1,
                        subsample=0.5)
fit_model = mdl.fit(x, y)
pred_past = mmscaler.inverse_transform((fit_model.predict(x)).reshape(-1, 1))

# # Train Accuracy First Week
y_train_2 = train_2['usage']
y_train_2 = pd.DataFrame(y_train_2)
y_train_2['prediction'] = pred_past


# Test transformations and Prediction
y_test_2 = test_2['usage']
x_test_2 = test_2.drop(columns=["timestamp", 'usage'])
x_test_2 = col_transformer.transform(x_test_2)
pred_test = mmscaler.inverse_transform((fit_model.predict(x_test_2)).reshape(-1, 1))

# # Test Accuracy First Week
y_test_2 = pd.DataFrame(y_test_2)
y_test_2['prediction'] = pred_test


y_test=pd.concat([y_test_1,y_test_2],axis=0)

mae=mean_absolute_error(y_test['usage'],y_test['prediction'])
mse=mean_squared_error(y_test['usage'],y_test['prediction'])
print('------------------------------------------------------------')
print('test_lgb mae: ',mae)
print('test_lgb mse: ',mse)


  return f(*args, **kwargs)


------------------------------------------------------------
train_lgb mae:  51.78339021325185
train_lgb mse:  9300.579211153214


  return f(*args, **kwargs)


------------------------------------------------------------
test_lgb mae:  59.229934657739314
test_lgb mse:  10374.5930932
