In [1]:
import pandas as pd
import numpy as np
from  sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import explained_variance_score,mean_squared_error,mean_absolute_percentage_error
from datetime import timedelta

In [2]:
def load_data():
    raw_df=[]
    df=pd.read_excel(r'C:\Users\Chandan\Desktop\Analog_Reports\report_ss4_CheLab_2.xlsx',
                skiprows=6,parse_dates={'Time':[0]},index_col='Time',skipfooter=1,sheet_name=None)
    for key,dframe in df.items():
        dframe.drop(dframe.tail(1).index,inplace=True)
        raw_df.append(dframe[['KW']])
    raw_df=pd.concat(raw_df)
    raw_df.index=pd.to_datetime(raw_df.index,format='%d-%m-%Y %H:%M')
    return raw_df

def load_weather():
    weather = pd.read_csv(r'C:\Users\Chandan\Desktop\Analog_Reports\kanpur_weather.csv',
                             header=24)
    LIST=[]
    for i in range(weather.shape[0]):
        if weather.HR.values[i] < 10:
            string = f"{weather.YEAR.values[i]}-{weather.MO.values[i]}-{weather.DY.values[i]} {0}{weather.HR.values[i]}"
        else:
            string = f"{weather.YEAR.values[i]}-{weather.MO.values[i]}-{weather.DY.values[i]} {weather.HR.values[i]}"

        LIST.append(pd.to_datetime(string,format='%Y-%m-%d %H'))

    weather['time']=LIST
    weather=weather.set_index(weather['time'],drop=False)
    weather.drop(['YEAR','MO','DY','HR','time'],axis=1,inplace=True)
    return weather

def fill_missing(df,neighbours=6):
    start,end=df.index[0],df.index[-1]
    index=pd.date_range(start=start,end=end,freq='30T')
    t_df=pd.DataFrame({'val':np.NaN},index=index)
    result = df.join(t_df, how="outer").drop(['val'],axis=1)
    imputer = KNNImputer(n_neighbors=neighbours)
    imputed = imputer.fit_transform(result)
    return pd.DataFrame({'KW':imputed.flatten()},index=index)

def adding_hour_feature(data,hour_lags=5):
    data_index= data.index
    for i in range(1,(hour_lags+1)):
        for index in data_index:
            if index-timedelta(hours=i) in data.index:
                data.loc[index,f"t_{i}"]=data.loc[index-timedelta(hours=i),'KW']
            else:
                data.loc[index,f"t_{i}"]=data.loc[index-timedelta(hours=i)+timedelta(days=1),'KW']
    return data

def adding_day_feature(data,day_lags=1):
    data_index= data.index
    for i in range(1,(day_lags+1)):
        for index in data_index:
            if index-timedelta(days=i) in data.index:
                a = data.loc[index-timedelta(hours=24*i),'KW']
                b = data.loc[index-timedelta(hours=24*i-1),'KW']
                if index-timedelta(hours=24*i+1) in data.index:
                    c = data.loc[index-timedelta(hours=24*i+1),'KW'] 
                else:
                    c= data.loc[index-timedelta(hours=24*i),'KW']
                data.loc[index,f"d_{i}"] = np.mean((a,b,c)) 
            else:
                data.loc[index,f"d_{i}"] = data.loc[index-timedelta(days=i)+timedelta(weeks=1),'KW']
    return data

def add_timing_feature(dff,hour_lags=5,day_lags=1):
                              
    dff = adding_hour_feature(dff,hour_lags=hour_lags)
    dff = adding_day_feature(dff,day_lags=day_lags)  
                              
    dff['day_of_week']=dff.index.day_name()
    dff['hour']=dff.index.hour
    
    # adding cyclicity feature...
    #dff['hour_cos'] = np.cos(2 * np.pi * dff['hour'] / 24)
    #dff['hour_sin'] = np.sin(2 * np.pi * dff['hour'] / 24)
    
    # adding hour wise mean...
    for hour,hour_df in dff.groupby('hour'):
        dff.loc[dff[dff.hour==hour].index,'hour_mean']=hour_df[-7:].KW.mean()
    
    #for daywise mean....
    for day,day_df in dff.groupby('day_of_week'):
        dff.loc[dff[dff.day_of_week==day].index,'week_mean'] = day_df[-5:].KW.mean()
        
    return dff.drop(columns=['day_of_week','hour'],axis=1)

In [3]:
raw_df=load_data()
imputed_df = fill_missing(raw_df)
hourly_df=imputed_df.resample('1H').mean()
hourly_dff=add_timing_feature(hourly_df,hour_lags=6,day_lags=5)
weather_df=load_weather()
combine_df = pd.merge(hourly_dff, weather_df, left_index=True, right_index=True)
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_df = scaler.fit_transform(combine_df.drop(columns=['KW'],axis=1))

Applying PCA...

In [4]:
from sklearn.decomposition import PCA
pca = PCA(0.99)
X= scaled_df
X_pca = pca.fit_transform(X)
X_pca.shape

(3092, 12)

In [5]:
pca.explained_variance_ratio_

array([0.53827018, 0.14030178, 0.10644566, 0.06715712, 0.05913008,
       0.04423583, 0.01072645, 0.00841584, 0.0070201 , 0.00452925,
       0.00321742, 0.00289248])

In [6]:
X_pca

array([[ 1.39052080e+00, -4.58164647e-01,  6.56466102e-02, ...,
         2.75326924e-02, -2.02386744e-03,  2.86352504e-03],
       [ 1.39109152e+00, -4.86096934e-01,  6.30309930e-02, ...,
         3.08008676e-02,  1.62272791e-02, -1.70639332e-02],
       [ 1.39263705e+00, -5.17626413e-01,  6.06139989e-02, ...,
        -1.40818291e-03, -7.29730575e-03, -3.24299739e-02],
       ...,
       [-6.86528020e-01,  8.76367628e-01,  2.93244724e+00, ...,
        -9.33164476e-03,  2.43462548e-02, -5.18647074e-02],
       [-6.84128883e-01,  7.04777220e-01,  2.94844080e+00, ...,
        -7.26536606e-02,  7.55789426e-03, -3.52704652e-03],
       [-6.80657958e-01,  5.39490718e-01,  2.96069879e+00, ...,
        -1.07288852e-01, -1.13105291e-02,  1.27689089e-02]])

In [7]:
input_data = X_pca
target_data = combine_df.KW.values
inp_dim = int(0.7 * X_pca.shape[0])
train_input = input_data[:inp_dim]
train_target = target_data[:inp_dim]
test_input = input_data[inp_dim:]
test_target = target_data[inp_dim:]

In [8]:
%%time
base_model = RandomForestRegressor(n_jobs=-1, random_state=42)
base_model.fit(train_input, train_target)

Wall time: 909 ms


RandomForestRegressor(n_jobs=-1, random_state=42)

In [9]:
test_result = base_model.predict(test_input)
train_result = base_model.predict(train_input)
print(f"Training set model score....")
base_model.score(train_input, train_target)

Training set model score....


0.9829568003241742

In [10]:
print(f"Test set model score....")
base_model.score(test_input, test_target)

Test set model score....


0.8421762931920035

In [13]:
def print_summary():
    print(f"test...summary")
    print(f"explained_variance_score ={explained_variance_score(train_result,train_target)}")
    print(f"mean_squared_error ={mean_squared_error(train_result,train_target)}")
    print(f"mean_absolute_percentage_error ={mean_absolute_percentage_error(train_result,train_target)}")
print_summary()

test...summary
explained_variance_score =0.9811925116869648
mean_squared_error =1.7751955379930473
mean_absolute_percentage_error =0.01785985501244579


In [11]:
def print_summary():
    print(f"test...summary")
    print(f"explained_variance_score ={explained_variance_score(test_result,test_target)}")
    print(f"mean_squared_error ={mean_squared_error(test_result,test_target)}")
    print(f"mean_absolute_percentage_error ={mean_absolute_percentage_error(test_result,test_target)}")
print_summary()

test...summary
explained_variance_score =0.8313332868808613
mean_squared_error =13.173732231497295
mean_absolute_percentage_error =0.062120855565392756


importance_df = pd.DataFrame({
    'feature': combine_df.columns[1:],
    'importance': base_model.feature_importances_
}).sort_values('importance', ascending=False)

In [29]:
def get_score(max_estimator):
    acc_list=[]
    for estimator in max_estimator:
        model = RandomForestRegressor(random_state=42, n_jobs=-1, n_estimators=estimator)
        model.fit(train_input, train_target)
        test_result = model.predict(test_input)
        acc_list.append((f"estimator={estimator}",
                        f"test_acc={model.score(test_input, test_target)}",
                        f"mean_absolute_percentage_error ={mean_absolute_percentage_error(test_result,test_target)}"))
    return acc_list
get_score([90,100,110,120])

[('estimator=90',
  'test_acc=0.8426267012290392',
  'mean_absolute_percentage_error =0.062191532109316126'),
 ('estimator=100',
  'test_acc=0.8421762931920035',
  'mean_absolute_percentage_error =0.062120855565392756'),
 ('estimator=110',
  'test_acc=0.842256389861469',
  'mean_absolute_percentage_error =0.06220996482109334'),
 ('estimator=120',
  'test_acc=0.8423940502867032',
  'mean_absolute_percentage_error =0.062129082820372175')]

In [34]:
def get_score(max_depth):
    acc_list=[]
    for depth in max_depth:
        model=RandomForestRegressor(random_state=42, n_jobs=-1,n_estimators=100,max_depth=depth)
        model.fit(train_input, train_target)
        test_result = model.predict(test_input)
        acc_list.append((f"maxdepth={depth}",
                         f"test_acc={model.score(test_input, test_target)}",
                         f"mean_absolute_percentage_error ={mean_absolute_percentage_error(test_result,test_target)}"))
    return acc_list
get_score([30,35,40])


[('maxdepth=30',
  'test_acc=0.8421762931920035',
  'mean_absolute_percentage_error =0.06212085556539275'),
 ('maxdepth=35',
  'test_acc=0.8421762931920035',
  'mean_absolute_percentage_error =0.062120855565392756'),
 ('maxdepth=40',
  'test_acc=0.8421762931920035',
  'mean_absolute_percentage_error =0.06212085556539275')]

In [36]:
def get_score(max_leaf_node):
    acc_list=[]
    for leaf in max_leaf_node:
        model=RandomForestRegressor(random_state=42,
                                    n_jobs=-1,
                                    n_estimators=100,
                                    max_depth=30,
                                    max_leaf_nodes=2**leaf)
        model.fit(train_input, train_target)
        test_result = model.predict(test_input)
        acc_list.append((f"max_leaf_nodes=2**{leaf}",
                        f"test_acc={model.score(test_input, test_target)}",
                        f"mean_absolute_percentage_error ={mean_absolute_percentage_error(test_result,test_target)}"))
    return acc_list
get_score([10,15,20])

[('max_leaf_nodes=2**10',
  'test_acc=0.8440674361602364',
  'mean_absolute_percentage_error =0.06196448574495167'),
 ('max_leaf_nodes=2**15',
  'test_acc=0.8440892155382209',
  'mean_absolute_percentage_error =0.061959658379459624'),
 ('max_leaf_nodes=2**20',
  'test_acc=0.8440892155382209',
  'mean_absolute_percentage_error =0.061959658379459624')]

In [39]:
model=RandomForestRegressor(random_state=42,
                                    n_jobs=-1,
                                    n_estimators=100,
                                    max_depth=30,
                                    max_leaf_nodes=2**10,
                                   max_samples=0.5,
                               min_impurity_decrease=1e-6)
model.fit(train_input, train_target)
model.score(train_input, train_target),model.score(test_input, test_target)

(0.9510701877032838, 0.8417313582704616)