# Tree Based Methods....

Loading DATA..

In [1]:
import pandas as pd
import numpy as np
from  sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import explained_variance_score,mean_squared_error,mean_absolute_percentage_error
from datetime import timedelta

In [2]:
def load_data():
    raw_df=[]
    df=pd.read_excel(r'C:\Users\SR-19\Desktop\Analog_Reports\report_ss4_CheLab_2.xlsx',
                skiprows=6,parse_dates={'Time':[0]},index_col='Time',skipfooter=1,sheet_name=None)
    for key,dframe in df.items():
        dframe.drop(dframe.tail(1).index,inplace=True)
        raw_df.append(dframe[['KW']])
    raw_df=pd.concat(raw_df)
    raw_df.index=pd.to_datetime(raw_df.index,format='%d-%m-%Y %H:%M')
    return raw_df

def load_weather():
    weather = pd.read_csv(r'C:\Users\SR-19\Desktop\Analog_Reports\kanpur_weather.csv',
                             header=24)
    LIST=[]
    for i in range(weather.shape[0]):
        if weather.HR.values[i] < 10:
            string = f"{weather.YEAR.values[i]}-{weather.MO.values[i]}-{weather.DY.values[i]} {0}{weather.HR.values[i]}"
        else:
            string = f"{weather.YEAR.values[i]}-{weather.MO.values[i]}-{weather.DY.values[i]} {weather.HR.values[i]}"

        LIST.append(pd.to_datetime(string,format='%Y-%m-%d %H'))

    weather['time']=LIST
    weather=weather.set_index(weather['time'],drop=False)
    weather.drop(['YEAR','MO','DY','HR','time'],axis=1,inplace=True)
    return weather

def fill_missing(df,neighbours=6):
    start,end=df.index[0],df.index[-1]
    index=pd.date_range(start=start,end=end,freq='30T')
    t_df=pd.DataFrame({'val':np.NaN},index=index)
    result = df.join(t_df, how="outer").drop(['val'],axis=1)
    imputer = KNNImputer(n_neighbors=neighbours)
    imputed = imputer.fit_transform(result)
    return pd.DataFrame({'KW':imputed.flatten()},index=index)

def adding_hour_feature(data,hour_lags=5):
    data_index= data.index
    for i in range(1,(hour_lags+1)):
        for index in data_index:
            if index-timedelta(hours=i) in data.index:
                data.loc[index,f"t_{i}"]=data.loc[index-timedelta(hours=i),'KW']
            else:
                data.loc[index,f"t_{i}"]=data.loc[index-timedelta(hours=i)+timedelta(days=1),'KW']
    return data

def adding_day_feature(data,day_lags=1):
    data_index= data.index
    for i in range(1,(day_lags+1)):
        for index in data_index:
            if index-timedelta(days=i) in data.index:
                a = data.loc[index-timedelta(hours=24*i),'KW']
                b = data.loc[index-timedelta(hours=24*i-1),'KW']
                if index-timedelta(hours=24*i+1) in data.index:
                    c = data.loc[index-timedelta(hours=24*i+1),'KW'] 
                else:
                    c= data.loc[index-timedelta(hours=24*i),'KW']
                data.loc[index,f"d_{i}"] = np.mean((a,b,c)) 
            else:
                data.loc[index,f"d_{i}"] = data.loc[index-timedelta(days=i)+timedelta(weeks=1),'KW']
    return data
                              
def add_timing_feature(dff,hour_lags=5,day_lags=1):
                              
    dff = adding_hour_feature(dff,hour_lags=hour_lags)
    dff = adding_day_feature(dff,day_lags=day_lags)  
                              
    dff['day_of_week']=dff.index.day_name()
    dff['hour']=dff.index.hour
   
    # adding hour wise mean...
    for hour,hour_df in dff.groupby('hour'):
        dff.loc[dff[dff.hour==hour].index,'hour_mean']=hour_df[-7:].KW.mean()
    
    #for daywise mean....
    for day,day_df in dff.groupby('day_of_week'):
        dff.loc[dff[dff.day_of_week==day].index,'week_mean'] = day_df[-5:].KW.mean()
        
    return dff.drop(columns=['day_of_week','hour'],axis=1)


def adding_week_feature(week_lags=4,data):
    global date_index
    for index in date_index:
        if index-timedelta(weeks=week) in new_data.index:
            data.loc[index,f"week-{week}"]=data.loc[index-timedelta(weeks=week),'Global_active_power']
        else:
            data.loc[index,f"week-{week}"]=data.loc[index+timedelta(weeks=week),'Global_active_power']
    return data

In [3]:
raw_df=load_data()
imputed_df = fill_missing(raw_df)
hourly_df=imputed_df.resample('1H').mean()
hourly_dff=add_timing_feature(hourly_df,hour_lags=6,day_lags=5)
weather_df=load_weather()
combine_df = pd.merge(hourly_dff, weather_df, left_index=True, right_index=True)
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_df = scaler.fit_transform(combine_df)

In [4]:
input_data = scaled_df[:,1:]
target_data = scaled_df[:,0]
inp_dim = int(0.7 * scaled_df.shape[0])
train_input = input_data[:inp_dim]
train_target = target_data[:inp_dim]
test_input = input_data[inp_dim:]
test_target = target_data[inp_dim:]

Base Model.....

In [5]:
%%time
base_model = RandomForestRegressor(n_jobs=-1, random_state=42)
base_model.fit(train_input, train_target)

Wall time: 728 ms


RandomForestRegressor(n_jobs=-1, random_state=42)

In [6]:
test_result = base_model.predict(test_input)
train_result = base_model.predict(train_input)
print(f"Training set model score....")
base_model.score(train_input, train_target)

Training set model score....


0.9881945163347673

In [7]:
print(f"Test set model score....")
base_model.score(test_input, test_target)

Test set model score....


0.9068675819298846

In [8]:
def print_summary():
    print(f"test...summary")
    print(f"explained_variance_score ={explained_variance_score(test_result,test_target)}")
    print(f"mean_squared_error ={mean_squared_error(test_result,test_target)}")
    print(f"mean_absolute_percentage_error ={mean_absolute_percentage_error(test_result,test_target)}")
print_summary()

test...summary
explained_variance_score =0.8925106348587433
mean_squared_error =0.0017589186737005048
mean_absolute_percentage_error =0.14022354139064405


In [9]:
importance_df = pd.DataFrame({
    'feature': combine_df.columns[1:],
    'importance': base_model.feature_importances_
}).sort_values('importance', ascending=False)

In [10]:
importance_df

Unnamed: 0,feature,importance
0,t_1,0.83336
11,hour_mean,0.04316
1,t_2,0.022426
6,d_1,0.019715
7,d_2,0.008819
21,T2M,0.0087
8,d_3,0.007933
2,t_3,0.006703
5,t_6,0.005609
3,t_4,0.005303


In [11]:
def get_score(max_estimator):
    acc_list=[]
    model = RandomForestRegressor(random_state=42, n_jobs=-1, n_estimators=max_estimator)
    model.fit(X_train, train_target.values.ravel())
    for i in range(max_estimator):
        acc_list.append((f"estimator={i}",
                         f"train_acc={model.score(X_train, train_target)}",
                         f"test_acc={model.score(X_test, test_target)}"))
    return acc_list

In [12]:
get_score(200)

NameError: name 'X_train' is not defined

In [None]:
def get_score(max_depth):
    acc_list=[]
    for i in range(2,max_depth,5):
        model=RandomForestRegressor(random_state=42, n_jobs=-1,max_depth=i)
        model.fit(X_train, train_target.values.ravel())
        acc_list.append((f"max_depth={i}",
                             f"train_acc={model.score(X_train, train_target)}",
                             f"test_acc={model.score(X_test, test_target)}"))
    return acc_list

In [None]:
get_score(50)

In [None]:
def get_score(max_leaf_node):
    acc_list=[]
    for i in range(4,max_leaf_node):
        model=RandomForestRegressor(random_state=42, n_jobs=-1,max_depth=25,max_leaf_nodes=2**i)
        model.fit(X_train, train_target.values.ravel())
        acc_list.append((f"max_leaf_nodes={2**i}",
                             f"train_acc={model.score(X_train, train_target)}",
                             f"test_acc={model.score(X_test, test_target)}"))
    return acc_list

In [None]:
get_score(12)

In [None]:
model=RandomForestRegressor(random_state=42,
                            n_jobs=-1,
                            max_depth=16,
                            max_leaf_nodes=2**5)
model.fit(X_train, train_target.values.ravel())
model.score(X_train, train_target),model.score(X_test, test_target)

In [None]:
model=RandomForestRegressor(random_state=42,
                            n_jobs=-1,
                            max_depth=25,
                            max_leaf_nodes=2**10,
                           max_features='log2')
model.fit(X_train, train_target.values.ravel())
model.score(X_train, train_target),model.score(X_test, test_target)

In [None]:
model=RandomForestRegressor(random_state=42,
                            n_jobs=-1,
                            max_depth=25,
                            max_leaf_nodes=2**8,
                           max_features=10,
                           max_samples=0.5,
                           min_impurity_decrease=1e-6)
model.fit(X_train, train_target.values.ravel())
model.score(X_train, train_target),model.score(X_test, test_target)

In [None]:
model=RandomForestRegressor(random_state=42,
                            n_jobs=-1,
                            max_depth=27,
                            max_leaf_nodes=2**8,
                           max_features=7)
model.fit(X_train, train_target.values.ravel())
model.score(X_train, train_target),model.score(X_test, test_target)

## Adding some features... 

In [None]:
def normalizing_data(raw_df,data):
    numerical_cols=['day-1','day-2','day-3','day-4','day-5','day-6','week-1','week-2','week-3','week-4']
    scaler = MinMaxScaler().fit(raw_df[numerical_cols])
    normlized_data[numerical_cols] = scaler.transform(data[numerical_cols]) 
    return normlized_data

In [None]:
def encoding_data(raw_df,data):
    categorical_cols=['weekend']
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore').fit(raw_df[categorical_cols])
    encoded_cols = list(encoder.get_feature_names_out(categorical_cols))
    encoded_data[encoded_cols] = encoder.transform(data[categorical_cols])
    return encoded_data,encoded_cols


In [None]:
def split_data(raw_df):
    
    #creating diff columns...and splitting data into.. input and target
    input_cols=['day-1','day-2','day-3','day-4','day-5','week-1','week-2','week-3','week-4','weekend']
    target_cols=['Global_active_power']
    
    train_input=train_data[input_cols].copy()
    train_target=train_data[target_cols].copy()
    test_input=test_data[input_cols].copy()
    test_target=test_data[target_cols].copy()
    return train_input,train_target,test_input,test_target

In [None]:
adding_day_feature(1,date_index)

In [None]:
X_train = train_input[numerical_cols+encoded_cols]
X_test = test_input[numerical_cols+encoded_cols]

### Adding Some Features... 

# pca

In [None]:
raw_df.head(10)
xt=raw_df.drop(["weekend","year","day_of_week","month","hour"],axis=1)
xt

In [None]:
xt.to_csv("mydata.csv")

In [None]:
pca = PCA(0.95)
data=pca.fit_transform(xt)

In [None]:
pca.explained_variance_ratio_

# printing....

In [None]:
temp_data=new_data[new_data.year>2006] 

In [None]:
temp_data.columns

In [None]:
import seaborn as sns

In [None]:
plt.figure(figsize=(30,15))
weekend=temp_data.weekend.astype(float)
ax = sns.boxplot(x=temp_data.hour, y=temp_data.Global_active_power,hue=weekend,data=temp_data)

In [None]:
plt.figure(figsize=(30,15))
weekday=temp_data.day_of_week
ax = sns.boxplot(x=temp_data.hour, y=temp_data.Global_active_power,hue=weekday,data=temp_data)

In [None]:
plt.figure(figsize=(25,15))
month_name=temp_data.month
ax = sns.boxplot(x=temp_data.hour, y=temp_data.Global_active_power,hue=month_name,data=temp_data)

In [None]:
plt.scatter(temp_data.hour,temp_data.Global_active_power)

In [None]:
plt.scatter(temp_data.weekend,temp_data.Global_active_power)

In [None]:
plt.scatter(temp_data.month,temp_data.Global_active_power)

In [None]:
plt.figure(figsize=(30,15))
ax = sns.boxplot(x=temp_data.month, y=temp_data.Global_active_power,data=temp_data)

In [None]:
plt.scatter(temp_data.day_of_week,temp_data.Global_active_power)

In [None]:
plt.figure(figsize=(30,15))

ax = sns.boxplot(x=temp_data.day_of_week, y=temp_data.Global_active_power,data=temp_data)

# Grid Search CV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = {
    'max_depth': [8,10],
    'min_samples_split': [2,4],
    'n_estimators': [int(x) for x in range(100, 500, 100)]}

In [None]:
model=RandomForestRegressor()
grid_search = GridSearchCV(estimator = model, param_grid = param_grid, 
                          cv = 10, n_jobs = -1, verbose = 2)
grid_search.fit(X_train,train_target)

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_score_

In [None]:
import pandas as pd
 
# The read_csv is reading the csv file into Dataframe
 
df = pd.read_csv(r'C:\Users\SR-19\Desktop\Env_hourly_data.csv')
 
# then to_excel method converting the .csv file to .xlsx file.
 
df.to_excel(r'C:\Users\SR-19\Desktop\Env_hourly_data.xlsx')
 

In [None]:
np.mean((5,10,15))