In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline  
sns.set()

In [2]:
data = pd.read_csv('household_power_consumption.txt', sep=';',parse_dates={'Date_Time' : ['Date', 'Time']},
                   infer_datetime_format=True,
                   low_memory=False)
data

Unnamed: 0,Date_Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,2006-12-16 17:24:00,4.216,0.418,234.840,18.400,0.000,1.000,17.0
1,2006-12-16 17:25:00,5.360,0.436,233.630,23.000,0.000,1.000,16.0
2,2006-12-16 17:26:00,5.374,0.498,233.290,23.000,0.000,2.000,17.0
3,2006-12-16 17:27:00,5.388,0.502,233.740,23.000,0.000,1.000,17.0
4,2006-12-16 17:28:00,3.666,0.528,235.680,15.800,0.000,1.000,17.0
...,...,...,...,...,...,...,...,...
2075254,2010-11-26 20:58:00,0.946,0.000,240.430,4.000,0.000,0.000,0.0
2075255,2010-11-26 20:59:00,0.944,0.000,240.000,4.000,0.000,0.000,0.0
2075256,2010-11-26 21:00:00,0.938,0.000,239.820,3.800,0.000,0.000,0.0
2075257,2010-11-26 21:01:00,0.934,0.000,239.700,3.800,0.000,0.000,0.0


In [3]:
data.columns

Index(['Date_Time', 'Global_active_power', 'Global_reactive_power', 'Voltage',
       'Global_intensity', 'Sub_metering_1', 'Sub_metering_2',
       'Sub_metering_3'],
      dtype='object')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075259 entries, 0 to 2075258
Data columns (total 8 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   Date_Time              datetime64[ns]
 1   Global_active_power    object        
 2   Global_reactive_power  object        
 3   Voltage                object        
 4   Global_intensity       object        
 5   Sub_metering_1         object        
 6   Sub_metering_2         object        
 7   Sub_metering_3         float64       
dtypes: datetime64[ns](1), float64(1), object(6)
memory usage: 126.7+ MB


In [5]:
data.isna().sum()

Date_Time                    0
Global_active_power          0
Global_reactive_power        0
Voltage                      0
Global_intensity             0
Sub_metering_1               0
Sub_metering_2               0
Sub_metering_3           25979
dtype: int64

In [6]:
sample_data = data.sample(n=15000)
sample_data

Unnamed: 0,Date_Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
501201,2007-11-29 18:45:00,1.548,0.056,239.070,6.400,0.000,0.000,0.0
1900125,2010-07-28 06:09:00,0.332,0.214,241.740,1.600,0.000,0.000,1.0
977604,2008-10-25 14:48:00,1.294,0.062,242.550,5.200,0.000,0.000,18.0
1100607,2009-01-19 00:51:00,0.368,0.110,246.270,1.600,0.000,1.000,0.0
261161,2007-06-16 02:05:00,0.158,0.000,238.350,0.600,0.000,0.000,0.0
...,...,...,...,...,...,...,...,...
1075540,2009-01-01 15:04:00,0.330,0.088,248.540,1.400,0.000,0.000,0.0
120944,2007-03-10 17:08:00,0.952,0.350,240.420,4.200,0.000,0.000,0.0
1932498,2010-08-19 17:42:00,?,?,?,?,?,?,
380697,2007-09-07 02:21:00,0.110,0.000,240.100,0.600,0.000,0.000,0.0


In [7]:
sample_data.isna().sum()

Date_Time                  0
Global_active_power        0
Global_reactive_power      0
Voltage                    0
Global_intensity           0
Sub_metering_1             0
Sub_metering_2             0
Sub_metering_3           207
dtype: int64

In [8]:
sample_data.dropna(inplace=True)

In [9]:
sample_data.isna().sum().sum()

0

In [10]:
sample_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14793 entries, 501201 to 1979261
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Date_Time              14793 non-null  datetime64[ns]
 1   Global_active_power    14793 non-null  object        
 2   Global_reactive_power  14793 non-null  object        
 3   Voltage                14793 non-null  object        
 4   Global_intensity       14793 non-null  object        
 5   Sub_metering_1         14793 non-null  object        
 6   Sub_metering_2         14793 non-null  object        
 7   Sub_metering_3         14793 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(6)
memory usage: 1.0+ MB


In [11]:
sample_data['Sub_metering_1'] = pd.to_numeric(sample_data['Sub_metering_1'],errors='coerce')
sample_data['Sub_metering_2'] = pd.to_numeric(sample_data['Sub_metering_2'],errors='coerce')

In [12]:
sample_data['year'] = sample_data['Date_Time'].dt.year
sample_data['month'] = sample_data['Date_Time'].dt.month
sample_data['day'] = sample_data['Date_Time'].dt.day
sample_data['hour'] = sample_data['Date_Time'].dt.hour
sample_data['minute'] = sample_data['Date_Time'].dt.minute

In [13]:
sample_data

Unnamed: 0,Date_Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,year,month,day,hour,minute
501201,2007-11-29 18:45:00,1.548,0.056,239.070,6.400,0.0,0.0,0.0,2007,11,29,18,45
1900125,2010-07-28 06:09:00,0.332,0.214,241.740,1.600,0.0,0.0,1.0,2010,7,28,6,9
977604,2008-10-25 14:48:00,1.294,0.062,242.550,5.200,0.0,0.0,18.0,2008,10,25,14,48
1100607,2009-01-19 00:51:00,0.368,0.110,246.270,1.600,0.0,1.0,0.0,2009,1,19,0,51
261161,2007-06-16 02:05:00,0.158,0.000,238.350,0.600,0.0,0.0,0.0,2007,6,16,2,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
980247,2008-10-27 10:51:00,0.208,0.052,242.470,0.800,0.0,0.0,0.0,2008,10,27,10,51
1075540,2009-01-01 15:04:00,0.330,0.088,248.540,1.400,0.0,0.0,0.0,2009,1,1,15,4
120944,2007-03-10 17:08:00,0.952,0.350,240.420,4.200,0.0,0.0,0.0,2007,3,10,17,8
380697,2007-09-07 02:21:00,0.110,0.000,240.100,0.600,0.0,0.0,0.0,2007,9,7,2,21


In [14]:
sample_data["total_energy"] = sample_data['Sub_metering_1']+sample_data['Sub_metering_2']+sample_data['Sub_metering_3']

In [15]:
sample_data.head()

Unnamed: 0,Date_Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,year,month,day,hour,minute,total_energy
501201,2007-11-29 18:45:00,1.548,0.056,239.07,6.4,0.0,0.0,0.0,2007,11,29,18,45,0.0
1900125,2010-07-28 06:09:00,0.332,0.214,241.74,1.6,0.0,0.0,1.0,2010,7,28,6,9,1.0
977604,2008-10-25 14:48:00,1.294,0.062,242.55,5.2,0.0,0.0,18.0,2008,10,25,14,48,18.0
1100607,2009-01-19 00:51:00,0.368,0.11,246.27,1.6,0.0,1.0,0.0,2009,1,19,0,51,1.0
261161,2007-06-16 02:05:00,0.158,0.0,238.35,0.6,0.0,0.0,0.0,2007,6,16,2,5,0.0


In [16]:
sample_data.drop(['Sub_metering_1', 'Sub_metering_2','Sub_metering_3'], axis=1,inplace=True)

In [17]:
sample_data.drop(['Date_Time'], axis=1,inplace=True)

In [18]:
sample_data.head()

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,year,month,day,hour,minute,total_energy
501201,1.548,0.056,239.07,6.4,2007,11,29,18,45,0.0
1900125,0.332,0.214,241.74,1.6,2010,7,28,6,9,1.0
977604,1.294,0.062,242.55,5.2,2008,10,25,14,48,18.0
1100607,0.368,0.11,246.27,1.6,2009,1,19,0,51,1.0
261161,0.158,0.0,238.35,0.6,2007,6,16,2,5,0.0


In [19]:
X = sample_data.iloc[:,:-1] # taking all the features in to X variable except the last feature that is total_energy
y = sample_data.iloc[:,-1] # taking all the data point of the total_energy column

In [20]:
X

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,year,month,day,hour,minute
501201,1.548,0.056,239.070,6.400,2007,11,29,18,45
1900125,0.332,0.214,241.740,1.600,2010,7,28,6,9
977604,1.294,0.062,242.550,5.200,2008,10,25,14,48
1100607,0.368,0.110,246.270,1.600,2009,1,19,0,51
261161,0.158,0.000,238.350,0.600,2007,6,16,2,5
...,...,...,...,...,...,...,...,...,...
980247,0.208,0.052,242.470,0.800,2008,10,27,10,51
1075540,0.330,0.088,248.540,1.400,2009,1,1,15,4
120944,0.952,0.350,240.420,4.200,2007,3,10,17,8
380697,0.110,0.000,240.100,0.600,2007,9,7,2,21


In [21]:
y

501201      0.0
1900125     1.0
977604     18.0
1100607     1.0
261161      0.0
           ... 
980247      0.0
1075540     0.0
120944      0.0
380697      0.0
1979261     1.0
Name: total_energy, Length: 14793, dtype: float64

### Train Test Split

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Standard Scalling 

In [26]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [27]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Bagging Regressor

In [28]:
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor
bag_regr = BaggingRegressor(base_estimator=SVR(),n_estimators=10, random_state=0)

In [29]:
bag_regr.fit(X_train,y_train)

BaggingRegressor(base_estimator=SVR(), random_state=0)

In [30]:
bag_regr_predict = bag_regr.predict(X_test)

In [31]:
bag_regr_predict

array([10.51248835, 23.13512895, 15.89158993, ..., 20.30156649,
        2.33189905, -0.58409162])

In [32]:
from sklearn.metrics import r2_score
score=r2_score(y_test,bag_regr_predict)
print(score)

0.6828399917912038


### Extra Tree Regressor

In [35]:
from sklearn.ensemble import ExtraTreesRegressor
extra_tree_reg = ExtraTreesRegressor(n_estimators=100, random_state=0)

In [36]:
extra_tree_reg.fit(X_train,y_train)

ExtraTreesRegressor(random_state=0)

In [37]:
Extra_tree_predict=extra_tree_reg.predict(X_test)

In [38]:
Extra_tree_predict

array([ 5.01, 19.01, 18.79, ..., 18.52,  0.7 ,  0.  ])

In [39]:
from sklearn.metrics import r2_score
score=r2_score(y_test,Extra_tree_predict)
print(score)

0.8420969275325114


In [40]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.neighbors import KNeighborsRegressor
r1 = LinearRegression()
r2 = RandomForestRegressor(n_estimators=10, random_state=1)
r3 = KNeighborsRegressor()

In [41]:
voting_regr = VotingRegressor([('lr', r1), ('rf', r2), ('r3', r3)])

In [42]:
voting_regr.fit(X_train,y_train)

VotingRegressor(estimators=[('lr', LinearRegression()),
                            ('rf',
                             RandomForestRegressor(n_estimators=10,
                                                   random_state=1)),
                            ('r3', KNeighborsRegressor())])

In [43]:
voting_regr_predict = voting_regr.predict(X_test)

In [44]:
voting_regr_predict

array([10.5021504 , 18.86179041, 16.75161751, ..., 18.6570156 ,
        0.95291115, -0.88596937])

In [45]:
from sklearn.metrics import r2_score
score=r2_score(y_test,voting_regr_predict)
print(score)

0.8076441040948558


### Random Forest Regressor

In [46]:
from sklearn.ensemble import RandomForestRegressor
rand_forest_regr = RandomForestRegressor(max_depth=2, random_state=0)

In [47]:
rand_forest_regr.fit(X_train,y_train)

RandomForestRegressor(max_depth=2, random_state=0)

In [48]:
rand_forest_regr_predict = rand_forest_regr.predict(X_test)

In [49]:
rand_forest_regr_predict

array([16.92573181, 16.92573181, 16.92573181, ..., 16.92573181,
        0.80369973,  0.80369973])

In [50]:
from sklearn.metrics import r2_score
score=r2_score(y_test,voting_regr_predict)
print(score)

0.8076441040948558
