## Introduction

**Loading dataset**

In [7]:
# Pandas is used for data manipulation
import pandas as pd

# Load the CSV into a dataframe and discard its first column
# (presumably the row index written out when the file was saved -- confirm),
# then display the first 5 rows
features = pd.read_csv('input/electricity_consumption.csv').iloc[:, 1:]

features.head(5)

Unnamed: 0,stat.socio,dwelling.size,dwelling.tenure2,region.f,state.f,municipality.pop,ac.acquire,total_niv,material.wall,material.roof,material.floor,antigua,cuart_dorm,fridge.adopt,washing.adopt,iron.adopt,tv.adopt,celphone.adopt,fan.adopt,cons.month.kwh.hig1
0,mid-high,<201m2,Owned,Temperate,Aguascalientes,>100k,No,2,brick,concrete,wood/mosaic,40,5,Yes,Yes,Yes,Yes,Yes,Yes,129.154069
1,mid-low,151-200m2,Owned,Temperate,Aguascalientes,>100k,No,1,adobe,concrete,cement,50,3,No,Yes,No,Yes,No,No,52.86092
2,mid-high,151-200m2,Borrowed,Temperate,Aguascalientes,>100k,No,2,brick,concrete,wood/mosaic,30,2,Yes,Yes,No,Yes,Yes,Yes,142.31966
3,mid-high,101-150m2,Owned,Temperate,Aguascalientes,>100k,No,1,brick,concrete,wood/mosaic,30,2,Yes,Yes,Yes,Yes,Yes,Yes,108.572344
4,mid-high,76-100m2,Owned,Temperate,Aguascalientes,>100k,No,1,brick,concrete,wood/mosaic,45,2,Yes,Yes,Yes,Yes,Yes,Yes,75.910042


## Identify Anomalies

In [8]:
# Dimensions of the loaded frame as (rows, columns)
n_rows, n_cols = features.shape
print('The shape of our features is:', (n_rows, n_cols))

The shape of our features is: (8504, 20)


In [9]:
# Descriptive statistics (count, mean, std, min, quartiles, max).
# describe() is called with its defaults, so only the numeric columns are
# summarised; the categorical columns are left out of the table.
features.describe()

Unnamed: 0,total_niv,antigua,cuart_dorm,cons.month.kwh.hig1
count,8504.0,8504.0,8504.0,8504.0
mean,1.340898,36.322201,2.139581,171.139465
std,0.63501,32.310157,0.913739,176.127042
min,1.0,0.0,1.0,13.669182
25%,1.0,13.0,1.0,89.468881
50%,1.0,25.0,2.0,139.126686
75%,2.0,45.0,3.0,193.555534
max,12.0,99.0,7.0,3686.849791


## Data Preparation

In [10]:
# One-hot encode categorical features: every non-numeric column is replaced
# by one indicator column per category, numeric columns pass through as-is.
# dtype=int pins the indicators to 0/1 integers. pandas >= 2.0 defaults to
# bool, and a frame mixing integer and bool columns turns into an object
# array when it is later converted with np.array().
features = pd.get_dummies(features, dtype=int)
features.head(5)

Unnamed: 0,total_niv,antigua,cuart_dorm,cons.month.kwh.hig1,stat.socio_high,stat.socio_low,stat.socio_mid-high,stat.socio_mid-low,dwelling.size_101-150m2,dwelling.size_151-200m2,...,washing.adopt_No,washing.adopt_Yes,iron.adopt_No,iron.adopt_Yes,tv.adopt_No,tv.adopt_Yes,celphone.adopt_No,celphone.adopt_Yes,fan.adopt_No,fan.adopt_Yes
0,2,40,5,129.154069,0,0,1,0,0,0,...,0,1,0,1,0,1,0,1,0,1
1,1,50,3,52.86092,0,0,0,1,0,1,...,0,1,1,0,0,1,1,0,1,0
2,2,30,2,142.31966,0,0,1,0,0,1,...,0,1,1,0,0,1,0,1,0,1
3,1,30,2,108.572344,0,0,1,0,1,0,...,0,1,0,1,0,1,0,1,0,1
4,1,45,2,75.910042,0,0,1,0,0,0,...,0,1,0,1,0,1,0,1,0,1


In [11]:
# Dimensions of the frame now that the categorical columns are expanded
encoded_shape = features.shape
print('Shape of features after one-hot encoding:', encoded_shape)

Shape of features after one-hot encoding: (8504, 93)


## Separate Features and Labels, and Convert Data to Arrays

In [13]:
# NumPy provides the array type the data is converted to
import numpy as np

# Name of the column holding the values we want to predict
label_column = 'cons.month.kwh.hig1'

# Pull the target values out as an array
labels = np.array(features[label_column])

# Keep every other column as a predictor
features = features.drop(columns=[label_column])

# Remember the column names for later use; the array conversion below
# discards them
feature_list = features.columns.tolist()

# Convert the predictor frame to a numpy array
features = np.array(features)

## Training and Testing Sets

In [14]:
# Using scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state makes the
# split reproducible from run to run
(train_features, test_features,
 train_labels, test_labels) = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42,
)

In [15]:
# Report the dimensions of each array produced by the split
for description, array in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
):
    print(description, array.shape)

Training Features Shape: (6378, 92)
Training Labels Shape: (6378,)
Testing Features Shape: (2126, 92)
Testing Labels Shape: (2126,)


In [17]:
# Display the held-out feature array returned by train_test_split
test_features

array([[ 2, 12,  2, ...,  1,  1,  0],
       [ 2,  5,  2, ...,  1,  0,  1],
       [ 2, 16,  2, ...,  1,  1,  0],
       ...,
       [ 2, 99,  2, ...,  1,  1,  0],
       [ 4, 99,  2, ...,  1,  1,  0],
       [ 1, 12,  2, ...,  1,  1,  0]])

In [18]:
# Display the saved feature column names (captured before the frame was
# converted to a numpy array)
feature_list

['total_niv',
 'antigua',
 'cuart_dorm',
 'stat.socio_high',
 'stat.socio_low',
 'stat.socio_mid-high',
 'stat.socio_mid-low',
 'dwelling.size_101-150m2',
 'dwelling.size_151-200m2',
 'dwelling.size_31-55m2',
 'dwelling.size_56-75m2',
 'dwelling.size_76-100m2',
 'dwelling.size_<201m2',
 'dwelling.size_>30m2',
 "dwelling.size_Don't know",
 'dwelling.tenure2_Borrowed',
 'dwelling.tenure2_Other',
 'dwelling.tenure2_Owned',
 'dwelling.tenure2_Rented',
 'region.f_Extremely hot',
 'region.f_Temperate',
 'region.f_Tropical',
 'state.f_Aguascalientes',
 'state.f_Baja California',
 'state.f_Baja California Sur',
 'state.f_Campeche',
 'state.f_Chiapas',
 'state.f_Chihuahua',
 'state.f_Coahuila',
 'state.f_Colima',
 'state.f_Durango',
 'state.f_Guanajuato',
 'state.f_Guerrero',
 'state.f_Hidalgo',
 'state.f_Jalisco',
 'state.f_Mexico',
 'state.f_Mexico City',
 'state.f_Michoacan',
 'state.f_Morelos',
 'state.f_Nayarit',
 'state.f_NuevoLeon',
 'state.f_Oaxaca',
 'state.f_Puebla',
 'state.f_Quereta