# Data Preprocessing

In [101]:
from utilities.utils import Utils
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.preprocessing import MinMaxScaler

warnings.simplefilter(action='ignore', category=FutureWarning)

In [102]:
data_path = Utils.load_config("ANALYZED_DATASET_PATH")

df = pd.read_csv(data_path, index_col='instant', parse_dates=True)
df.head()

Unnamed: 0_level_0,date,season,year,month,hour,holiday,weekday,workingday,weather,temp,humidity,windspeed,rentals
instant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,2011-01-01,1,0,1,0,0,6,0,1,10.5,81.0,0.0,16
2,2011-01-01,1,0,1,1,0,6,0,1,9.7,80.0,0.0,40
3,2011-01-01,1,0,1,2,0,6,0,1,9.7,80.0,0.0,32
4,2011-01-01,1,0,1,3,0,6,0,1,10.5,75.0,0.0,13
5,2011-01-01,1,0,1,4,0,6,0,1,10.5,75.0,0.0,1


In [103]:
df.shape

(17379, 13)

## Outlier Removal

Outliers can distort statistical analyses and violate their assumptions. Outliers increase the variability in your data, which decreases statistical power. Consequently, excluding outliers can cause results to become
statistically significant. We perform it as it makes the data clean and increases our model’s accuracy.

In [104]:
# method that returns the indexes of outliers in the dataframe

def detect_outliers(df, ft):
    Q1 = df[ft].quantile(0.25)
    Q3 = df[ft].quantile(0.75)
    IQR = Q3 - Q1
    
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    ls = df.index[ (df[ft] < lower_bound) | (df[ft] > upper_bound) ]
    
    return ls

In [105]:
# list of indexes of observations to be removed 

index_list = []
for feature in ['temp', 'humidity' ,'rentals']:
    index_list.extend(detect_outliers(df, feature))
    
index_list

[1552,
 1553,
 1554,
 1555,
 1556,
 1557,
 1558,
 1559,
 1560,
 1561,
 1562,
 1563,
 1564,
 1565,
 1566,
 1567,
 1568,
 1569,
 1570,
 1571,
 1572,
 1573,
 5537,
 10359,
 10360,
 10383,
 10384,
 10398,
 10407,
 10408,
 10431,
 10432,
 10474,
 10475,
 10476,
 10477,
 10478,
 10479,
 10527,
 10528,
 10551,
 10552,
 10575,
 10576,
 10590,
 10599,
 10600,
 10614,
 10623,
 10624,
 10625,
 10686,
 10695,
 10696,
 10720,
 10734,
 10758,
 10767,
 10768,
 10862,
 10886,
 10887,
 10901,
 10910,
 10911,
 10934,
 10935,
 10958,
 10978,
 11031,
 11045,
 11054,
 11055,
 11101,
 11125,
 11126,
 11144,
 11145,
 11146,
 11147,
 11169,
 11197,
 11198,
 11212,
 11221,
 11222,
 11260,
 11269,
 11270,
 11284,
 11293,
 11294,
 11313,
 11389,
 11404,
 11413,
 11414,
 11438,
 11461,
 11462,
 11506,
 11533,
 11534,
 11557,
 11558,
 11572,
 11581,
 11582,
 11605,
 11606,
 11629,
 11701,
 11702,
 11725,
 11726,
 11749,
 11773,
 11774,
 11797,
 11817,
 11893,
 11894,
 11908,
 11917,
 11918,
 11941,
 11942,
 11956,

In [106]:
df_without_outliers = df.drop(index_list)
df_without_outliers.shape

(16852, 13)

## Feature Selection

In [107]:
predictors = ['season','year', 'month', 'hour', 'holiday', 'weekday', 'workingday', 'weather', 'temp', 'humidity', 'windspeed']

def ranking_attributes_contribution(dataset):
    X = dataset[predictors]  # independent columns
    y = dataset['rentals']  # target column

    best_features = SelectKBest(score_func=mutual_info_regression, k='all')
    fit = best_features.fit(X, y)
    df_scores = pd.DataFrame(fit.scores_)
    df_columns = pd.DataFrame(X.columns)

    # concat two dataframes for better visualization
    featureScores = pd.concat([df_columns, df_scores], axis=1)
    
    # naming the dataframe columns
    featureScores.columns = ['Specs', 'Score']  
    print(featureScores.nlargest(14, 'Score'))

In [108]:
ranking_attributes_contribution(df);

         Specs     Score
3         hour  0.626659
8         temp  0.141669
9     humidity  0.099196
2        month  0.065129
0       season  0.059341
1         year  0.053457
7      weather  0.019728
6   workingday  0.018352
5      weekday  0.016922
10   windspeed  0.016863
4      holiday  0.000000


## Delete low valuable features

As feature 'holiday' doesn't contribute in determining target attribute, we decide to drop it

In [109]:
df_without_outliers.drop('holiday',axis=1,inplace=True)

# Data Normalization

We perform data normalization to avoid features with higher range to impact more on training algorithm

In [110]:
feature_to_scale = ['temp', 'humidity', 'windspeed']
scaler = MinMaxScaler()
df_without_outliers[feature_to_scale] = scaler.fit_transform(df_without_outliers[feature_to_scale])

df_without_outliers

Unnamed: 0_level_0,date,season,year,month,hour,weekday,workingday,weather,temp,humidity,windspeed,rentals
instant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,2011-01-01,1,0,1,0,6,0,1,0.225888,0.793478,0.000000,16
2,2011-01-01,1,0,1,1,6,0,1,0.205584,0.782609,0.000000,40
3,2011-01-01,1,0,1,2,6,0,1,0.205584,0.782609,0.000000,32
4,2011-01-01,1,0,1,3,6,0,1,0.225888,0.728261,0.000000,13
5,2011-01-01,1,0,1,4,6,0,1,0.225888,0.728261,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...
17375,2012-12-31,1,1,12,19,1,1,2,0.246193,0.565217,0.193029,119
17376,2012-12-31,1,1,12,20,1,1,2,0.246193,0.565217,0.193029,89
17377,2012-12-31,1,1,12,21,1,1,1,0.246193,0.565217,0.193029,90
17378,2012-12-31,1,1,12,22,1,1,1,0.246193,0.521739,0.157764,61


## Transforming Categorical Values into Binary Attributes

In the vast majority of cases using dummy variables is more statistically significant than using a single numerical variable. A single numerical variable does not accurately encode the information represented by a categorical variable, because of the relationships between numerical values it implies.
This process is simple to do with Pandas by directly using the get_dummies() function.

In [111]:
dummy_fields = ['season', 'month', 'weather', 'hour', 'weekday']
for feature_to_dummy in dummy_fields:
    dummies = pd.get_dummies(df_without_outliers[feature_to_dummy], prefix=feature_to_dummy, drop_first=False)
    df_without_outliers = pd.concat([df_without_outliers, dummies], axis=1)

df_with_dummies = df_without_outliers.drop(dummy_fields, axis=1)
df_with_dummies

Unnamed: 0_level_0,date,year,workingday,temp,humidity,windspeed,rentals,season_1,season_2,season_3,...,hour_21,hour_22,hour_23,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6
instant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2011-01-01,0,0,0.225888,0.793478,0.000000,16,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2011-01-01,0,0,0.205584,0.782609,0.000000,40,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2011-01-01,0,0,0.205584,0.782609,0.000000,32,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,2011-01-01,0,0,0.225888,0.728261,0.000000,13,1,0,0,...,0,0,0,0,0,0,0,0,0,1
5,2011-01-01,0,0,0.225888,0.728261,0.000000,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17375,2012-12-31,1,1,0.246193,0.565217,0.193029,119,1,0,0,...,0,0,0,0,1,0,0,0,0,0
17376,2012-12-31,1,1,0.246193,0.565217,0.193029,89,1,0,0,...,0,0,0,0,1,0,0,0,0,0
17377,2012-12-31,1,1,0.246193,0.565217,0.193029,90,1,0,0,...,1,0,0,0,1,0,0,0,0,0
17378,2012-12-31,1,1,0.246193,0.521739,0.157764,61,1,0,0,...,0,1,0,0,1,0,0,0,0,0


In [112]:
df_with_dummies.columns

Index(['date', 'year', 'workingday', 'temp', 'humidity', 'windspeed',
       'rentals', 'season_1', 'season_2', 'season_3', 'season_4', 'month_1',
       'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7',
       'month_8', 'month_9', 'month_10', 'month_11', 'month_12', 'weather_1',
       'weather_2', 'weather_3', 'weather_4', 'hour_0', 'hour_1', 'hour_2',
       'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9',
       'hour_10', 'hour_11', 'hour_12', 'hour_13', 'hour_14', 'hour_15',
       'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21',
       'hour_22', 'hour_23', 'weekday_0', 'weekday_1', 'weekday_2',
       'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6'],
      dtype='object')

## Export processed dataset

In [113]:
df_with_dummies.to_csv('data/2_bike_rental_dataset_preprocessed.csv', header=True);