# Data Preprocessing

In [74]:
from utilities.utils import Utils
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from sklearn.feature_selection import SelectKBest, mutual_info_regression

warnings.simplefilter(action='ignore', category=FutureWarning)

In [75]:
# Look up where the analysed dataset lives, then read it into a DataFrame
data_path = Utils.load_config("ANALYZED_DATASET_PATH")
df = pd.read_csv(data_path)

# Preview the first five rows
df.head(5)

Unnamed: 0,instant,date,season,year,month,hour,holiday,weekday,workingday,weather,temp,humidity,windspeed,rentals
0,1,2011-01-01,1,0,1,0,0,6,0,1,10.5,81.0,0.0,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,9.7,80.0,0.0,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,9.7,80.0,0.0,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,10.5,75.0,0.0,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,10.5,75.0,0.0,1


In [76]:
# (rows, columns) of the frame as loaded, before any preprocessing
df.shape

(17379, 14)

## Outlier Removal

Outliers can distort statistical analyses and violate their assumptions. They also increase the variability in the data, which decreases statistical power; consequently, excluding outliers can cause results to become
statistically significant. We remove them here, using the interquartile-range (IQR) rule, to clean the data and improve our model's accuracy.

In [77]:
def detect_outliers(df, ft, factor=1.5):
    """Return the index labels of rows whose ``ft`` value is an IQR outlier.

    A value is flagged when it lies strictly below ``Q1 - factor * IQR`` or
    strictly above ``Q3 + factor * IQR``, where Q1 and Q3 are the 25th and
    75th percentiles of ``df[ft]`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect. It is not modified.
    ft : str
        Name of the column to test.
    factor : float, default 1.5
        Multiplier applied to the IQR to position the two fences. The
        default reproduces the conventional 1.5 * IQR rule; larger values
        flag fewer rows.

    Returns
    -------
    pandas.Index
        Labels of the flagged rows, in the frame's row order.
    """
    values = df[ft]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Both comparisons are strict, so values sitting exactly on a fence are kept.
    return df.index[(values < lower_bound) | (values > upper_bound)]

In [78]:
# Row labels flagged as outliers in any of the inspected columns
# (a row flagged in several columns appears once per column)
outlier_columns = ['temp', 'humidity', 'rentals']
index_list = [
    row_label
    for column in outlier_columns
    for row_label in detect_outliers(df, column)
]

index_list

[1551,
 1552,
 1553,
 1554,
 1555,
 1556,
 1557,
 1558,
 1559,
 1560,
 1561,
 1562,
 1563,
 1564,
 1565,
 1566,
 1567,
 1568,
 1569,
 1570,
 1571,
 1572,
 5536,
 10358,
 10359,
 10382,
 10383,
 10397,
 10406,
 10407,
 10430,
 10431,
 10473,
 10474,
 10475,
 10476,
 10477,
 10478,
 10526,
 10527,
 10550,
 10551,
 10574,
 10575,
 10589,
 10598,
 10599,
 10613,
 10622,
 10623,
 10624,
 10685,
 10694,
 10695,
 10719,
 10733,
 10757,
 10766,
 10767,
 10861,
 10885,
 10886,
 10900,
 10909,
 10910,
 10933,
 10934,
 10957,
 10977,
 11030,
 11044,
 11053,
 11054,
 11100,
 11124,
 11125,
 11143,
 11144,
 11145,
 11146,
 11168,
 11196,
 11197,
 11211,
 11220,
 11221,
 11259,
 11268,
 11269,
 11283,
 11292,
 11293,
 11312,
 11388,
 11403,
 11412,
 11413,
 11437,
 11460,
 11461,
 11505,
 11532,
 11533,
 11556,
 11557,
 11571,
 11580,
 11581,
 11604,
 11605,
 11628,
 11700,
 11701,
 11724,
 11725,
 11748,
 11772,
 11773,
 11796,
 11816,
 11892,
 11893,
 11907,
 11916,
 11917,
 11940,
 11941,
 11955,

In [79]:
# Discard every flagged row; `df` itself is left untouched
df_without_outliers = df.drop(index=index_list)
df_without_outliers.shape

(16852, 14)

## Feature Selection

In [80]:
predictors = [
    'season', 'year', 'month', 'hour', 'holiday', 'weekday',
    'workingday', 'weather', 'temp', 'humidity', 'windspeed',
]


def ranking_attributes_contribution(dataset, random_state=0):
    """Print the predictors ranked by mutual information with ``rentals``.

    Scores every column listed in the module-level ``predictors`` against the
    ``rentals`` column using ``mutual_info_regression`` and prints a table
    with columns ``Specs`` (feature name) and ``Score``, highest score first.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing all ``predictors`` columns and a ``rentals`` column.
    random_state : int, default 0
        Seed forwarded to ``mutual_info_regression``. The estimator is
        randomised, so without a fixed seed the printed scores change from
        one run to the next.

    Returns
    -------
    None
        The ranking is printed, not returned.
    """
    X = dataset[predictors]  # independent columns
    y = dataset['rentals']   # target column

    def seeded_mutual_info(features, target):
        # SelectKBest calls score_func(X, y); bind the seed here so the
        # scores are reproducible across kernel restarts.
        return mutual_info_regression(features, target, random_state=random_state)

    # k='all' keeps every feature: the selector is used only to obtain scores
    selector = SelectKBest(score_func=seeded_mutual_info, k='all').fit(X, y)

    featureScores = pd.DataFrame({'Specs': X.columns, 'Score': selector.scores_})

    # Rank every scored feature rather than relying on a hard-coded row count
    print(featureScores.nlargest(len(featureScores), 'Score'))

In [81]:
# NOTE(review): the ranking is computed on the unfiltered `df`, whereas the
# column drop further down is applied to `df_without_outliers` - confirm that
# scoring the data with outliers still included is intended.
ranking_attributes_contribution(df);

         Specs     Score
3         hour  0.625615
8         temp  0.142558
9     humidity  0.098703
2        month  0.072852
0       season  0.056711
1         year  0.048793
7      weather  0.019976
10   windspeed  0.017226
5      weekday  0.017064
6   workingday  0.013918
4      holiday  0.003232


## Drop low-value features

Because the 'holiday' feature contributes almost nothing to determining the target attribute (it has the lowest mutual-information score in the ranking above), we drop it.

In [82]:
# Reassign instead of dropping in place so the cell can be re-run safely:
# once 'holiday' is gone, errors='ignore' makes a repeat run a no-op
# instead of raising KeyError.
df_without_outliers = df_without_outliers.drop(columns='holiday', errors='ignore')

## Export processed dataset

In [83]:
# Persist the preprocessed frame (outliers removed, 'holiday' dropped).
# Exporting `df` here would write the untouched input back out and silently
# discard every preprocessing step in this notebook.
# NOTE(review): the row index is still written as an extra unnamed column;
# pass index=False if downstream readers do not rely on it.
df_without_outliers.to_csv('data/2_bike_rental_dataset_preprocessed.csv', header=True);