In [16]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Read the daily bike-share records from the project's data folder
BIKES_CSV_PATH = "../data/bikes.csv"
bike_data = pd.read_csv(BIKES_CSV_PATH)

In [3]:
# Preview the first five rows to sanity-check the load
bike_data.head(n=5)

Unnamed: 0,date,real_temperature,feel_temperature,humidity,wind_speed,weather_code,is_holiday,is_weekend,season,count
0,04/01/2015,2.75,0.0,93.0,7.5,broken_clouds,False,True,winter,9234
1,05/01/2015,9.0,7.25,81.5,8.854167,broken_clouds,False,False,winter,20372
2,06/01/2015,8.0,5.75,79.75,16.0,clear,False,False,winter,20613
3,07/01/2015,9.0,5.5,81.0,19.76087,scattered_clouds,False,False,winter,21064
4,08/01/2015,9.0,6.75,79.5,20.479167,rain,False,False,winter,15601


In [4]:
# Print per-column dtypes and non-null counts; columns whose non-null count is
# below the row count contain missing values that need handling.
bike_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730 entries, 0 to 729
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   date              730 non-null    object 
 1   real_temperature  730 non-null    float64
 2   feel_temperature  657 non-null    float64
 3   humidity          730 non-null    float64
 4   wind_speed        730 non-null    float64
 5   weather_code      679 non-null    object 
 6   is_holiday        730 non-null    bool   
 7   is_weekend        730 non-null    bool   
 8   season            694 non-null    object 
 9   count             730 non-null    int64  
dtypes: bool(2), float64(4), int64(1), object(3)
memory usage: 47.2+ KB


In [6]:
# Impute gaps in feel_temperature with the column median.
# Series.median() skips NaN by default, so it is computed from the observed
# readings only.
feel_temp = bike_data['feel_temperature']
median_feel_temp = feel_temp.median()

# Write the imputed column back onto the frame
bike_data['feel_temperature'] = feel_temp.fillna(value=median_feel_temp)

In [8]:
# Fill missing values in the season column by carrying the previous row's
# season forward (padding).
# `ffill()` is used instead of `interpolate(method='pad')`: `season` is an
# object-dtype (string) column, and pandas has deprecated both interpolating
# object columns and the 'pad' interpolation method in favour of `ffill()`.
# NOTE: a missing value in the very first row has nothing to copy from and
# would stay missing.
bike_data['season'] = bike_data['season'].ffill()

In [12]:
# List the distinct weather codes (NaN is reported as its own value)
pd.unique(bike_data['weather_code'])

array(['broken_clouds', 'clear', 'scattered_clouds', 'rain', nan,
       'cloudy', 'snowfall'], dtype=object)

In [14]:
# Look at the first ten rows after the imputation steps
bike_data.head(n=10)

Unnamed: 0,date,real_temperature,feel_temperature,humidity,wind_speed,weather_code,is_holiday,is_weekend,season,count
0,04/01/2015,2.75,0.0,93.0,7.5,broken_clouds,False,True,winter,9234
1,05/01/2015,9.0,7.25,81.5,8.854167,broken_clouds,False,False,winter,20372
2,06/01/2015,8.0,5.75,79.75,16.0,clear,False,False,winter,20613
3,07/01/2015,9.0,5.5,81.0,19.76087,scattered_clouds,False,False,winter,21064
4,08/01/2015,9.0,6.75,79.5,20.479167,rain,False,False,winter,15601
5,09/01/2015,13.0,13.0,75.75,32.916667,broken_clouds,False,False,winter,22104
6,10/01/2015,11.5,11.5,66.75,34.3125,broken_clouds,False,True,winter,14709
7,11/01/2015,7.75,3.75,69.0,26.5625,clear,False,True,winter,14575
8,12/01/2015,11.5,11.5,76.5,28.1875,rain,False,False,winter,17199
9,13/01/2015,9.0,6.0,74.75,21.208333,rain,False,False,winter,24697


In [15]:
# Show only the object-dtype columns (candidates for parsing or encoding)
bike_data.select_dtypes(include=['object'])

Unnamed: 0,date,weather_code,season
0,04/01/2015,broken_clouds,winter
1,05/01/2015,broken_clouds,winter
2,06/01/2015,clear,winter
3,07/01/2015,scattered_clouds,winter
4,08/01/2015,rain,winter
...,...,...,...
725,30/12/2016,cloudy,winter
726,31/12/2016,cloudy,winter
727,01/01/2017,rain,winter
728,02/01/2017,clear,winter


In [43]:
# Fetch the season column as its own single-column DataFrame so the encoding
# can be inspected next to the raw labels without modifying bike_data.
season_data = bike_data[['season']].copy()

# Create a one-hot encoder object
one_hot_encoder = OneHotEncoder()

# One-hot encode the season column; .toarray() converts the encoder's sparse
# result into a dense NumPy array that pandas can wrap directly.
ohe_array = one_hot_encoder.fit_transform(season_data[['season']]).toarray()

# Get the generated column names (one "season_<label>" column per category)
col_names = one_hot_encoder.get_feature_names_out(['season'])

# Convert the array into a dataframe.
# The index is copied from season_data on purpose: pd.concat(axis=1) aligns
# rows by index label, so a fresh 0..n-1 index would silently misalign (and
# introduce NaN rows) whenever bike_data does not have a default RangeIndex.
ohe_season_data = pd.DataFrame(
    data=ohe_array,
    columns=col_names,
    index=season_data.index,
)

# Concatenate unencoded and OHE-data to compare them side by side
season_data = pd.concat([season_data, ohe_season_data], axis=1)

In [44]:
# x = pd.DataFrame([1,2,3,4], columns=['s'])
# y = [x['s']]

In [45]:
# type(y)

In [46]:
# Spot-check seven random rows of the raw label next to its one-hot columns.
# A fixed random_state makes the same rows appear on every Restart & Run All,
# so the displayed sample is reproducible.
season_data.sample(n=7, random_state=42)

Unnamed: 0,season,season_autumm,season_spring,season_summer,season_winter
348,winter,0.0,0.0,0.0,1.0
661,autumm,1.0,0.0,0.0,0.0
139,spring,0.0,1.0,0.0,0.0
419,winter,0.0,0.0,0.0,1.0
265,autumm,1.0,0.0,0.0,0.0
22,winter,0.0,0.0,0.0,1.0
380,winter,0.0,0.0,0.0,1.0


In [27]:
# season_data['season'].unique()

In [26]:
# ohe_array

In [20]:
# Display season_data (pandas elides the middle rows of long frames).
# NOTE(review): the stored output below shows only the `season` column, i.e. it
# was produced before the one-hot encoding cell ran; re-run top-to-bottom to
# refresh it.
season_data

Unnamed: 0,season
0,winter
1,winter
2,winter
3,winter
4,winter
...,...
725,winter
726,winter
727,winter
728,winter
