In [10]:
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns

import datetime as dt

from sklearn.preprocessing import StandardScaler


plt.rcParams["figure.figsize"] = (10,8)

In [11]:
df = pd.read_pickle("Datos/bikes_eda_limpio.pkl")
df.head()

Unnamed: 0,instant,dteday,year,month,weathersit,temp,windspeed,casual,registered,count_bikes,dia_semana,laborables
0,1,01-01-2018,0,1,2,14.110847,10.749882,331,654,985,0,0
1,2,02-01-2018,0,1,2,14.902598,16.652113,131,670,801,1,1
2,3,03-01-2018,0,1,1,8.050924,16.636703,120,1229,1349,2,1
3,4,04-01-2018,0,1,1,8.2,10.739832,108,1454,1562,3,1
4,5,05-01-2018,0,1,1,9.305237,12.5223,82,1518,1600,4,1


### <font color=violet>Estandarización.

In [12]:
numericas = df.select_dtypes(include=np.number)
numericas

Unnamed: 0,instant,temp,windspeed,casual,registered,count_bikes,dia_semana,laborables
0,1,14.110847,10.749882,331,654,985,0,0
1,2,14.902598,16.652113,131,670,801,1,1
2,3,8.050924,16.636703,120,1229,1349,2,1
3,4,8.200000,10.739832,108,1454,1562,3,1
4,5,9.305237,12.522300,82,1518,1600,4,1
...,...,...,...,...,...,...,...,...
725,726,10.420847,23.458911,247,1867,2114,4,1
726,727,10.386653,10.416557,644,2451,3095,5,0
727,728,10.386653,8.333661,159,1182,1341,6,0
728,729,10.489153,23.500518,364,1432,1796,0,1


In [13]:
# iniciamos el método para escalar

scaler = StandardScaler()

In [14]:
numericas.drop(["count_bikes", 'instant', 'dia_semana', 'laborables','casual','registered'], axis = 1, inplace = True)

numericas.head(2)


Unnamed: 0,temp,windspeed
0,14.110847,10.749882
1,14.902598,16.652113


In [15]:
# ajustamos nuestros datos.  

scaler.fit(numericas)

In [16]:
# transformamos los datos

X_escaladas = scaler.transform(numericas)

In [17]:
# por último convertiremos el array que nos devuelve en un dataframe. 

numericas_estandar = pd.DataFrame(X_escaladas, columns = numericas.columns)
numericas_estandar.head(2)

Unnamed: 0,temp,windspeed
0,-0.827613,-0.387833
1,-0.722069,0.748899


In [18]:
numericas_estandar.rename(columns={'temp':'temp_es', 'windspeed':'windspeed_es'}, inplace=True)

Chequeamos la media y la desviación estándar

In [19]:
for columna in numericas_estandar.columns:
    print(f'La media de la columna {columna} es:  {numericas_estandar[columna].mean()}')
    print(f'La desviación estándar de la columna {columna} es: {numericas_estandar[columna].std()}')
    print("-------------------------------------------------------------")

La media de la columna temp_es es:  -2.5307001547620006e-16
La desviación estándar de la columna temp_es es: 1.0006856360078737
-------------------------------------------------------------
La media de la columna windspeed_es es:  7.786769706960002e-17
La desviación estándar de la columna windspeed_es es: 1.000685636007874
-------------------------------------------------------------


In [20]:
numericas_estandar.head(2)

Unnamed: 0,temp_es,windspeed_es
0,-0.827613,-0.387833
1,-0.722069,0.748899


Unimos el df con las columnas estandarizadas al original.

In [21]:
df_estandarizado = pd.concat([df, numericas_estandar], axis =1)
df_estandarizado.head()

Unnamed: 0,instant,dteday,year,month,weathersit,temp,windspeed,casual,registered,count_bikes,dia_semana,laborables,temp_es,windspeed_es
0,1,01-01-2018,0,1,2,14.110847,10.749882,331,654,985,0,0,-0.827613,-0.387833
1,2,02-01-2018,0,1,2,14.902598,16.652113,131,670,801,1,1,-0.722069,0.748899
2,3,03-01-2018,0,1,1,8.050924,16.636703,120,1229,1349,2,1,-1.635432,0.745931
3,4,04-01-2018,0,1,1,8.2,10.739832,108,1454,1562,3,1,-1.61556,-0.389769
4,5,05-01-2018,0,1,1,9.305237,12.5223,82,1518,1600,4,1,-1.468226,-0.046477


Dropeamos las columnas originales y dejamos las estandarizadas.

In [22]:
df_estandarizado.drop(['temp', 'windspeed'], axis = 1, inplace = True)

In [23]:
df_estandarizado.head()

Unnamed: 0,instant,dteday,year,month,weathersit,casual,registered,count_bikes,dia_semana,laborables,temp_es,windspeed_es
0,1,01-01-2018,0,1,2,331,654,985,0,0,-0.827613,-0.387833
1,2,02-01-2018,0,1,2,131,670,801,1,1,-0.722069,0.748899
2,3,03-01-2018,0,1,1,120,1229,1349,2,1,-1.635432,0.745931
3,4,04-01-2018,0,1,1,108,1454,1562,3,1,-1.61556,-0.389769
4,5,05-01-2018,0,1,1,82,1518,1600,4,1,-1.468226,-0.046477


In [24]:
df_estandarizado = df_estandarizado.astype({'dia_semana':'category', 'laborables':'category'})

In [25]:
df_estandarizado.dtypes

instant            int64
dteday            object
year            category
month           category
weathersit      category
casual             int64
registered         int64
count_bikes        int64
dia_semana      category
laborables      category
temp_es          float64
windspeed_es     float64
dtype: object

Separamos en dos dataset para hacer un modelo para los usuarios casuales y otro para los registrados.

In [26]:
df_casual = df_estandarizado[["dteday","year","month","weathersit","temp_es","windspeed_es","casual", "dia_semana","laborables"]]
df_casual.head()

Unnamed: 0,dteday,year,month,weathersit,temp_es,windspeed_es,casual,dia_semana,laborables
0,01-01-2018,0,1,2,-0.827613,-0.387833,331,0,0
1,02-01-2018,0,1,2,-0.722069,0.748899,131,1,1
2,03-01-2018,0,1,1,-1.635432,0.745931,120,2,1
3,04-01-2018,0,1,1,-1.61556,-0.389769,108,3,1
4,05-01-2018,0,1,1,-1.468226,-0.046477,82,4,1


In [27]:
df_registered = df_estandarizado[["dteday","year","month","weathersit","temp_es","windspeed_es","registered", "dia_semana","laborables"]]
df_registered.head()

Unnamed: 0,dteday,year,month,weathersit,temp_es,windspeed_es,registered,dia_semana,laborables
0,01-01-2018,0,1,2,-0.827613,-0.387833,654,0,0
1,02-01-2018,0,1,2,-0.722069,0.748899,670,1,1
2,03-01-2018,0,1,1,-1.635432,0.745931,1229,2,1
3,04-01-2018,0,1,1,-1.61556,-0.389769,1454,3,1
4,05-01-2018,0,1,1,-1.468226,-0.046477,1518,4,1


### <font color=violet> Encoding df_casual

<font color=orange>dia_semana

In [28]:
map_dia_sem= {4:0, 2:1, 3:1, 5:1, 6:2, 1:3, 0:4 }


In [29]:
df_casual['dia_sem_encoded'] = df_casual['dia_semana'].map(map_dia_sem)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_casual['dia_sem_encoded'] = df_casual['dia_semana'].map(map_dia_sem)


In [30]:
df_casual.drop('dia_semana', axis=1, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_casual.drop('dia_semana', axis=1, inplace=True)


In [31]:
df_casual.head()

Unnamed: 0,dteday,year,month,weathersit,temp_es,windspeed_es,casual,laborables,dia_sem_encoded
0,01-01-2018,0,1,2,-0.827613,-0.387833,331,0,4
1,02-01-2018,0,1,2,-0.722069,0.748899,131,1,3
2,03-01-2018,0,1,1,-1.635432,0.745931,120,1,1
3,04-01-2018,0,1,1,-1.61556,-0.389769,108,1,1
4,05-01-2018,0,1,1,-1.468226,-0.046477,82,1,0


<font color=orange>laborables

In [32]:
dummies = pd.get_dummies(df_casual['laborables'], prefix_sep='_', prefix='laborables', dtype= int)

In [33]:
df_casual = pd.concat([df_casual, dummies], axis=1)

In [34]:
df_casual.drop('laborables', axis=1, inplace=True )

In [35]:
df_casual.head()

Unnamed: 0,dteday,year,month,weathersit,temp_es,windspeed_es,casual,dia_sem_encoded,laborables_0,laborables_1
0,01-01-2018,0,1,2,-0.827613,-0.387833,331,4,1,0
1,02-01-2018,0,1,2,-0.722069,0.748899,131,3,0,1
2,03-01-2018,0,1,1,-1.635432,0.745931,120,1,0,1
3,04-01-2018,0,1,1,-1.61556,-0.389769,108,1,0,1
4,05-01-2018,0,1,1,-1.468226,-0.046477,82,0,0,1


<font color=orange>weathersit


In [36]:
map_weathersit = {1:3, 2:2, 3:1}

In [37]:
df_casual['weathersit_encoded'] = df_casual['weathersit'].map(map_weathersit)

In [38]:
df_casual.drop('weathersit', axis=1, inplace=True)

In [39]:
df_casual.head()

Unnamed: 0,dteday,year,month,temp_es,windspeed_es,casual,dia_sem_encoded,laborables_0,laborables_1,weathersit_encoded
0,01-01-2018,0,1,-0.827613,-0.387833,331,4,1,0,2
1,02-01-2018,0,1,-0.722069,0.748899,131,3,0,1,2
2,03-01-2018,0,1,-1.635432,0.745931,120,1,0,1,3
3,04-01-2018,0,1,-1.61556,-0.389769,108,1,0,1,3
4,05-01-2018,0,1,-1.468226,-0.046477,82,0,0,1,3


<font color=orange>month


In [40]:
map_month={1:1, 2:2, 12:3, 3:4, 11:4, 4:5, 10:5, 5:6, 6:7, 9:7, 7:7, 8:7}

In [41]:
df_casual['month_encoded'] = df_casual['month'].map(map_month)

In [42]:
df_casual.drop('month', axis=1, inplace=True)

In [43]:
df_casual.head()

Unnamed: 0,dteday,year,temp_es,windspeed_es,casual,dia_sem_encoded,laborables_0,laborables_1,weathersit_encoded,month_encoded
0,01-01-2018,0,-0.827613,-0.387833,331,4,1,0,2,1
1,02-01-2018,0,-0.722069,0.748899,131,3,0,1,2,1
2,03-01-2018,0,-1.635432,0.745931,120,1,0,1,3,1
3,04-01-2018,0,-1.61556,-0.389769,108,1,0,1,3,1
4,05-01-2018,0,-1.468226,-0.046477,82,0,0,1,3,1


In [44]:
lista = ["year"]

df_encoded = pd.DataFrame()


for columna in lista:
    df_dummies = pd.get_dummies(df_casual[columna], prefix_sep = "_", prefix = columna, dtype = int)

    df_encoded = pd.concat([df_encoded, df_dummies], axis = 1)

In [45]:
df_casual = pd.concat([df_casual, df_encoded], axis = 1)
df_casual.head()

Unnamed: 0,dteday,year,temp_es,windspeed_es,casual,dia_sem_encoded,laborables_0,laborables_1,weathersit_encoded,month_encoded,year_0,year_1
0,01-01-2018,0,-0.827613,-0.387833,331,4,1,0,2,1,1,0
1,02-01-2018,0,-0.722069,0.748899,131,3,0,1,2,1,1,0
2,03-01-2018,0,-1.635432,0.745931,120,1,0,1,3,1,1,0
3,04-01-2018,0,-1.61556,-0.389769,108,1,0,1,3,1,1,0
4,05-01-2018,0,-1.468226,-0.046477,82,0,0,1,3,1,1,0


In [46]:
df_casual.drop(lista, axis = 1, inplace=True)
df_casual.head(2)

Unnamed: 0,dteday,temp_es,windspeed_es,casual,dia_sem_encoded,laborables_0,laborables_1,weathersit_encoded,month_encoded,year_0,year_1
0,01-01-2018,-0.827613,-0.387833,331,4,1,0,2,1,1,0
1,02-01-2018,-0.722069,0.748899,131,3,0,1,2,1,1,0


### <font color=violet> Encoding df_registered

<font color=orange>dia_semana


In [47]:
map_dia_semr= {4:3, 2:2, 3:3, 5:3, 6:2, 1:1, 0:1 }

In [48]:
df_registered['dia_sem_encoded'] = df_registered['dia_semana'].map(map_dia_semr)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_registered['dia_sem_encoded'] = df_registered['dia_semana'].map(map_dia_semr)


In [49]:
df_registered.drop('dia_semana', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_registered.drop('dia_semana', axis=1, inplace=True)


In [50]:
df_registered.head()

Unnamed: 0,dteday,year,month,weathersit,temp_es,windspeed_es,registered,laborables,dia_sem_encoded
0,01-01-2018,0,1,2,-0.827613,-0.387833,654,0,1
1,02-01-2018,0,1,2,-0.722069,0.748899,670,1,1
2,03-01-2018,0,1,1,-1.635432,0.745931,1229,1,2
3,04-01-2018,0,1,1,-1.61556,-0.389769,1454,1,3
4,05-01-2018,0,1,1,-1.468226,-0.046477,1518,1,3


<font color=orange>laborables


In [51]:
dummiesr = pd.get_dummies(df_registered['laborables'], prefix_sep='_', prefix='laborables', dtype= int)

In [52]:
df_registered = pd.concat([df_registered, dummiesr], axis=1)


In [53]:
df_registered.drop('laborables', axis=1, inplace=True )

In [54]:
df_registered.head()

Unnamed: 0,dteday,year,month,weathersit,temp_es,windspeed_es,registered,dia_sem_encoded,laborables_0,laborables_1
0,01-01-2018,0,1,2,-0.827613,-0.387833,654,1,1,0
1,02-01-2018,0,1,2,-0.722069,0.748899,670,1,0,1
2,03-01-2018,0,1,1,-1.635432,0.745931,1229,2,0,1
3,04-01-2018,0,1,1,-1.61556,-0.389769,1454,3,0,1
4,05-01-2018,0,1,1,-1.468226,-0.046477,1518,3,0,1


<font color=orange>weathersit


In [55]:
map_weathersitr = {1:3, 2:2, 3:1}

In [56]:
df_registered['weathersit_encoded'] = df_registered['weathersit'].map(map_weathersitr)

In [57]:
df_registered.drop('weathersit', axis=1, inplace=True)

In [58]:
df_registered.head()

Unnamed: 0,dteday,year,month,temp_es,windspeed_es,registered,dia_sem_encoded,laborables_0,laborables_1,weathersit_encoded
0,01-01-2018,0,1,-0.827613,-0.387833,654,1,1,0,2
1,02-01-2018,0,1,-0.722069,0.748899,670,1,0,1,2
2,03-01-2018,0,1,-1.635432,0.745931,1229,2,0,1,3
3,04-01-2018,0,1,-1.61556,-0.389769,1454,3,0,1,3
4,05-01-2018,0,1,-1.468226,-0.046477,1518,3,0,1,3


<font color=orange>month


In [59]:
map_monthr={1:1, 2:2, 3:3, 12:4, 11:6, 4:5, 10:7, 5:6, 6:8, 9:8, 7:7, 8:8}

In [60]:
df_registered['month_encoded'] = df_registered['month'].map(map_monthr)

In [61]:
df_registered.drop('month', axis=1, inplace=True)

In [62]:
df_registered.head()

Unnamed: 0,dteday,year,temp_es,windspeed_es,registered,dia_sem_encoded,laborables_0,laborables_1,weathersit_encoded,month_encoded
0,01-01-2018,0,-0.827613,-0.387833,654,1,1,0,2,1
1,02-01-2018,0,-0.722069,0.748899,670,1,0,1,2,1
2,03-01-2018,0,-1.635432,0.745931,1229,2,0,1,3,1
3,04-01-2018,0,-1.61556,-0.389769,1454,3,0,1,3,1
4,05-01-2018,0,-1.468226,-0.046477,1518,3,0,1,3,1


In [63]:
lista = ["year"]

df_encoded = pd.DataFrame()


for columna in lista:
    df_dummies = pd.get_dummies(df_registered[columna], prefix_sep = "_", prefix = columna, dtype = int)

    df_encoded = pd.concat([df_encoded, df_dummies], axis = 1)

In [64]:
df_registered = pd.concat([df_registered, df_encoded], axis = 1)
df_registered.head()

Unnamed: 0,dteday,year,temp_es,windspeed_es,registered,dia_sem_encoded,laborables_0,laborables_1,weathersit_encoded,month_encoded,year_0,year_1
0,01-01-2018,0,-0.827613,-0.387833,654,1,1,0,2,1,1,0
1,02-01-2018,0,-0.722069,0.748899,670,1,0,1,2,1,1,0
2,03-01-2018,0,-1.635432,0.745931,1229,2,0,1,3,1,1,0
3,04-01-2018,0,-1.61556,-0.389769,1454,3,0,1,3,1,1,0
4,05-01-2018,0,-1.468226,-0.046477,1518,3,0,1,3,1,1,0


In [65]:
df_registered.drop(lista, axis = 1, inplace=True)
df_registered.head(2)

Unnamed: 0,dteday,temp_es,windspeed_es,registered,dia_sem_encoded,laborables_0,laborables_1,weathersit_encoded,month_encoded,year_0,year_1
0,01-01-2018,-0.827613,-0.387833,654,1,1,0,2,1,1,0
1,02-01-2018,-0.722069,0.748899,670,1,0,1,2,1,1,0


In [66]:
df_casual.head(2)

Unnamed: 0,dteday,temp_es,windspeed_es,casual,dia_sem_encoded,laborables_0,laborables_1,weathersit_encoded,month_encoded,year_0,year_1
0,01-01-2018,-0.827613,-0.387833,331,4,1,0,2,1,1,0
1,02-01-2018,-0.722069,0.748899,131,3,0,1,2,1,1,0


In [67]:
df_casual.to_csv("./Datos/casual_encoded.csv")

In [68]:
df_registered.to_csv("./Datos/registered_encoded.csv")