# Neural network

Modelo de red neuronal que sirve para realizar la prediccion y analizar resultados.

- Lee los datos del df conjunto.
- Lee los datos que se usarán de test.
- Crea el modelo.
- Realiza la predicción.
- Analiza resultados.

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

df = pd.read_parquet('data/data.parquet')
test_data = pd.read_csv("test/submission.csv")

In [20]:
df.head(5)

Unnamed: 0,index,station_id,houryear,num_bikes_available,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,num_docks_available,is_installed,is_renting,is_returning,...,dayofweek,time2,Rain,Rain_Lectura,year,dayyear,Wind,Wind_Lectura,AñoMes,traffic
0,0,1,2081,16.0,16.0,0.0,14.0,1.0,1.0,1.0,...,3,2019-03-28,0,0.0,2019,87,0,6.6,201903,
1,4665,70,2081,21.0,21.0,0.0,5.0,1.0,1.0,1.0,...,3,2019-03-28,0,0.0,2019,87,0,6.6,201903,
2,29692,425,2081,27.0,27.0,0.0,0.0,1.0,1.0,1.0,...,3,2019-03-28,0,0.0,2019,87,0,6.6,201903,
3,4741,71,2081,18.0,18.0,0.0,3.0,1.0,1.0,1.0,...,3,2019-03-28,0,0.0,2019,87,0,6.6,201903,
4,22422,316,2081,25.0,25.0,0.0,1.0,1.0,1.0,1.0,...,3,2019-03-28,0,0.0,2019,87,0,6.6,201903,


In [21]:
test_data.head(5)

Unnamed: 0,index,station_id,month,day,hour,ctx-4,ctx-3,ctx-2,ctx-1,year_x,...,dayyear_x,houryear,dayofweek,time2,Rain,Rain_Lectura,year,dayyear,Wind,Wind_Lectura
0,0,394,3,7,8,0.753086,0.780864,0.799383,0.824074,2023,...,66,1568,1,2023-03-07,0,0.0,2023,66,0,5.8
1,1,337,3,23,12,0.463768,0.536232,0.532609,0.601449,2023,...,82,1956,3,2023-03-23,0,0.0,2023,82,0,6.2
2,2,368,3,31,1,0.787037,0.709877,0.611111,0.601852,2023,...,90,2137,4,2023-03-31,0,0.0,2023,90,0,7.4
3,3,327,3,23,15,0.753472,0.809028,0.819444,0.736111,2023,...,82,1959,3,2023-03-23,0,0.0,2023,82,0,6.2
4,4,328,3,4,20,0.861111,0.802469,0.814815,0.82716,2023,...,63,1508,5,2023-03-04,0,0.0,2023,63,0,10.8


In [9]:
df.dtypes

index                                     int64
station_id                                int64
houryear                                  int64
num_bikes_available                     float64
num_bikes_available_types.mechanical    float64
num_bikes_available_types.ebike         float64
num_docks_available                     float64
is_installed                            float64
is_renting                              float64
is_returning                            float64
is_charging_station                     float64
ttl                                     float64
year_x                                    int64
month                                   float64
hour                                    float64
dayyear_x                               float64
Llocs                                   float64
time                                     object
dayofweek                                 int64
time2                                    object
Rain                                    

In [10]:
test_data.dtypes

index             int64
station_id        int64
month             int64
day               int64
hour              int64
ctx-4           float64
ctx-3           float64
ctx-2           float64
ctx-1           float64
year_x            int64
time             object
dayyear_x         int64
houryear          int64
dayofweek         int64
time2            object
Rain              int64
Rain_Lectura    float64
year              int64
dayyear           int64
Wind              int64
Wind_Lectura    float64
dtype: object

In [3]:
test_data

Unnamed: 0,index,station_id,month,day,hour,ctx-4,ctx-3,ctx-2,ctx-1,year_x,...,dayyear_x,houryear,dayofweek,time2,Rain,Rain_Lectura,year,dayyear,Wind,Wind_Lectura
0,0,394,3,7,8,0.753086,0.780864,0.799383,0.824074,2023,...,66,1568,1,2023-03-07,0,0.0,2023,66,0,5.8
1,1,337,3,23,12,0.463768,0.536232,0.532609,0.601449,2023,...,82,1956,3,2023-03-23,0,0.0,2023,82,0,6.2
2,2,368,3,31,1,0.787037,0.709877,0.611111,0.601852,2023,...,90,2137,4,2023-03-31,0,0.0,2023,90,0,7.4
3,3,327,3,23,15,0.753472,0.809028,0.819444,0.736111,2023,...,82,1959,3,2023-03-23,0,0.0,2023,82,0,6.2
4,4,328,3,4,20,0.861111,0.802469,0.814815,0.827160,2023,...,63,1508,5,2023-03-04,0,0.0,2023,63,0,10.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54994,54994,269,3,14,3,0.478261,0.478261,0.478261,0.478261,2023,...,73,1731,1,2023-03-14,0,0.0,2023,73,0,7.6
54995,54995,280,3,1,5,0.098765,0.074074,0.074074,0.077160,2023,...,60,1421,2,2023-03-01,0,0.0,2023,60,0,11.6
54996,54996,180,3,20,16,0.782680,0.821895,0.812092,0.777778,2023,...,79,1888,0,2023-03-20,0,0.0,2023,79,0,7.3
54997,54997,277,3,14,11,0.774691,0.953704,0.972222,0.472222,2023,...,73,1739,1,2023-03-14,0,0.0,2023,73,0,7.6


In [8]:
X = df
target = ['num_docks_available']
seed = 42
batch_size = 32
epochs = 10
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

In [22]:
y_test

Unnamed: 0,num_docks_available
6633923,23.166667
37760722,3.000000
27660296,10.916667
43971579,20.416667
25180265,18.583333
...,...
23272491,8.666667
3905533,0.166667
34755168,7.000000
36805833,13.272727


In [13]:
X_test.dtypes

index                                     int64
station_id                                int64
houryear                                  int64
num_bikes_available                     float64
num_bikes_available_types.mechanical    float64
num_bikes_available_types.ebike         float64
num_docks_available                     float64
is_installed                            float64
is_renting                              float64
is_returning                            float64
is_charging_station                     float64
ttl                                     float64
year_x                                    int64
month                                   float64
hour                                    float64
dayyear_x                               float64
Llocs                                   float64
time                                     object
dayofweek                                 int64
time2                                    object
Rain                                    

### Selección de características

In [None]:
#Variables globales
seed = 42
batch_size = 32
epochs = 10

features = ['station_id','num_bikes_available', 'num_bikes_available_types.mechanical', 'num_bikes_available_types.ebike',
            'num_docks_available', 'hour', 'year','dayofweek', 'Rain', 'Wind']
target = ['num_docks_available']

### Dividir en train y test, normalizar datos y PCA

In [None]:
# train_test_split
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

# Normalizar los datos
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(test_data)
#X_test_scaled = scaler.transform(X_test)

# Aplicar Análisis de Componentes Principales
pca = PCA(n_components=0.95)  # Mantener el 95% de la varianza explicada
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(test_data)
#X_test_pca = pca.transform(X_test_scaled)

### Definicion de la red neuronal, compilacion y entrenamiento del modelo

In [None]:
# Modelo
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train_pca.shape[1],)))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(8, activation='relu'))
model.add(Dense(1))

# Compile
model.compile(optimizer=Adam(learning_rate=0.001), loss=MeanSquaredError())

# Entrenar el modelo
history = model.fit(X_train_pca, y_train,batch_size=batch_size,epochs=epochs,verbose=1,validation_data=(X_test_pca, y_test))

### Predicción

In [None]:
# Prediccion
predictions = model.predict(X_test_pca)

### Evaluación

In [None]:
# Evaluación
score = model.evaluate(X_test_pca, y_test, verbose=0)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
print("Pérdida en el conjunto de prueba:", score)
print('MSE:', mse)
print('R^2:', r2)

### Comparar las predicciones con los valores reales

In [None]:
df_predictions = pd.DataFrame({'Predicciones': predictions.flatten(), 'Valores Reales': y_test.values.flatten()})
df_predictions

### Análisis descriptivo

In [None]:
describe_results = df_predictions.describe()
describe_results

### Visualización de resultados

In [None]:
import matplotlib.pyplot as plt

# Graficar las predicciones frente a las etiquetas reales
plt.scatter(y_test, predictions)
plt.xlabel('Etiquetas reales')
plt.ylabel('Predicciones')
plt.title('Comparación entre las etiquetas reales y las predicciones')
plt.show()