In [None]:
# Execute the feature-engineering notebook before anything else.
# NOTE(review): later cells use names that are never defined in this notebook
# (e.g. `sampled_df_with_added_features_indexed`, `sampled_df_with_added_features`,
# `images_path`, `os`); presumably they are created by this run -- TODO confirm.
%run './feature_engineering.ipynb'

In [None]:
# # Vectorize the features, scale them and run PCA
from pyspark.ml import Pipeline
from pyspark.ml.feature import PCA, StandardScaler, VectorAssembler

# Context:
#   PCA is run against the features used in the multi-class classification notebook:
#   [multi_class logistic regression](notebooks/regression-model-stations-to-bike-type.ipynb)
#   The Logistic Regression model there uses
#       labelCol: rideable_type_index
#       features = ['day_period_index', 'start_station_id_index', 'end_station_id_index']
#       rideable_type - 0: classic_bike, 1: docked_bike, 2: electric_bike
#   Here the label column (`rideable_type_index`) is added to the PCA input as well, so
#   the PCA runs on four columns.
#   The docked_bike type is discarded as it's not relevant for the predictive analysis of
#   the number of bikes needed at each station at different times of the day.

# Filter on the frame's OWN column. The previous version referenced the column through a
# different DataFrame (`sampled_df_with_added_features`), which only resolves while both
# frames share the same lineage.
# The name is deliberately re-bound because the later cells read it; the filter is
# idempotent, so re-running this cell is safe.
sampled_df_with_added_features_indexed = sampled_df_with_added_features_indexed.filter(
    sampled_df_with_added_features_indexed['rideable_type'] != 'docked_bike'
)

# NOTE(review): `df_workdays` is computed but NOT used below -- the pipeline and the PCA are
# fitted on all days. If the workday-only restriction mentioned for the classification
# model is wanted here too, pass `df_workdays` to `pipeline.fit(...).transform(...)`.
df_workdays = sampled_df_with_added_features_indexed.filter(
    sampled_df_with_added_features_indexed['week_day'] == 'Workday'
)

features = ['day_period_index', 'start_station_id_index', 'end_station_id_index', 'rideable_type_index']

# Stage 1: pack the individual columns into a single vector column.
assembler = VectorAssembler(inputCols=features, outputCol="features")

# Stage 2: scale every feature to unit standard deviation (no mean-centering).
scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=False)

# Combine the VectorAssembler and StandardScaler into a Pipeline
pipeline = Pipeline(stages=[assembler, scaler])

# Fit and transform the DataFrame using the defined pipeline
sampled_df_scaled = pipeline.fit(sampled_df_with_added_features_indexed).transform(
    sampled_df_with_added_features_indexed)

# Apply PCA, keeping as many components as there are input features (4).
pca = PCA(k=len(features), inputCol="scaled_features", outputCol="pca_features")
pca_model = pca.fit(sampled_df_scaled)
pca_result = pca_model.transform(sampled_df_scaled)

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Extract the PCA loadings.
# `PCAModel.pc` is laid out as (n_features, n_components): each COLUMN is one principal
# component. Transpose it so that rows are components and columns are input features,
# which is the orientation the tick labels below describe. Without the transpose the
# heatmap labels rows and columns the wrong way round.
loadings = pca_model.pc.toArray().T

# Read the feature names from the assembler that built the PCA input instead of the global
# `features` list, which a later cell re-binds to a shorter list.
loading_feature_names = assembler.getInputCols()

print(loadings.shape)  # (n_components, n_features)
assert loadings.shape[1] == len(loading_feature_names), \
    "number of loading columns does not match the number of assembled features"

component_labels = [f'PC{i+1}' for i in range(loadings.shape[0])]

plt.figure(figsize=(8, 6))
sns.heatmap(loadings, annot=True, cmap='inferno', yticklabels=component_labels, xticklabels=loading_feature_names)
plt.title('PCA Loadings Heatmap')
plt.ylabel('Principal Components')
plt.xlabel('Features')
plt.savefig(os.path.join(images_path, 'heatmap_PCA_model_features.png'))
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Bring the projected vectors to the driver. `pca_pd` was previously used without being
# defined anywhere in this notebook, so the cell failed on a fresh kernel.
pca_pd = pca_result.select('pca_features').toPandas()

# One column per principal component. The number of components is read from the fitted
# model (columns of the loading matrix) instead of being hard-coded: the previous version
# indexed positions 4 and 5 of a 4-component vector, which raises IndexError.
n_components = pca_model.pc.numCols
pc_columns = [f'PC{i + 1}' for i in range(n_components)]
for position, column_name in enumerate(pc_columns):
    # Bind `position` as a default argument so each lambda keeps its own index.
    pca_pd[column_name] = pca_pd['pca_features'].apply(lambda vec, position=position: vec[position])
pd_scatter = pca_pd[pc_columns].copy()

# Kept as in the original cell: the correlation-matrix cell that follows reads this list.
features = ['day_period_index', 'start_station_id_index', 'end_station_id_index']

# Plotting PCA results: first vs. second principal component, as the title states.
# (`cmap` was dropped: without a `c` argument matplotlib ignores it and emits a warning.)
plt.figure(figsize=(10, 8))
plt.scatter(pca_pd['PC1'], pca_pd['PC2'])
plt.title('PCA: First vs. Second Principal Component')
plt.xlabel('Principal Component 1 (PC1)')
plt.ylabel('Principal Component 2 (PC2)')
plt.grid(True)
# File name updated to match the components that are actually plotted.
plt.savefig(os.path.join(images_path, 'pca_scatter_plot_PC1_and_PC2.png'))
plt.show()


In [None]:
# # Correlation matrix of the features before applying PCA
import matplotlib.pyplot as plt
import seaborn as sns

# Collect the (pre-PCA) Spark frame into pandas and correlate the columns currently
# listed in `features`.
pd_sampled_df = sampled_df_with_added_features_indexed.toPandas()
corr = pd_sampled_df[features].corr()

# Draw the annotated heatmap on an explicitly created figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corr,
    annot=True,
    fmt=".2f",
    cmap='inferno',
    linewidth=.5,
    cbar=True,
    square=True,
    ax=ax,
)
ax.set_title('Feature Correlation Matrix')
plt.xticks(rotation=45)
fig.savefig(os.path.join(images_path, 'selected_features_correlation_matrix.png'))
plt.show()
