In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.lines import Line2D
from IPython.display import display  # to display variables in a "nice" way

# Wide frames: let pandas render up to 200 columns before truncating.
# (To lift the row limit as well: pd.set_option("display.max_rows", 9999).)
pd.set_option("display.max_columns", 200)

Setting the Random State:

In [None]:
# Student IDs of the group; the smallest one is used as the random state.
IDs = [312920, 312919, 313385]
rs = np.asarray(IDs).min()

# Seed NumPy's global generator so the random draws below are reproducible.
np.random.seed(rs)

Exercise 1 (Loading and Preparing the Data):

(a)

In [None]:
# Location of the curated bikez CSV (a bare file name, so it is resolved
# against the current working directory).
bikez_path = 'cla4lsp22_bikez_curated.csv'

# Read the complete dataset into df_tot; workdf is derived from it below.
df_tot = pd.read_csv(filepath_or_buffer=bikez_path)

##### b) Generate x as a random number: 0, 1, or 2. workdf is the subset of df_tot containing only the rows whose Year has remainder x when divided by 3 (Year % 3 == x)

In [None]:
# Draw a float uniformly from [0, 3) and truncate it, giving 0, 1 or 2.
x = int(np.random.uniform(low=0, high=3))

# Keep only the rows whose Year leaves remainder x when divided by 3.
year_matches = df_tot['Year'].mod(3).eq(x)
workdf = df_tot.loc[year_matches]

display(workdf)

##### c) Randomly remove from workdf two columns among the features: Front/Rear brakes, Front/Rear tire, Front/Rear suspension.

In [None]:
# Candidate columns: the front/rear pairs for brakes, tires and suspension.
temp_features = [
    'Front brakes', 'Rear brakes',
    'Front tire', 'Rear tire',
    'Front suspension', 'Rear suspension',
]

# Pick two distinct candidates (sampling without replacement) and drop them.
feat1, feat2 = np.random.choice(temp_features, size=2, replace=False)
workdf = workdf.drop([feat1, feat2], axis=1)

Denote:
labels: the columns Brand, Model, Year, Category, Rating;
features: all the other ones

In [None]:
# Split the column names by position: the first five are the labels,
# everything after them is a feature.
all_columns = workdf.columns.tolist()
labels = all_columns[:5]
features = all_columns[5:]


d) Clean the dataset workdf from missing values in the feature columns (if needed).

##### Show percentage of missing values for each column of workdf

In [None]:
# Share of missing entries per column, expressed as a percentage.
na_percentage = workdf.isna().mean().mul(100)
display(na_percentage)

# Columns that have at least one missing value.
na_percentage_filtered = na_percentage.loc[na_percentage.gt(0)]

# Of those, keep only the ones that are not label columns.
na_features = na_percentage_filtered.index.difference(labels).tolist()

Drop the rows with NaN values in 'Displacement (ccm)', since these NaN values are sparse

In [None]:
# Discard every row that has no value for 'Displacement (ccm)'.
workdf = workdf.dropna(subset=['Displacement (ccm)'])


Fill the NaN values in the other columns with the column mean

In [None]:
# Mean of each feature column that had missing values ...
column_means = {name: workdf[name].mean() for name in na_features}

# ... used as that column's replacement for its remaining NaN entries.
workdf = workdf.fillna(value=column_means)

Exercise 2 (Encoding of Categorical Data):

In [None]:
# Feature columns that hold strings (object dtype); label columns are not touched.
string_cols = workdf.select_dtypes(include=['object']).columns.intersection(features)

# Build the indicator columns of every string column up front. Each value is
# split on '.' before encoding and the new columns are named '<column>_<token>'.
dummy_frames = [
    workdf[col].str.get_dummies(sep='.').add_prefix(col + '_')
    for col in string_cols
]

# Replace the original string columns with their indicator columns, which are
# appended on the right in the order of string_cols.
workdf = pd.concat([workdf.drop(columns=string_cols), *dummy_frames], axis=1)

# show the encoded DataFrame
display(workdf)

# feature-only frame: workdf without the label columns
Xworkdf = workdf.drop(columns=labels)

#### Exercise 3 (Preprocessing and PCA): Preprocess the data, before applying the PCA

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every column of Xworkdf (zero mean, unit variance).
scaler = StandardScaler()
standardized_values = scaler.fit(Xworkdf).transform(Xworkdf)

# Wrap the resulting array in a DataFrame that reuses Xworkdf's column names.
# No index is passed, so the frame gets a fresh default index.
Xworkdf_std = pd.DataFrame(standardized_values, columns=Xworkdf.columns)


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every column of Xworkdf with a min-max scaler.
# Note that this rebinds `scaler`, which previously held the StandardScaler.
scaler = MinMaxScaler()
minmax_values = scaler.fit(Xworkdf).transform(Xworkdf)

# Wrap the resulting array in a DataFrame that reuses Xworkdf's column names.
# No index is passed, so the frame gets a fresh default index.
Xworkdf_mm = pd.DataFrame(minmax_values, columns=Xworkdf.columns)


To compare the variances of the original dataset Xworkdf with the variances of the datasets Xworkdf_std and Xworkdf_mm, we can calculate the variances of the non-categorical features in all three datasets.

Assuming that Xworkdf contains both categorical and non-categorical features, we can select only the non-categorical features as follows:

In [None]:
# The string feature columns were replaced by indicator columns during the
# one-hot encoding, so Xworkdf has no object-dtype column left and a dtype
# filter (exclude='object') would select every column, dummies included.
# The non-categorical features are instead the columns of Xworkdf that were
# already listed in `features` before the encoding: the indicator columns carry
# a '<original column>_' prefix and are therefore not part of that list.
# Boolean-mask indexing keeps the result an Index in Xworkdf's column order.
noncat_features = Xworkdf.columns[Xworkdf.columns.isin(features)]

Then, we can calculate the variances of the non-categorical features in all three datasets:

In [None]:
# Per-column variance of the non-categorical features, computed the same way
# for the original, the standardized and the min-max scaled data (in that order).
variances_original, variances_standardized, variances_minmax = (
    frame[noncat_features].var()
    for frame in (Xworkdf, Xworkdf_std, Xworkdf_mm)
)


Now we can compare the variances of the non-categorical features in the three dataframes. We can plot the variances using a bar plot:

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 5))

# Only the first 20 features are drawn: there are too many to show them all legibly.
top_original = variances_original.iloc[:20]
ax.bar(top_original.index, top_original, label='Original')

# Feature names are long, so the tick labels are drawn vertically.
ax.tick_params(axis='x', labelrotation=90)
ax.set_title('Variances of the original data')
ax.set_ylabel('Variance')
ax.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 5))

# Only the first 20 features are drawn: there are too many to show them all legibly.
# Both series are drawn on the same axes, the min-max bars on top of the standardized ones.
top_standardized = variances_standardized.iloc[:20]
top_minmax = variances_minmax.iloc[:20]
ax.bar(top_standardized.index, top_standardized, label='Standardized')
ax.bar(top_minmax.index, top_minmax, label='Min-Max Scaled')

# Feature names are long, so the tick labels are drawn vertically.
ax.tick_params(axis='x', labelrotation=90)
ax.set_title('Comparison of Variances')
ax.set_ylabel('Variance')
ax.legend()
plt.show()

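This produces two bar plots: the first shows the variances of the original dataset, and the second overlays the variances of the standardized and min-max scaled datasets. In both, the height of each bar represents the variance of the corresponding non-categorical feature.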

From the plots, we can observe that the variances of the non-categorical features differ across the three datasets. Specifically, the variances of the standardized dataset Xworkdf_std are all approximately equal to 1, as expected for standardized data (they are not exactly 1 because StandardScaler divides by the population standard deviation, while pandas' `var()` computes the sample variance). On the other hand, the variances of the min-max scaled dataset Xworkdf_mm are all well below 1: MinMaxScaler confines every value to the range [0, 1], which bounds the variance at roughly 0.25. The variances of the original dataset Xworkdf are not normalized, and can be much larger than 1.

Based on this analysis, we can infer that scaling the non-categorical features using either a StandardScaler or a MinMaxScaler can help to ensure that these features are on the same scale and have comparable variances. This can be particularly important for certain machine learning algorithms, such as those that rely on distance-based calculations or regularization, where features with large variances can have a disproportionate impact on the algorithm's performance. However, the choice of scaler may depend on the specific requirements of the problem at hand, and in some cases it may be appropriate to use a different scaler or no scaler at all.

Here we use the PCA class from the sklearn.decomposition module to fit a PCA model to each of the three DataFrames, with n_components chosen with the aim of retaining all components. We then calculate the cumulative explained variance from the explained_variance_ratio_ attribute of the fitted PCA object, and store the results in three arrays: cumulative_variances_original, cumulative_variances_standardized, and cumulative_variances_minmax.

In [None]:
from sklearn.decomposition import PCA

# Keep every principal component: with n_components left at its default (None),
# PCA retains min(n_samples, n_features) components. n_components='mle' would
# instead let Minka's MLE choose how many components to keep, which is not
# "all components" and can differ between the three datasets.
pca = PCA()

# The same estimator is refitted on each dataset in turn (fit returns the
# estimator itself). np.cumsum returns a new array, so each curve stays valid
# after the next fit overwrites the estimator's state.

# Cumulative explained variance on the original data
cumulative_variances_original = np.cumsum(pca.fit(Xworkdf).explained_variance_ratio_)

# Cumulative explained variance on the standardized data
cumulative_variances_standardized = np.cumsum(pca.fit(Xworkdf_std).explained_variance_ratio_)

# Cumulative explained variance on the min-max scaled data.
# This is the last fit, so `pca` is left fitted on Xworkdf_mm.
cumulative_variances_minmax = np.cumsum(pca.fit(Xworkdf_mm).explained_variance_ratio_)


In [None]:
# Cumulative explained-variance curves of the three PCA fits, with their legend labels.
curves = (
    (cumulative_variances_original, 'Original'),
    (cumulative_variances_standardized, 'Standardized'),
    (cumulative_variances_minmax, 'Min-Max Scaled'),
)

fig, ax = plt.subplots()
for cumulative, name in curves:
    # Entry i of a curve is the variance explained by the first i + 1 components,
    # so plot against 1..n instead of the default 0-based positions; otherwise
    # the "Number of Components" axis is off by one.
    component_counts = np.arange(1, len(cumulative) + 1)
    ax.plot(component_counts, cumulative, label=name)

ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('Cumulative Explained Variance by Number of Components')
ax.legend()
plt.show()

This will produce a plot with three lines, one for each DataFrame, with the x-axis representing the number of components and the y-axis representing the cumulative explained variance.

Looking at the results, we can see that in all three cases, the cumulative explained variance increases as the number of components increases, with the largest increase occurring in the first few components. The standardized dataset Xworkdf_std and the min-max scaled dataset Xworkdf_mm both have a more linear increase in cumulative explained variance than the original dataset Xworkdf, indicating that the scaling has reduced the impact of features with large variances on the principal components.

Compared to the previous analysis, we can now see that applying PCA has a significant impact on the amount of variance explained by the non-categorical features. In particular, we can observe that a small number of principal components explain a large amount of the variance in the standardized and min-max scaled datasets, suggesting that these scalers may be effective in reducing the impact of features with large variances. On the other hand, the original dataset has a more gradual increase in cumulative explained variance, indicating that the non-categorical features in this dataset may have a wider range of variances. Depending on the specific requirements of the problem, it may be necessary to use a scaler or feature selection technique to address this issue.