### PCA

In [30]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

import pandas as pd

# Load the dataset
# file_path = '/mnt/data/singapore_listings.csv'
# data = pd.read_csv(file_path)
# data = pd.read_csv('datasets/singapore_listings.csv')
# data = pd.read_csv('datasets/newyorkcity_listings.csv')
data = pd.read_csv('datasets/madrid_listings.csv')

# Display the first few rows of the dataset and its basic info
data.head(), data.info()

# Drop unnecessary columns
data_pca = data.drop(['id', 'name', 'host_id', 'host_name', 'last_review'], axis=1)

# Check for missing values
missing_values = data_pca.isnull().sum()

# Define categorical and numerical columns
categorical_cols = data_pca.select_dtypes(include=['object']).columns
numerical_cols = data_pca.select_dtypes(include=['float64', 'int64']).columns

missing_values, categorical_cols, numerical_cols

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19618 entries, 0 to 19617
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              19618 non-null  int64  
 1   name                            19615 non-null  object 
 2   host_id                         19618 non-null  int64  
 3   host_name                       19091 non-null  object 
 4   neighbourhood_group             19618 non-null  object 
 5   neighbourhood                   19618 non-null  object 
 6   latitude                        19618 non-null  float64
 7   longitude                       19618 non-null  float64
 8   room_type                       19618 non-null  object 
 9   price                           19618 non-null  int64  
 10  minimum_nights                  19618 non-null  int64  
 11  number_of_reviews               19618 non-null  int64  
 12  last_review                     

(neighbourhood_group                  0
 neighbourhood                        0
 latitude                             0
 longitude                            0
 room_type                            0
 price                                0
 minimum_nights                       0
 number_of_reviews                    0
 reviews_per_month                 5637
 calculated_host_listings_count       0
 availability_365                     0
 dtype: int64,
 Index(['neighbourhood_group', 'neighbourhood', 'room_type'], dtype='object'),
 Index(['latitude', 'longitude', 'price', 'minimum_nights', 'number_of_reviews',
        'reviews_per_month', 'calculated_host_listings_count',
        'availability_365'],
       dtype='object'))

In [31]:

# Review the preprocessing configuration
# Recap of the current preprocessing steps

# Numerical features for scaling
numerical_features = ['latitude', 'longitude', 'price', 'minimum_nights', 'number_of_reviews',
                      'reviews_per_month', 'calculated_host_listings_count', 'availability_365']

# Categorical features for one-hot encoding
categorical_features = ['neighbourhood_group', 'neighbourhood', 'room_type']

# Re-create the preprocessing transformers
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))])  # Ensure dense output

# Combine preprocessing for numerical and categorical data into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

# Display the final setup of preprocessing steps for confirmation before applying PCA
preprocessor

In [32]:
# Append PCA to the preprocessing pipeline.
# Use enough components to explain 95% of the variance
pca_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('pca', PCA(n_components=0.95))])

# Fit the PCA pipeline to the data
pca_pipeline.fit(data_pca)

# Access the PCA component to see results
pca = pca_pipeline.named_steps['pca']
explained_variance = pca.explained_variance_ratio_
components = pca.components_

# Extract and display explained variance and component shapes
explained_variance, components.shape




(array([0.18422701, 0.12705194, 0.10982893, 0.09973822, 0.0975411 ,
        0.08835102, 0.0752432 , 0.04098031, 0.02442749, 0.01633778,
        0.01034956, 0.00827233, 0.00738766, 0.00662021, 0.00629966,
        0.0057298 , 0.00507721, 0.00449325, 0.00430624, 0.00402637,
        0.00397987, 0.00380907, 0.00363917, 0.00333544, 0.00301161,
        0.00256797, 0.00227439, 0.00193352]),
 (28, 161))

In [33]:
# Retrieve the feature names after transformation (both numerical and one-hot encoded categorical features)
transformed_feature_names = preprocessor.transformers_[0][1].named_steps['scaler'].get_feature_names_out(numerical_features).tolist() + \
                            preprocessor.transformers_[1][1].named_steps['onehot'].get_feature_names_out(categorical_features).tolist()

# Display the names of the transformed features to verify correctness
transformed_feature_names  # Show the first 10 feature names to check


['latitude',
 'longitude',
 'price',
 'minimum_nights',
 'number_of_reviews',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365',
 'neighbourhood_group_Arganzuela',
 'neighbourhood_group_Barajas',
 'neighbourhood_group_Carabanchel',
 'neighbourhood_group_Centro',
 'neighbourhood_group_Chamartín',
 'neighbourhood_group_Chamberí',
 'neighbourhood_group_Ciudad Lineal',
 'neighbourhood_group_Fuencarral - El Pardo',
 'neighbourhood_group_Hortaleza',
 'neighbourhood_group_Latina',
 'neighbourhood_group_Moncloa - Aravaca',
 'neighbourhood_group_Moratalaz',
 'neighbourhood_group_Puente de Vallecas',
 'neighbourhood_group_Retiro',
 'neighbourhood_group_Salamanca',
 'neighbourhood_group_San Blas - Canillejas',
 'neighbourhood_group_Tetuán',
 'neighbourhood_group_Usera',
 'neighbourhood_group_Vicálvaro',
 'neighbourhood_group_Villa de Vallecas',
 'neighbourhood_group_Villaverde',
 'neighbourhood_Abrantes',
 'neighbourhood_Acacias',
 'neighbourhood_Adelfas',
 'neighbourho

In [35]:
# Create a DataFrame of the PCA components with their corresponding feature names
pca_components_df = pd.DataFrame(components, columns=transformed_feature_names)

# Identify the top 5 influential features for the first three principal components
top_features_per_component = {}
for i in range(3):  # First three components
    top_features = pca_components_df.iloc[i].abs().nlargest(5).index.tolist()
    top_features_per_component[f'Component {i+1}'] = top_features

print("Madrid: Top 5 features per PCA component:\n")
top_features_per_component


Madrid: Top 5 features per PCA component:



{'Component 1': ['reviews_per_month',
  'number_of_reviews',
  'neighbourhood_group_Centro',
  'longitude',
  'latitude'],
 'Component 2': ['longitude',
  'latitude',
  'calculated_host_listings_count',
  'neighbourhood_group_Centro',
  'reviews_per_month'],
 'Component 3': ['calculated_host_listings_count',
  'availability_365',
  'latitude',
  'minimum_nights',
  'room_type_Private room']}