### PCA
Principal Component Analysis (PCA) is a dimensionality reduction technique that is widely used in machine learning. It is used to transform a dataset into a new coordinate system such that the greatest variance lies along the first coordinate, the second greatest variance along the second coordinate, and so on. The new coordinates are called principal components. PCA is an unsupervised technique, meaning it does not require labels for the data. It is used to reduce the number of features in a dataset while retaining as much information as possible. This is useful for reducing the computational complexity of a model and for visualizing high-dimensional data.

In [27]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

# Read the Airbnb listings for Singapore, New York City and Madrid.
# The three frames are unpacked in the same order as the paths below.
singapore_df, ny_df, madrid_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'datasets/singapore_listings.csv',
        'datasets/newyorkcity_listings.csv',
        'datasets/madrid_listings.csv',
    )
)

# Columns not directly related to the accommodation itself could be dropped here;
# the drop is commented out, so every column is kept at this point.
# data_pca = data.drop(['id', 'name', 'host_id', 'host_name', 'last_review'], axis=1)

# Summarise the columns, non-null counts and dtypes of the Singapore listings.
singapore_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7907 entries, 0 to 7906
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              7907 non-null   int64  
 1   name                            7905 non-null   object 
 2   host_id                         7907 non-null   int64  
 3   host_name                       7907 non-null   object 
 4   neighbourhood_group             7907 non-null   object 
 5   neighbourhood                   7907 non-null   object 
 6   latitude                        7907 non-null   float64
 7   longitude                       7907 non-null   float64
 8   room_type                       7907 non-null   object 
 9   price                           7907 non-null   int64  
 10  minimum_nights                  7907 non-null   int64  
 11  number_of_reviews               7907 non-null   int64  
 12  last_review                     51

In [28]:
# Split the Singapore columns by dtype: object columns vs. int64/float64 columns.
# NOTE(review): only the Singapore frame is inspected here; the Madrid and
# New York City frames are assumed to share the same schema -- confirm.
categorical_cols = singapore_df.select_dtypes(include='object').columns
numerical_cols = singapore_df.select_dtypes(include=['int64', 'float64']).columns

(categorical_cols, numerical_cols)

(Index(['name', 'host_name', 'neighbourhood_group', 'neighbourhood',
        'room_type', 'last_review'],
       dtype='object'),
 Index(['id', 'host_id', 'latitude', 'longitude', 'price', 'minimum_nights',
        'number_of_reviews', 'reviews_per_month',
        'calculated_host_listings_count', 'availability_365'],
       dtype='object'))

In [29]:
# Numerical columns that are imputed and standardised before PCA.
numerical_features = ['latitude', 'longitude', 'price', 'minimum_nights', 'number_of_reviews',
                      'reviews_per_month', 'calculated_host_listings_count', 'availability_365']

# Only these categorical columns are one-hot encoded: also encoding
# name, host_name and last_review caused a MemoryError.
categorical_features = ['neighbourhood_group', 'neighbourhood', 'room_type']

# Numeric branch: fill missing values with the column median, then scale to
# zero mean / unit variance.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name, so `sparse=False` raises TypeError on current
# releases. Try the new keyword first and fall back for older installations.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:  # scikit-learn < 1.2
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical branch: replace missing values with the literal 'missing', then
# one-hot encode into a dense array; categories unseen at fit time are ignored.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', onehot_encoder)])

# Route each column group through its own branch. Columns not listed in
# either feature list are not passed to a branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

preprocessor

In [30]:
# Function to get top features from PCA components
def get_top_features(dataset, n_components_to_report=3, n_top_features=5, explained_variance=0.95):
    """Fit preprocessing + PCA on ``dataset`` and list the heaviest features per component.

    The module-level ``preprocessor`` is placed in a pipeline together with a
    PCA step and the pipeline is fitted on ``dataset``. Because the pipeline
    holds the ``preprocessor`` object itself, that object is (re)fitted as a
    side effect of every call.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the columns named in ``numerical_features`` and
        ``categorical_features``.
    n_components_to_report : int, default 3
        Maximum number of leading principal components to report. If PCA
        keeps fewer components than this, only the available ones are
        returned instead of raising ``IndexError``.
    n_top_features : int, default 5
        Number of features, ranked by absolute loading, listed per component.
    explained_variance : float, default 0.95
        Value passed to ``PCA(n_components=...)``.

    Returns
    -------
    dict[str, list[str]]
        Maps ``'Component 1'``, ``'Component 2'``, ... to the names of the
        transformed features with the largest absolute loadings.
    """
    pca_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=explained_variance)),
    ])
    pca_pipeline.fit(dataset)

    # Look the fitted steps up by name on the pipeline instead of by position
    # on the global, so reordering the ColumnTransformer branches cannot
    # silently mislabel the columns.
    fitted_preprocessor = pca_pipeline.named_steps['preprocessor']
    scaler = fitted_preprocessor.named_transformers_['num'].named_steps['scaler']
    onehot = fitted_preprocessor.named_transformers_['cat'].named_steps['onehot']

    # Column order of the transformed matrix: numeric branch first, then the
    # one-hot columns, matching the branch order in the ColumnTransformer.
    transformed_feature_names = (
        scaler.get_feature_names_out(numerical_features).tolist()
        + onehot.get_feature_names_out(categorical_features).tolist()
    )

    # One row per principal component, one column per transformed feature.
    pca_components_df = pd.DataFrame(
        pca_pipeline.named_steps['pca'].components_,
        columns=transformed_feature_names,
    )

    # PCA with a variance target may keep fewer components than requested.
    n_available = min(n_components_to_report, len(pca_components_df))

    top_features_per_component = {}
    for i in range(n_available):
        loadings = pca_components_df.iloc[i].abs()
        top_features_per_component[f'Component {i+1}'] = (
            loadings.nlargest(n_top_features).index.tolist()
        )

    return top_features_per_component

# Run the PCA feature extraction once per city, in the order
# Singapore, New York City, Madrid.
singapore_top_features, ny_top_features, madrid_top_features = map(
    get_top_features, (singapore_df, ny_df, madrid_df)
)


def get_most_frequent_features(top_features):
    """Return the distinct feature names of one dataset, most frequent first.

    ``top_features`` maps a component label to a list of feature names. All
    lists are pooled, and the distinct names are returned ordered by how many
    times they occur across the components (descending).
    """
    pooled_names = [
        feature_name
        for component_features in top_features.values()
        for feature_name in component_features
    ]

    # value_counts() sorts by count in descending order; its index holds the names.
    occurrence_counts = pd.Series(pooled_names).value_counts()
    return occurrence_counts.index.tolist()

# For each city: rank its PCA-selected features by how many components list
# them, then print a heading line followed by the ranked list.
singapore_most_frequent = get_most_frequent_features(singapore_top_features)
print("Singapore: Top 5 features per PCA component:", singapore_most_frequent, sep="\n")

ny_most_frequent = get_most_frequent_features(ny_top_features)
print("\nNew York City: Top 5 features per PCA component:", ny_most_frequent, sep="\n")

madrid_most_frequent = get_most_frequent_features(madrid_top_features)
print("\nMadrid: Top 5 features per PCA component:", madrid_most_frequent, sep="\n")

Singapore: Top 5 features per PCA component:
['availability_365', 'number_of_reviews', 'calculated_host_listings_count', 'minimum_nights', 'latitude', 'reviews_per_month', 'longitude', 'neighbourhood_group_Central Region', 'price']

New York City: Top 5 features per PCA component:
['availability_365', 'longitude', 'minimum_nights', 'latitude', 'reviews_per_month', 'number_of_reviews', 'calculated_host_listings_count', 'price', 'neighbourhood_group_Brooklyn', 'neighbourhood_group_Manhattan']

Madrid: Top 5 features per PCA component:
['latitude', 'reviews_per_month', 'neighbourhood_group_Centro', 'longitude', 'calculated_host_listings_count', 'number_of_reviews', 'availability_365', 'minimum_nights', 'room_type_Private room']


In [32]:
# Pool the three per-city rankings into one flat list.
all_top_features = singapore_most_frequent + ny_most_frequent + madrid_most_frequent

# Order the distinct feature names by how often they occur in the pooled list.
most_frequent_features = pd.Series(all_top_features).value_counts().index.tolist()

# Show the features that recur most often across the three datasets.
print("\nMost frequent features among all datasets:")
most_frequent_features


Most frequent features among all datasets:


['availability_365',
 'number_of_reviews',
 'calculated_host_listings_count',
 'minimum_nights',
 'latitude',
 'reviews_per_month',
 'longitude',
 'price',
 'neighbourhood_group_Central Region',
 'neighbourhood_group_Brooklyn',
 'neighbourhood_group_Manhattan',
 'neighbourhood_group_Centro',
 'room_type_Private room']