In [None]:
import pandas as pd

# Never truncate the column list when a DataFrame is rendered
pd.set_option('display.max_columns', None)

# Read the raw Barcelona listings export and preview its first rows
listings_path = '../AirBnb datasets/barcelona/listings.csv'
listings = pd.read_csv(listings_path)

listings.head(n=5)

In [None]:
# Identify columns where all values are the same.
# dropna=False makes NaN count as a value of its own: a column holding one distinct
# value plus missing entries is NOT constant, while an entirely empty column is.
distinct_counts = listings.nunique(dropna=False)
constant_columns = distinct_counts[distinct_counts == 1].index.tolist()

# Print the columns where all values are the same
print("Columns where all values are the same:", constant_columns)

In [None]:
# Drop the scrape-metadata columns (hard-coded list of columns noted as holding
# a single repeated value)
columns_to_remove = ['scrape_id', 'last_scraped', 'calendar_last_scraped']
listings = listings.drop(columns_to_remove, axis=1)

In [None]:
# Drop columns considered irrelevant or uninformative as features
columns_to_remove = [
    'listing_url', 'source', 'name', 'description', 'neighborhood_overview',
    'picture_url', 'host_id', 'host_url', 'host_name', 'host_about',
    'host_thumbnail_url', 'host_picture_url', 'license', 'calendar_updated',
]

listings = listings.drop(columns_to_remove, axis='columns')

In [None]:
# Convert the 't' / 'f' flag columns into binary (1 / 0) representations.
#
# Series.map is used instead of Series.replace: replace() on an object column only
# produced a numeric dtype through pandas' silent downcasting, which is deprecated
# (FutureWarning in pandas 2.2). Without that downcast the flags would stay `object`
# and be skipped by the later select_dtypes(include=['float64', 'int64']) step.
# Missing values stay NaN; any value other than 't' / 'f' also becomes NaN.
flag_to_binary = {'t': 1, 'f': 0}
flag_columns = ['host_is_superhost', 'instant_bookable', 'host_has_profile_pic',
                'host_identity_verified', 'has_availability']

for flag_column in flag_columns:
    # Skip columns that are already numeric so that re-running this cell is a no-op
    # instead of mapping the existing 1 / 0 values to NaN.
    if not pd.api.types.is_numeric_dtype(listings[flag_column]):
        listings[flag_column] = listings[flag_column].map(flag_to_binary)

In [None]:
# Price arrives as text such as "$1,234.00": strip the currency symbol and the
# thousands separator, then parse as float
price_text = listings['price'].str.replace('$', '', regex=False)
price_text = price_text.str.replace(',', '', regex=False)
listings['price'] = price_text.astype(float)

# Host rates arrive as percentage text such as "95%": strip the sign and parse as float
for rate_column in ('host_acceptance_rate', 'host_response_rate'):
    rate_text = listings[rate_column].str.replace('%', '', regex=False)
    listings[rate_column] = rate_text.astype(float)

Import the amenity features from the "amenities.csv" dataset

In [None]:
# Load the amenity feature table that replaces the raw 'amenities' column
amenities_dummies = pd.read_csv(filepath_or_buffer='amenities.csv')

In [None]:
# Swap the original 'amenities' column for the amenity feature columns;
# the two frames are placed side by side and aligned on their row index
listings_without_amenities = listings.drop(columns=['amenities'])
listings = pd.concat([listings_without_amenities, amenities_dummies], axis=1)

In [None]:
# Load the sentiment table (joined to the listings via its 'listing_id' column)
sentiment = pd.read_csv(filepath_or_buffer='sentiment.csv')

In [None]:
# Use string keys on both sides so the join compares like with like
listings['id'] = listings['id'].astype(str)
sentiment['listing_id'] = sentiment['listing_id'].astype(str)

# Left-join the sentiment columns onto the listings (every listing is kept),
# then discard the right-hand key, which only repeats 'id'
merged_data = (
    listings
    .merge(sentiment, how='left', left_on='id', right_on='listing_id')
    .drop(columns='listing_id')
)

merged_data

In [None]:
# Labels of the float64 / int64 columns: these are the ones imputed and scaled below
numeric_columns = merged_data.select_dtypes(include=['float64', 'int64']).columns

# Numeric-only slice of the merged data
numeric_data = merged_data.loc[:, numeric_columns]

Handling missing values with regression imputation

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

# Function to fill missing values using linear regression
def fill_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing values column by column using linear regression.

    For every column of ``df`` that has both observed and missing entries, a
    ``LinearRegression`` model is trained on the rows where the column is
    observed, using all remaining columns as predictors, and its predictions
    replace the missing entries. Columns are processed in order, so values
    imputed for an earlier column are used as predictors for later columns.

    Gaps in the predictor columns are mean-imputed with a ``SimpleImputer``
    that is fitted on the training rows of the column being processed and
    applied to both the training and the prediction rows.

    Args:
        df: Frame to impute. Columns are passed straight to scikit-learn
            estimators, so they are assumed to be numeric.

    Returns:
        A copy of ``df`` with the missing values filled; ``df`` itself is not
        modified. Columns without any observed value are left untouched. When
        no usable predictor exists for a column, its gaps are filled with the
        mean of its observed values instead.
    """
    df_filled = df.copy()

    for column in df.columns:
        missing_mask = df_filled[column].isnull()

        # Nothing to impute, or nothing to learn from (column entirely missing).
        if not missing_mask.any() or missing_mask.all():
            continue

        y_train = df_filled.loc[~missing_mask, column]
        predictors = df_filled.drop(columns=[column])

        if predictors.shape[1] == 0:
            # No other column to regress on: fall back to the observed mean.
            df_filled.loc[missing_mask, column] = y_train.mean()
            continue

        # Fit a fresh imputer for every target column and push BOTH matrices
        # through it. Previously the imputer only existed when the training rows
        # had gaps, so gaps confined to the prediction rows hit an undefined (or
        # stale, fitted for another column) imputer. Transforming both sides also
        # keeps them in the same representation with the same set of columns.
        imputer = SimpleImputer(strategy='mean')
        X_train = imputer.fit_transform(predictors[~missing_mask])
        X_test = imputer.transform(predictors[missing_mask])

        if X_train.shape[1] == 0:
            # By default SimpleImputer discards predictors that have no observed
            # value in the training rows; if none is left, use the observed mean.
            df_filled.loc[missing_mask, column] = y_train.mean()
            continue

        # Train on the observed rows and predict the missing ones
        model = LinearRegression()
        model.fit(X_train, y_train)
        df_filled.loc[missing_mask, column] = model.predict(X_test)

    return df_filled


# Impute the gaps in the numeric features with the regression-based filler
numeric_data_filled = fill_missing_values(df=numeric_data)

In [None]:
# Display the imputed numeric data for inspection
numeric_data_filled

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the imputed numeric features: learn the scaling on the data,
# then apply it to the same data
scaler = StandardScaler().fit(numeric_data_filled)
numeric_data_scaled = scaler.transform(numeric_data_filled)

# The scaler returns a plain array; wrap it in a DataFrame with the original column labels
numeric_data_scaled_df = pd.DataFrame(data=numeric_data_scaled, columns=numeric_columns)

numeric_data_scaled_df

Dimensionality reduction with PCA (currently disabled: the code below is commented out)

In [None]:
# NOTE: the PCA step below is disabled (commented out). The rest of the notebook
# continues with the full set of scaled features in `numeric_data_scaled_df`.
#from sklearn.decomposition import PCA

# Apply PCA
#pca = PCA(n_components=0.95)
#numeric_data_reduced = pca.fit_transform(numeric_data_scaled_df)

# Convert PCA result back to DataFrame
#numeric_data_reduced_df = pd.DataFrame(numeric_data_reduced)

In [None]:
#numeric_data_reduced_df

In [None]:
# Everything that was not scaled (the non-numeric columns of the merged data)
non_numeric_data = merged_data.drop(columns=numeric_columns)

# Put the non-numeric columns next to the scaled numeric ones (both on a fresh
# 0..n-1 row index), then drop the 'id' key
listings = (
    pd.concat([non_numeric_data.reset_index(drop=True), numeric_data_scaled_df], axis=1)
    .drop(columns='id')
)

listings

In [None]:
# Persist the final dataset without writing the row index
listings.to_csv(path_or_buf='final_dataset.csv', index=False)