In [1]:
import pandas as pd

# Load the uploaded CSV file to inspect its contents
df = pd.read_csv('listings.csv')

#print(df.info())

# Temporarily display all columns
pd.set_option('display.max_columns', None)
# Now print the head of the DataFrame with all columns visible
#print(df.head())
# Reset back to the default settings after displaying the data (optional)
#pd.reset_option('display.max_columns')

#print(df.columns.tolist())
#print(len(df['amenities'].unique()))

# Split the 'amenities' column into individual amenities
df['amenities'] = df['amenities'].str.replace('[{}"]', '', regex=True)  # Clean up the amenities
df['amenities_list'] = df['amenities'].apply(lambda x: x.split(','))

# Flatten the list of lists into a single list of all amenities
all_amenities = [amenity.strip() for sublist in df['amenities_list'] for amenity in sublist]

# Get the number of unique amenities
unique_amenities = set(all_amenities)
print(f"Total number of unique amenities: {len(unique_amenities)}")


# Display the 10 most common amenities
#print(amenity_counts.most_common(10))


Total number of unique amenities: 5703


In [2]:
# Display only columns with missing data (out of 36807 data points)
missing_data = df.isnull().sum()
missing_data[missing_data > 0]

description                       953
neighborhood_overview           18467
picture_url                         1
host_location                    8260
host_about                      17323
host_response_time               6747
host_response_rate               6747
host_acceptance_rate             4349
host_is_superhost                2144
host_neighbourhood              13699
neighbourhood                   18467
neighbourhood_group_cleansed    36807
bathrooms                        2956
bathrooms_text                     31
bedrooms                          610
beds                             2955
price                            2956
calendar_updated                36807
has_availability                 1250
first_review                     6979
last_review                      6979
review_scores_rating             6979
review_scores_accuracy           6980
review_scores_cleanliness        6979
review_scores_checkin            6980
review_scores_communication      6979
review_score

In [3]:
df_imputated = pd.read_csv('listings.csv')

df_imputated.drop_duplicates()
df_imputated.drop(['license', 'calendar_updated', 'neighbourhood_group_cleansed'], axis=1, inplace=True)
df_imputated.drop(['id', 'listing_url', 'scrape_id', 'picture_url', 'host_id', 'host_url', 'host_thumbnail_url', 'host_picture_url'], axis=1, inplace=True)

In [4]:
# Extract all unique values from the 'bathrooms_text' column
unique_bathrooms_text = df['bathrooms_text'].unique()
print(unique_bathrooms_text)


['1 bath' '1.5 baths' '2 baths' '5.5 baths' '2.5 baths' '0 baths'
 '1 shared bath' '1 private bath' '6 baths' '1.5 shared baths' nan
 '3 baths' '2 shared baths' '2.5 shared baths' '4.5 baths'
 '6 shared baths' '5 baths' '4 baths' '3.5 baths' '4 shared baths'
 '0 shared baths' '3 shared baths' '3.5 shared baths' 'Shared half-bath'
 'Half-bath' '9 baths' '8 shared baths' '5.5 shared baths'
 'Private half-bath' '7 baths' '4.5 shared baths' '22 baths'
 '7 shared baths' '6.5 shared baths' '10 baths' '9.5 baths' '16 baths'
 '7.5 baths' '13 baths' '8 baths' '8.5 shared baths' '7.5 shared baths'
 '5 shared baths' '6.5 baths' '9 shared baths' '12.5 baths' '8.5 baths'
 '15 baths' '20 baths']


In [5]:
import numpy as np
import pandas as pd

# Function to extract bathroom numbers from the text
def extract_bathrooms(row):
    bathroom_text = row['bathrooms_text']
    bathroom = row['bathrooms']
    
    # If both bathrooms and bathrooms_text are NaN, return NaN
    if pd.isnull(bathroom) and pd.isnull(bathroom_text):
        return np.nan
    
    # If bathrooms_text contains 'half', return 0.5
    if isinstance(bathroom_text, str):
        bathroom_text = bathroom_text.lower()
        if 'half' in bathroom_text:
            return 0.5
        # Extract digits if they exist
        num = ''.join([ch for ch in bathroom_text if ch.isdigit() or ch == '.'])
        return float(num) if num else np.nan
    
    # Otherwise, return the existing value of bathrooms
    return bathroom

# Function to classify bathrooms into 'private', 'shared', 'no bathroom', or NaN
def classify_bathroom(row):
    bathrooms_text = row['bathrooms_text']
    bathrooms = row['bathrooms']
    
    # If bathrooms is NaN, return NaN for category as well
    if pd.isnull(bathrooms):
        return 'no bathroom'
    
    # Convert bathrooms_text to lowercase for comparison
    if isinstance(bathrooms_text, str):
        text = bathrooms_text.lower()
        
        if bathrooms == 0:  # If bathrooms is 0, it indicates 'no bathroom'
            return 'no bathroom'
        elif 'shared' in text:  # If 'shared' is mentioned in text
            return 'shared'
        elif 'private' in text:  # If 'private' is mentioned in text
            return 'private'
    
    # Default to 'private' if not explicitly mentioned
    return 'private'

df_imputated['bathrooms'] = df_imputated.apply(extract_bathrooms, axis=1)

df_imputated['bathroom_category'] = df_imputated.apply(classify_bathroom, axis=1)

pd.set_option('display.max_rows', None)
print(df_imputated[['bathrooms_text', 'bathrooms', 'bathroom_category']].iloc[0: 100])
print(df[['bathrooms_text', 'bathrooms']].iloc[0: 100])
#pd.reset_option('display.max_rows')

      bathrooms_text  bathrooms bathroom_category
0             1 bath        1.0           private
1             1 bath        1.0           private
2             1 bath        1.0           private
3             1 bath        1.0           private
4          1.5 baths        1.5           private
5             1 bath        1.0           private
6             1 bath        1.0           private
7             1 bath        1.0           private
8             1 bath        1.0           private
9             1 bath        1.0           private
10            1 bath        1.0           private
11         1.5 baths        1.5           private
12            1 bath        1.0           private
13            1 bath        1.0           private
14            1 bath        1.0           private
15            1 bath        1.0           private
16           2 baths        2.0           private
17         1.5 baths        1.5           private
18         1.5 baths        1.5           private


In [6]:
# Fill missing values for categorical variables with 'missing' or similar labels
df_imputated['description'].fillna('No description', inplace=True)
df_imputated['host_about'].fillna('No host info', inplace=True)
df_imputated['host_location'].fillna('No host location', inplace=True)
df_imputated['bathrooms_text'].fillna('No bathroom text', inplace=True)
df_imputated['neighborhood_overview'].fillna('No neighborhood description', inplace=True)

# Fill missing review-related data with 0 (indicating no reviews)
df_imputated['review_scores_rating'].fillna(0, inplace=True)
df_imputated['review_scores_accuracy'].fillna(0, inplace=True)
df_imputated['review_scores_cleanliness'].fillna(0, inplace=True)
df_imputated['review_scores_checkin'].fillna(0, inplace=True)
df_imputated['review_scores_communication'].fillna(0, inplace=True)
df_imputated['review_scores_location'].fillna(0, inplace=True)
df_imputated['review_scores_value'].fillna(0, inplace=True)
df_imputated['reviews_per_month'].fillna(0, inplace=True)
# Set 'first_review' and 'last_review' as NaT (Not a Timestamp) to indicate no reviews
df_imputated['first_review'].fillna(0, inplace=True)
df_imputated['last_review'].fillna(0, inplace=True)

df_imputated.to_csv('listings_imputated.csv', index=False)

In [7]:
import pandas as pd

# Example DataFrame (replace this with your actual DataFrame loading)
# df = pd.read_csv('your_data.csv')

# 1. Fill 'host_response_time' with 'unknown'
df_imputated['host_response_time'].fillna('unknown', inplace=True)

# 2. Fill 'host_response_rate' with 0
df_imputated['host_response_rate'].fillna(0, inplace=True)

# 3. Fill 'host_acceptance_rate' with 0
df_imputated['host_acceptance_rate'].fillna(0, inplace=True)

# 4. Fill 'host_is_superhost' with False
df_imputated['host_is_superhost'].fillna('f', inplace=True)

# 5. Fill 'host_neighbourhood' with 'unknown'
df_imputated['host_neighbourhood'].fillna('unknown', inplace=True)

# 6. Fill 'neighbourhood' with 'unknown'
df_imputated['neighbourhood'].fillna('unknown', inplace=True)

# Convert 'price' to numeric (removing currency symbols and commas)
df['price'] = pd.to_numeric(df['price'].replace({'\$': '', ',': ''}, regex=True))
df_imputated['price'].fillna(df['price'].median(), inplace=True)

# 11. Fill 'has_availability' with False
df_imputated['has_availability'].fillna(False, inplace=True)

# Display the updated DataFrame
print(df_imputated.head())  # Check the first few rows to ensure everything worked as expected


  last_scraped           source  \
0   2024-08-28      city scrape   
1   2024-08-27      city scrape   
2   2024-08-28  previous scrape   
3   2024-08-27      city scrape   
4   2024-08-28      city scrape   

                                               name  \
0                Amazing Luxurious Apt-Palermo Soho   
1      RELAX IN HAPPY HOUSE - PALERMO, BUENOS AIRES   
2                             ROOM WITH RIVER SIGHT   
3                         DUPLEX LOFT 2 - SAN TELMO   
4  PENTHOUSE /Terrace & pool /City views /2bedrooms   

                                         description  \
0  LUXURIOUS 1 BDRM APT- POOL/ GYM/ SPA/ 24-HR SE...   
1  Beautiful cozy apartment in excellent location...   
2                                     No description   
3                                     No description   
4                                     No description   

                               neighborhood_overview      host_name  \
0  AREA: PALERMO SOHO<br /><br />Minutes walking .

# Seperate Test and Train Dataset

In [8]:
from sklearn.model_selection import train_test_split

# Split the data into train and test sets (80% train, 20% test)
train_set, test_set = train_test_split(df_imputated, test_size=0.2, random_state=42)
train_set = train_set.copy()
test_set = test_set.copy()

# Check the size of the splits
print("Training set size:", len(train_set))
print("Test set size:", len(test_set))


Training set size: 29445
Test set size: 7362


In [9]:
# 2. Calculate the median values from the training set
median_bathrooms_train = train_set['bathrooms'].median()
median_bedrooms_train = train_set['bedrooms'].median()
median_beds_train = train_set['beds'].median()

# 3. Fill missing values in the training set using the medians from the training set
train_set['bathrooms'].fillna(median_bathrooms_train, inplace=True)
train_set['bedrooms'].fillna(median_bedrooms_train, inplace=True)
train_set['beds'].fillna(median_beds_train, inplace=True)

# 4. Fill missing values in the test set using the same medians from the training set
test_set['bathrooms'].fillna(median_bathrooms_train, inplace=True)
test_set['bedrooms'].fillna(median_bedrooms_train, inplace=True)
test_set['beds'].fillna(median_beds_train, inplace=True)

In [10]:
missing_data_train_set = train_set.isnull().sum()
print(f'missing_data_train_set: {missing_data_train_set[missing_data_train_set > 0]}')

missing_data_test_set = test_set.isnull().sum()
print(f'missing_data_test_set: {missing_data_test_set[missing_data_test_set > 0]}')

missing_data_train_set: Series([], dtype: int64)
missing_data_test_set: Series([], dtype: int64)


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
import gender_guesser.detector as gender


# 2. Process columns like `instant_bookable`, and `has_availability` 
# which have binary values 't'/'f' and convert them to 1/0
binary_cols = ['instant_bookable']
train_set[binary_cols] = train_set[binary_cols].replace({'t': 1, 'f': 0})
test_set[binary_cols] = test_set[binary_cols].replace({'t': 1, 'f': 0})

# 5. Convert boolean-like categorical data to numeric (e.g., `host_identity_verified`)
boolean_cols = ['host_identity_verified', 'host_has_profile_pic']
train_set[boolean_cols] = train_set[boolean_cols].replace({'t': 1, 'f': 0})
test_set[boolean_cols] = test_set[boolean_cols].replace({'t': 1, 'f': 0})

# Remove '%' and convert to float
def convert_percentage(column):
    return column.apply(lambda x: float(str(x).rstrip('%')) / 100 if '%' in str(x) else float(x))

# 7. Convert price from string to numeric by removing currency symbols
train_set['price'] = train_set['price'].replace('[\$,]', '', regex=True).astype(float)
test_set['price'] = test_set['price'].replace('[\$,]', '', regex=True).astype(float)

# Apply the function to both train and test sets
train_set['host_response_rate'] = convert_percentage(train_set['host_response_rate'])
train_set['host_acceptance_rate'] = convert_percentage(train_set['host_acceptance_rate'])

test_set['host_response_rate'] = convert_percentage(test_set['host_response_rate'])
test_set['host_acceptance_rate'] = convert_percentage(test_set['host_acceptance_rate'])

# Convert date columns to datetime format
train_set['last_scraped'] = pd.to_datetime(train_set['last_scraped'])
train_set['host_since'] = pd.to_datetime(train_set['host_since'])

test_set['last_scraped'] = pd.to_datetime(test_set['last_scraped'])
test_set['host_since'] = pd.to_datetime(test_set['host_since'])

# Feature: Length of time the host has been active (in days)
train_set['host_active_days'] = (train_set['last_scraped'] - train_set['host_since']).dt.days
test_set['host_active_days'] = (test_set['last_scraped'] - test_set['host_since']).dt.days

# Optionally drop the original date columns if they are no longer needed
train_set.drop(['last_scraped', 'host_since'], axis=1, inplace=True)
test_set.drop(['last_scraped', 'host_since'], axis=1, inplace=True)

# Convert binary columns from 't'/'f' to 1/0
binary_columns = ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'has_availability']

train_set[binary_columns] = train_set[binary_columns].applymap(lambda x: 1 if x == 't' else 0)
test_set[binary_columns] = test_set[binary_columns].applymap(lambda x: 1 if x == 't' else 0)

# 2. One-Hot Encoding for 'source' on the training set
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
train_source_encoded = encoder.fit_transform(train_set[['source']])

# Apply the same transformation to the test set
test_source_encoded = encoder.transform(test_set[['source']])

# Convert the encoded results to DataFrames for better readability
train_encoded_df = pd.DataFrame(train_source_encoded, columns=encoder.get_feature_names(['source']))
test_encoded_df = pd.DataFrame(test_source_encoded, columns=encoder.get_feature_names(['source']))

# Concatenate the original training and test DataFrames with the One-Hot Encoded DataFrames
train_set = pd.concat([train_set.reset_index(drop=True), train_encoded_df], axis=1)
test_set = pd.concat([test_set.reset_index(drop=True), test_encoded_df], axis=1)

# Drop the original 'source' and 'host_response_time' columns since they've been encoded
train_set.drop(['source'], axis=1, inplace=True)
test_set.drop(['source'], axis=1, inplace=True)

# Define a mapping for host_response_time categories to numerical values
host_response_time_mapping = {
    'within an hour': 1.0,
    'within a few hours': 2.0,
    'within a day': 3.0,
    'a few days or more': 4.0,
    'unknown': 5.0  # Optionally, for cases where the response time is unknown/ probably never responded
}

# Apply the mapping to the 'host_response_time' column in train and test sets
train_set['host_response_time_numeric'] = train_set['host_response_time'].map(host_response_time_mapping)
test_set['host_response_time_numeric'] = test_set['host_response_time'].map(host_response_time_mapping)

# Drop the original 'host_response_time' column since it's been encoded
train_set.drop('host_response_time', axis=1, inplace=True)
test_set.drop('host_response_time', axis=1, inplace=True)

# 3. Extract useful features from 'host_location'
# For simplicity, we'll just extract the country/region
train_set['host_country'] = train_set['host_location'].apply(lambda x: x.split(',')[-1].strip() if isinstance(x, str) else 'Unknown')
test_set['host_country'] = test_set['host_location'].apply(lambda x: x.split(',')[-1].strip() if isinstance(x, str) else 'Unknown')

# One-Hot Encode 'host_country'
train_country_encoded = encoder.fit_transform(train_set[['host_country']])
test_country_encoded = encoder.transform(test_set[['host_country']])

train_country_df = pd.DataFrame(train_country_encoded, columns=encoder.get_feature_names(['host_country']))
test_country_df = pd.DataFrame(test_country_encoded, columns=encoder.get_feature_names(['host_country']))

# Add 'host_country' One-Hot Encoded columns back to the training and test sets
train_set = pd.concat([train_set.reset_index(drop=True), train_country_df], axis=1)
test_set = pd.concat([test_set.reset_index(drop=True), test_country_df], axis=1)

# Drop original 'host_location' and 'host_country' columns
train_set.drop(['host_location', 'host_country'], axis=1, inplace=True)
test_set.drop(['host_location', 'host_country'], axis=1, inplace=True)

# 4. Create 'host_about_length' feature based on text length
train_set['host_about_length'] = train_set['host_about'].apply(lambda x: len(str(x)) if pd.notnull(x) else 0)
test_set['host_about_length'] = test_set['host_about'].apply(lambda x: len(str(x)) if pd.notnull(x) else 0)

# Drop 'host_about' as we have the length now
train_set.drop('host_about', axis=1, inplace=True)
test_set.drop('host_about', axis=1, inplace=True)

# 5. Extract text length from 'name' and 'description'
train_set['name_length'] = train_set['name'].apply(lambda x: len(str(x)))
test_set['name_length'] = test_set['name'].apply(lambda x: len(str(x)))

train_set['description_length'] = train_set['description'].apply(lambda x: len(str(x)))
test_set['description_length'] = test_set['description'].apply(lambda x: len(str(x)))

# Drop the original 'name' and 'description'
train_set.drop(['name', 'description'], axis=1, inplace=True)
test_set.drop(['name', 'description'], axis=1, inplace=True)

# 6. Extract text length from 'neighborhood_overview'
train_set['neighborhood_overview_length'] = train_set['neighborhood_overview'].apply(lambda x: len(str(x)) if pd.notnull(x) else 0)
test_set['neighborhood_overview_length'] = test_set['neighborhood_overview'].apply(lambda x: len(str(x)) if pd.notnull(x) else 0)

# Drop 'neighborhood_overview'
train_set.drop('neighborhood_overview', axis=1, inplace=True)
test_set.drop('neighborhood_overview', axis=1, inplace=True)

# 7. Infer gender from 'host_name'
detector = gender.Detector()

def map_gender(value):
    """Map gender categories into simplified versions."""
    gender_value = detector.get_gender(value.split()[0]) if isinstance(value, str) else 'unknown'
    if gender_value in ['andy', 'unknown']:
        return 'unknown'
    elif gender_value == 'mostly_female':
        return 'female'
    elif gender_value == 'mostly_male':
        return 'male'
    return gender_value

train_set['host_gender'] = train_set['host_name'].apply(map_gender)
test_set['host_gender'] = test_set['host_name'].apply(map_gender)

# One-Hot Encode 'host_gender'
train_gender_encoded = encoder.fit_transform(train_set[['host_gender']])
test_gender_encoded = encoder.transform(test_set[['host_gender']])

train_gender_df = pd.DataFrame(train_gender_encoded, columns=encoder.get_feature_names(['host_gender']))
test_gender_df = pd.DataFrame(test_gender_encoded, columns=encoder.get_feature_names(['host_gender']))

# Add 'host_gender' encoded data back to the training and test sets
train_set = pd.concat([train_set.reset_index(drop=True), train_gender_df], axis=1)
test_set = pd.concat([test_set.reset_index(drop=True), test_gender_df], axis=1)

# Drop 'host_name' and 'host_gender'
train_set.drop(['host_name', 'host_gender'], axis=1, inplace=True)
test_set.drop(['host_name', 'host_gender'], axis=1, inplace=True)

# 6. One-Hot Encoding for 'host_verifications'
def one_hot_encode_verifications(data):
    # Explode 'host_verifications' so that each verification type is in a separate row
    data_exploded = data.explode('host_verifications')
    
    # Perform One-Hot Encoding on the exploded 'host_verifications' column
    one_hot_verifications = pd.get_dummies(data_exploded['host_verifications'], prefix='verification').astype('int64')
    
    # Group by index to return the data to its original shape with each verification type as a separate column
    return one_hot_verifications.groupby(data_exploded.index).max()

# Apply One-Hot Encoding to both train and test sets for 'host_verifications'
train_verifications_encoded = one_hot_encode_verifications(train_set)
test_verifications_encoded = one_hot_encode_verifications(test_set)

# Concatenate the One-Hot Encoded verifications back into the original training and test sets
train_set = pd.concat([train_set, train_verifications_encoded], axis=1)
test_set = pd.concat([test_set, test_verifications_encoded], axis=1)

# Drop the original 'host_verifications' column
train_set.drop(['host_verifications'], axis=1, inplace=True)
test_set.drop(['host_verifications'], axis=1, inplace=True)

# Drop any columns representing empty verifications (e.g., verification_[])
train_set = train_set.loc[:, ~train_set.columns.str.contains("verification_\\[\\]")]  # Filter out columns named "verification_[]"
test_set = test_set.loc[:, ~test_set.columns.str.contains("verification_\\[\\]")]



# Handle 'neighbourhood_cleansed' with One-Hot Encoding
train_neighbourhood_encoded = pd.get_dummies(train_set['neighbourhood_cleansed'], prefix='neighbourhood_cleansed').astype('int64')
test_neighbourhood_encoded = pd.get_dummies(test_set['neighbourhood_cleansed'], prefix='neighbourhood_cleansed').astype('int64')
# Ensure test set has the same columns as the train set by reindexing (filling missing columns with 0)
test_neighbourhood_encoded = test_neighbourhood_encoded.reindex(columns=train_neighbourhood_encoded.columns, fill_value=0)

# Step 1: Identify the categories in the test set that were not seen in the training set
unseen_neighbourhoods = set(test_set['neighbourhood_cleansed']) - set(train_set['neighbourhood_cleansed'])

# Step 2: Add an "Other" column to the test set to account for unseen categories
test_set['neighbourhood_cleansed_Other'] = test_set['neighbourhood_cleansed'].apply(lambda x: 1 if x in unseen_neighbourhoods else 0)

# Step 3: Add the "Other" column to the train set (even though it will contain only zeros)
train_set['neighbourhood_cleansed_Other'] = 0  # No unseen categories in the train set

# Step 4: Concatenate the "Other" column to the one-hot encoded data
train_neighbourhood_encoded['neighbourhood_cleansed_Other'] = train_set['neighbourhood_cleansed_Other']
test_neighbourhood_encoded['neighbourhood_cleansed_Other'] = test_set['neighbourhood_cleansed_Other']

# Concatenate the encoded columns back to the original datasets
train_set = pd.concat([train_set.reset_index(drop=True), train_neighbourhood_encoded], axis=1)
test_set = pd.concat([test_set.reset_index(drop=True), test_neighbourhood_encoded], axis=1)

# Drop the original 'neighbourhood_cleansed' columns
train_set.drop(['neighbourhood_cleansed'], axis=1, inplace=True)
test_set.drop(['neighbourhood_cleansed'], axis=1, inplace=True)


# One-Hot Encoding for 'property_type'
train_property_encoded = pd.get_dummies(train_set['property_type'], prefix='property_type').astype('int64')
test_property_encoded = pd.get_dummies(test_set['property_type'], prefix='property_type').astype('int64')

# One-Hot Encoding for 'room_type'
train_room_encoded = pd.get_dummies(train_set['room_type'], prefix='room_type').astype('int64')
test_room_encoded = pd.get_dummies(test_set['room_type'], prefix='room_type').astype('int64')

# One-Hot Encoding for 'bathroom_category'
train_bathroom_category_encoded = pd.get_dummies(train_set['bathroom_category'], prefix='bathroom_category').astype('int64')
test_bathroom_category_encoded = pd.get_dummies(test_set['bathroom_category'], prefix='bathroom_category').astype('int64')

# Step 1: Extract unique amenities from the training set only
def extract_unique_amenities(data):
    # Use regex=True to avoid FutureWarning
    return data['amenities'].str.replace('[{}]', '', regex=True).str.split(',').apply(
        lambda x: [amen.strip() for amen in x if amen != '']
    ).explode()

train_amenities = extract_unique_amenities(train_set)

# Step 2: Identify the top 200 most common amenities based on the training set only
most_common_amenities = train_amenities.value_counts().head(100).index.tolist()

# Step 3: Function to encode only the top 200 amenities
def encode_amenities(df, top_amenities):
    # Create a list to hold new columns
    new_columns = []
    for amenity in top_amenities:
        column_name = f'amenity_{amenity}'
        # Create a new column for the current amenity
        new_columns.append(df['amenities'].apply(lambda x: 1 if amenity in x else 0).rename(column_name))
    
    # Use pd.concat to add all new columns at once
    df = pd.concat([df] + new_columns, axis=1)
    return df

# Apply the encoding to the training set
train_set = encode_amenities(train_set, most_common_amenities)

# Apply the same encoding to the test set without modifying the amenities in the test set
test_set = encode_amenities(test_set, most_common_amenities)

from datetime import datetime
# Convert 'calendar_last_scraped', 'first_review', 'last_review' to datetime
train_set['calendar_last_scraped'] = pd.to_datetime(train_set['calendar_last_scraped'])
test_set['calendar_last_scraped'] = pd.to_datetime(test_set['calendar_last_scraped'])
train_set['first_review'] = pd.to_datetime(train_set['first_review'], errors='coerce')
train_set['last_review'] = pd.to_datetime(train_set['last_review'], errors='coerce')
test_set['first_review'] = pd.to_datetime(test_set['first_review'], errors='coerce')
test_set['last_review'] = pd.to_datetime(test_set['last_review'], errors='coerce')

# Current date to calculate days since last events
current_date = datetime.now()

# Create new columns for the days since various events (all at once)
train_date_features = pd.DataFrame({
    'days_since_scraped': (current_date - train_set['calendar_last_scraped']).dt.days,
    'days_since_first_review': (train_set['calendar_last_scraped'] - train_set['first_review']).dt.days,
    'days_since_last_review': (train_set['calendar_last_scraped'] - train_set['last_review']).dt.days
})

test_date_features = pd.DataFrame({
    'days_since_scraped': (current_date - test_set['calendar_last_scraped']).dt.days,
    'days_since_first_review': (test_set['calendar_last_scraped'] - test_set['first_review']).dt.days,
    'days_since_last_review': (test_set['calendar_last_scraped'] - test_set['last_review']).dt.days
})

# Concatenate the new columns with the train and test sets
train_set = pd.concat([train_set.reset_index(drop=True), train_date_features], axis=1)
test_set = pd.concat([test_set.reset_index(drop=True), test_date_features], axis=1)

# Drop the original date columns if they are no longer needed
train_set.drop(['calendar_last_scraped', 'first_review', 'last_review'], axis=1, inplace=True)
test_set.drop(['calendar_last_scraped', 'first_review', 'last_review'], axis=1, inplace=True)


# Drop the original 'property_type', 'room_type', 'amenities' and bathrooms_text  columns
train_set.drop(['property_type', 'room_type', 'amenities', 'bathrooms_text', 'bathroom_category', 'host_neighbourhood', 'neighbourhood'], axis=1, inplace=True)
test_set.drop(['property_type', 'room_type', 'amenities', 'bathrooms_text', 'bathroom_category', 'host_neighbourhood', 'neighbourhood'], axis=1, inplace=True)

In [12]:
pd.set_option('display.max_columns', None)
# Display columns that are not of type int64 or float64
non_numeric_columns = train_set.select_dtypes(exclude=['int64', 'float64']).columns

# Show the column names and their corresponding data types
print(f'dtypes: {train_set[non_numeric_columns].dtypes}')
print(train_set.columns.tolist())
print(train_set.head())

missing_data_train_set = train_set.isnull().sum()
print(f'missing_data_train_set: {missing_data_train_set[missing_data_train_set > 0]}')
missing_data_test_set = test_set.isnull().sum()
print(f'missing_data_test_set: {missing_data_test_set[missing_data_test_set > 0]}')
# Final processed train_set and test_set
#print("Processed Training Set:")
#print(train_set.head())

#print("\nProcessed Test Set:")
#print(test_set.head())

dtypes: Series([], dtype: object)
['host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_listings_count', 'host_total_listings_count', 'host_has_profile_pic', 'host_identity_verified', 'latitude', 'longitude', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'price', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'has_availability', 'availability_30', 'availability_60', 'availability_90', 'availability_365', 'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'instant_bookable', 'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms', 'calculated_host_li

   host_response_rate  host_acceptance_rate  host_is_superhost  \
0                 1.0                  1.00                  1   
1                 0.0                  0.89                  0   
2                 1.0                  1.00                  0   
3                 0.0                  1.00                  1   
4                 1.0                  0.98                  0   

   host_listings_count  host_total_listings_count  host_has_profile_pic  \
0                    1                          1                     0   
1                    3                          4                     0   
2                   22                         22                     0   
3                    1                          1                     0   
4                   21                         23                     0   

   host_identity_verified   latitude  longitude  accommodates  bathrooms  \
0                       0 -34.590712 -58.395248             3        1.5   


In [13]:
# Compute correlation matrix
corr_matrix = train_set.corr()

# Sort correlations with the target variable (price)
corr_target = corr_matrix['price'].sort_values(ascending=False)
print(corr_target)


price                                              1.000000
neighbourhood_cleansed_Villa Luro                  0.043258
days_since_first_review                            0.025148
days_since_last_review                             0.024967
neighbourhood_cleansed_San Telmo                   0.024452
amenity_"Crib"                                     0.024054
amenity_"Building staff"                           0.023746
amenity_"Dryer"                                    0.022723
accommodates                                       0.022697
bathrooms                                          0.021097
verification_['email', 'phone', 'work_email']      0.018683
host_response_time_numeric                         0.018364
availability_30                                    0.017678
bedrooms                                           0.017262
availability_60                                    0.016656
amenity_"Exterior security cameras on property"    0.016493
beds                                    

In [14]:
splt.boxplot

# Function to cap outliers using IQR (based on training set)
def cap_outliers(df, column, lower_bound, upper_bound):
    df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])
    df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])
    return df

# Apply outlier capping for important numerical columns using train set bounds
numerical_columns = ['price', 'bathrooms', 'bedrooms', 'beds']  # Add more relevant columns as needed

# Calculate IQR for each column from the training set and apply the bounds
for col in numerical_columns:
    Q1 = train_set[col].quantile(0.25)
    Q3 = train_set[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Apply capping to both train and test sets using the bounds from the training set
    train_set = cap_outliers(train_set, col, lower_bound, upper_bound)
    test_set = cap_outliers(test_set, col, lower_bound, upper_bound)

# Verify the result
print(train_set[numerical_columns].describe())

NameError: name 'splt' is not defined

In [None]:
from sklearn.preprocessing import StandardScaler

# Instantiate the StandardScaler
scaler = StandardScaler()

# List of numerical columns to scale (excluding target 'price' for scaling)
num_cols_to_scale = train_set.select_dtypes(include=['float64', 'int64']).columns.tolist()
num_cols_to_scale.remove('price')  # We don't want to scale the target variable

# Apply scaling to the training set
train_set[num_cols_to_scale] = scaler.fit_transform(train_set[num_cols_to_scale])

# Apply the same transformation to the test set
test_set[num_cols_to_scale] = scaler.transform(test_set[num_cols_to_scale])

# Verify the scaling
print(train_set[num_cols_to_scale].head())


In [None]:
corr_matrix = train_set.corr()
print(corr_matrix['price'].sort_values(ascending=False))  # Correlation of all features with 'price'


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import matplotlib.pyplot as plt

# Assuming train_set and test_set have already been prepared

# 1. Separate features and target
X_train = train_set.drop('price', axis=1)
y_train = train_set['price']

X_test = test_set.drop('price', axis=1)
y_test = test_set['price']

# 2. Instantiate and train the linear regression model
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# 3. Make predictions on the test set
y_pred = lin_reg.predict(X_test)

# 4. Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(lin_reg.coef_)
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R² Score: {r2}")

# 5. Plot the residuals
residuals = y_test - y_pred
plt.scatter(y_test, residuals)
plt.xlabel('Actual Price')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

# Create DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Define XGBoost parameters
params = {
    'objective': 'reg:squarederror',  # Use squared error for regression
    'max_depth': 6,                   # Max depth of each tree
    'eta': 0.1,                       # Learning rate
    'subsample': 0.8,                 # Use 80% of data for training each tree
    'colsample_bytree': 0.8,          # Use 80% of features for each tree
    'eval_metric': 'rmse',            # Root Mean Squared Error as evaluation metric
    'seed': 42                        # Set a random seed for reproducibility
}

# Train the XGBoost model
xgb_model = xgb.train(params, dtrain, num_boost_round=100)

# Make predictions on the test set
y_pred = xgb_model.predict(dtest)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R² Score: {r2}")


In [None]:
import matplotlib.pyplot as plt

residuals = y_test - y_pred
plt.scatter(y_test, residuals)
plt.xlabel('Actual Price')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()
