In [None]:
import pandas as pd

# Load the raw listings export; later inspection cells read from `df`.
df = pd.read_csv('listings.csv')

#print(df.info())

# Show every column whenever a DataFrame is printed. The matching reset below is
# commented out, so this setting stays active for the following cells.
pd.set_option('display.max_columns', None)
#print(df.head())
#pd.reset_option('display.max_columns')

#print(df.columns.tolist())
#print(len(df['amenities'].unique()))

# Remove the braces and double quotes around each amenities value, leaving a
# comma-separated string.
# NOTE(review): square brackets are not stripped here — confirm the export wraps
# the amenities in {...} and not in [...].
df['amenities'] = df['amenities'].str.replace('[{}"]', '', regex=True)

# One list of raw tokens per listing; a missing amenities value becomes an empty
# list instead of raising on `.split`.
df['amenities_list'] = df['amenities'].apply(
    lambda text: text.split(',') if isinstance(text, str) else []
)

# Flatten to a single list of trimmed names. Blank tokens are skipped so that a
# listing with no amenities does not add '' as a spurious "amenity".
all_amenities = [
    amenity.strip()
    for sublist in df['amenities_list']
    for amenity in sublist
    if amenity.strip()
]

# Get the number of unique amenities
unique_amenities = set(all_amenities)
print(f"Total number of unique amenities: {len(unique_amenities)}")


# Display the 10 most common amenities
#print(amenity_counts.most_common(10))


In [None]:
# Count missing values per column and show only the columns that have any
# (out of 36807 data points)
missing_data = df.isna().sum()
missing_data.loc[missing_data.gt(0)]

In [None]:
# Start the cleaning pipeline from a fresh read so the exploratory `df` is left as is.
df_imputated = pd.read_csv('listings.csv')

# drop_duplicates() returns a new frame; without the assignment the call has no effect.
df_imputated = df_imputated.drop_duplicates()

# Columns excluded from the working frame before imputation.
df_imputated = df_imputated.drop(
    columns=[
        'license', 'calendar_updated', 'neighbourhood_group_cleansed',
        'id', 'listing_url', 'scrape_id', 'picture_url',
        'host_id', 'host_url', 'host_thumbnail_url', 'host_picture_url',
    ]
)

In [None]:
# Collect and print each distinct value found in the 'bathrooms_text' column
unique_bathrooms_text = df.loc[:, 'bathrooms_text'].unique()
print(unique_bathrooms_text)


In [None]:
# Summary statistics of the main listing attributes in the raw frame.
print(
    df.loc[:, [
        'latitude', 'longitude', 'accommodates', 'bathrooms', 'bedrooms',
        'beds', 'price', 'number_of_reviews', 'review_scores_rating',
    ]].describe()
)


In [None]:
import numpy as np
import pandas as pd

# Function to extract bathroom numbers from the text
def extract_bathrooms(row):
    """Return the bathroom count for one listing row.

    Reads ``row['bathrooms_text']`` and ``row['bathrooms']``.

    Returns:
        * NaN when both fields are missing;
        * 0.5 when the text (case-insensitive) contains 'half';
        * the first number found in the text, or NaN when the text holds no number;
        * the existing ``row['bathrooms']`` value when the text is not a string.
    """
    import re  # local import: only this helper needs it

    bathroom_text = row['bathrooms_text']
    bathroom = row['bathrooms']

    # Nothing to work with at all.
    if pd.isnull(bathroom) and pd.isnull(bathroom_text):
        return np.nan

    if isinstance(bathroom_text, str):
        bathroom_text = bathroom_text.lower()
        if 'half' in bathroom_text:
            return 0.5
        # Take the first number only. Joining every digit and dot in the text
        # breaks on inputs such as '1.5 baths.' (-> '1.5.') or '2 baths, 1 shared' (-> '21').
        match = re.search(r'\d*\.?\d+', bathroom_text)
        return float(match.group()) if match else np.nan

    # Otherwise, return the existing value of bathrooms
    return bathroom

# Function to classify bathrooms into 'private', 'shared', 'no bathroom', or NaN
def classify_bathroom(row):
    """Return the bathroom category for one listing row.

    Reads ``row['bathrooms_text']`` and ``row['bathrooms']``.

    Returns:
        * NaN when the bathroom count is missing (the category is unknown);
        * 'no bathroom' when the count is 0, whatever the text says;
        * 'shared' / 'private' when the text (case-insensitive) contains that word,
          checked in that order;
        * 'private' as the default otherwise.
    """
    bathrooms_text = row['bathrooms_text']
    bathrooms = row['bathrooms']

    # A missing count is reported as NaN: it is unknown, not "no bathroom".
    if pd.isnull(bathrooms):
        return np.nan

    # Checked before looking at the text so it also applies when the text is missing.
    if bathrooms == 0:
        return 'no bathroom'

    if isinstance(bathrooms_text, str):
        text = bathrooms_text.lower()
        if 'shared' in text:
            return 'shared'
        if 'private' in text:
            return 'private'

    # Default to 'private' if not explicitly mentioned
    return 'private'

# Derive the numeric count first: classify_bathroom reads the updated 'bathrooms' column.
df_imputated['bathrooms'] = df_imputated.apply(extract_bathrooms, axis='columns')
df_imputated['bathroom_category'] = df_imputated.apply(classify_bathroom, axis='columns')

# Print the first 100 rows of the derived columns next to the raw ones for a visual check.
pd.set_option('display.max_rows', None)
print(df_imputated.loc[:, ['bathrooms_text', 'bathrooms', 'bathroom_category']].head(100))
print(df.loc[:, ['bathrooms_text', 'bathrooms']].head(100))
#pd.reset_option('display.max_rows')

In [None]:
# Summary statistics of the same attributes after the bathroom extraction.
print(
    df_imputated.loc[:, [
        'latitude', 'longitude', 'accommodates', 'bathrooms', 'bedrooms',
        'beds', 'price', 'number_of_reviews', 'review_scores_rating',
    ]].describe()
)


In [None]:
# Fill missing values on the frame itself. The previous form,
# `df_imputated[col].fillna(..., inplace=True)`, operates on a selected column
# object and does not update the frame under pandas Copy-on-Write.
df_imputated = df_imputated.fillna(
    {
        # Placeholder labels for missing free-text / categorical fields
        'description': 'No description',
        'host_about': 'No host info',
        'host_location': 'No host location',
        'bathrooms_text': 'No bathroom text',
        'neighborhood_overview': 'No neighborhood description',
        # Missing review-related data is set to 0 (indicating no reviews)
        'review_scores_rating': 0,
        'review_scores_accuracy': 0,
        'review_scores_cleanliness': 0,
        'review_scores_checkin': 0,
        'review_scores_communication': 0,
        'review_scores_location': 0,
        'review_scores_value': 0,
        'reviews_per_month': 0,
    }
)

# 'first_review' and 'last_review' are intentionally left missing to indicate
# "no reviews"; filling them with the integer 0 would mix numbers into date columns.

df_imputated.to_csv('listings_imputated.csv', index=False)

In [None]:
import pandas as pd

# Example DataFrame (replace this with your actual DataFrame loading)
# df = pd.read_csv('your_data.csv')

# Fill host / neighbourhood / availability gaps on the frame itself rather than
# through `df_imputated[col].fillna(..., inplace=True)`, which does not update the
# frame under pandas Copy-on-Write.
df_imputated = df_imputated.fillna(
    {
        'host_response_time': 'unknown',
        'host_response_rate': 0,
        'host_acceptance_rate': 0,
        'host_is_superhost': 'f',
        'host_neighbourhood': 'unknown',
        'neighbourhood': 'unknown',
        # NOTE(review): 'host_is_superhost' is filled with the string 'f' while this
        # column gets the boolean False — confirm which encoding the column uses.
        'has_availability': False,
    }
)

# Convert 'price' in the raw frame to numeric (removing currency symbols and commas).
# Raw string: '\$' inside a normal literal is an invalid escape sequence.
df['price'] = pd.to_numeric(df['price'].replace({r'\$': '', ',': ''}, regex=True))

# Missing prices receive the median of the raw frame.
# NOTE(review): this median is taken over all rows before the train/test split, and
# 'price' is later used as the prediction target — consider dropping rows without a
# price instead of imputing them.
df_imputated['price'] = df_imputated['price'].fillna(df['price'].median())

# Display the updated DataFrame
print(df_imputated.head())  # Check the first few rows to ensure everything worked as expected


# Separate Test and Train Dataset

In [None]:
from sklearn.model_selection import train_test_split

# 80% train / 20% test with a fixed seed; each part is copied so that it is an
# independent frame.
train_set, test_set = (
    part.copy()
    for part in train_test_split(df_imputated, test_size=0.2, random_state=42)
)

# Report how many rows ended up in each split
print("Training set size:", train_set.shape[0])
print("Test set size:", test_set.shape[0])


In [None]:
# Summary statistics of the training split before median imputation.
print(
    train_set.loc[:, [
        'latitude', 'longitude', 'accommodates', 'bathrooms', 'bedrooms',
        'beds', 'price', 'number_of_reviews', 'review_scores_rating',
    ]].describe()
)


In [None]:
# 2. Calculate the median values from the training set only
median_bathrooms_train = train_set['bathrooms'].median()
median_bedrooms_train = train_set['bedrooms'].median()
median_beds_train = train_set['beds'].median()

# 3./4. Fill both splits with the training medians. The fill is applied to the
# frames themselves: `frame[col].fillna(..., inplace=True)` does not update the
# frame under pandas Copy-on-Write.
train_medians = {
    'bathrooms': median_bathrooms_train,
    'bedrooms': median_bedrooms_train,
    'beds': median_beds_train,
}
train_set = train_set.fillna(train_medians)
test_set = test_set.fillna(train_medians)

In [None]:
# Summary statistics of the training split after median imputation.
print(
    train_set.loc[:, [
        'latitude', 'longitude', 'accommodates', 'bathrooms', 'bedrooms',
        'beds', 'price', 'number_of_reviews', 'review_scores_rating',
    ]].describe()
)


In [None]:
# Columns that still contain missing values, reported separately for each split.
missing_data_train_set = train_set.isna().sum()
print(f'missing_data_train_set: {missing_data_train_set.loc[missing_data_train_set.gt(0)]}')

missing_data_test_set = test_set.isna().sum()
print(f'missing_data_test_set: {missing_data_test_set.loc[missing_data_test_set.gt(0)]}')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
import gender_guesser.detector as gender



# Remove '%' and convert to float
def convert_percentage(column):
    """Convert each element of ``column`` to a float.

    Elements whose string form contains '%' have trailing '%' characters removed
    and are divided by 100; every other element is passed to ``float`` unchanged.
    Returns the result of ``column.apply``.
    """
    def _to_fraction(value):
        text = str(value)
        if '%' in text:
            return float(text.rstrip('%')) / 100
        return float(value)

    return column.apply(_to_fraction)

# Convert price from string to numeric by removing the currency symbol and commas.
# Raw string: '\$' inside a normal literal is an invalid escape sequence.
train_set['price'] = train_set['price'].replace(r'[\$,]', '', regex=True).astype(float)
test_set['price'] = test_set['price'].replace(r'[\$,]', '', regex=True).astype(float)

# One-hot encodings of the categorical columns, built before those columns are dropped.
# NOTE(review): these encoded frames are not joined back onto train_set/test_set in
# the cells shown here, and train/test are encoded independently, so their column
# sets can differ when a category is absent from one split.
train_property_encoded = pd.get_dummies(train_set['property_type'], prefix='property_type').astype('int64')
test_property_encoded = pd.get_dummies(test_set['property_type'], prefix='property_type').astype('int64')

train_room_encoded = pd.get_dummies(train_set['room_type'], prefix='room_type').astype('int64')
test_room_encoded = pd.get_dummies(test_set['room_type'], prefix='room_type').astype('int64')

train_bathroom_category_encoded = pd.get_dummies(train_set['bathroom_category'], prefix='bathroom_category').astype('int64')
test_bathroom_category_encoded = pd.get_dummies(test_set['bathroom_category'], prefix='bathroom_category').astype('int64')

# Every column removed from the modelling frames, listed once so that the train and
# test splits always lose exactly the same columns.
columns_to_drop = [
    # dates
    'last_scraped', 'host_since', 'calendar_last_scraped', 'first_review', 'last_review',
    # free text and descriptive fields
    'source', 'host_location', 'host_about', 'name', 'description',
    'neighborhood_overview', 'host_name', 'host_verifications', 'neighbourhood_cleansed',
    # host attributes
    'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
    'host_has_profile_pic', 'host_identity_verified', 'has_availability', 'instant_bookable',
    'host_listings_count', 'host_total_listings_count',
    # stay-length limits and availability
    'minimum_nights', 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights',
    'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm',
    'maximum_nights_avg_ntm', 'availability_30', 'availability_60', 'availability_90',
    'availability_365',
    # review details
    'number_of_reviews_ltm', 'number_of_reviews_l30d', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication',
    'review_scores_location', 'review_scores_value', 'reviews_per_month',
    # host listing counts
    'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms',
    # categorical / text columns encoded or parsed above
    'property_type', 'room_type', 'amenities', 'bathrooms_text', 'bathroom_category',
    'host_neighbourhood', 'neighbourhood',
]

train_set = train_set.drop(columns=columns_to_drop)
test_set = test_set.drop(columns=columns_to_drop)

In [None]:
# Summary statistics of the training split after the column drops.
print(
    train_set.loc[:, [
        'latitude', 'longitude', 'accommodates', 'bathrooms', 'bedrooms',
        'beds', 'price', 'number_of_reviews', 'review_scores_rating',
    ]].describe()
)


In [None]:
pd.set_option('display.max_columns', None)

# Columns of the training split whose dtype is neither int64 nor float64
non_numeric_columns = train_set.select_dtypes(exclude=['int64', 'float64']).columns

# Print those dtypes, then the full column list and a preview of the frame
print(f'dtypes: {train_set.loc[:, non_numeric_columns].dtypes}')
print(list(train_set.columns))
print(train_set.head())

# Columns that still contain missing values, per split
missing_data_train_set = train_set.isna().sum()
print(f'missing_data_train_set: {missing_data_train_set.loc[missing_data_train_set.gt(0)]}')
missing_data_test_set = test_set.isna().sum()
print(f'missing_data_test_set: {missing_data_test_set.loc[missing_data_test_set.gt(0)]}')
# Final processed train_set and test_set
#print("Processed Training Set:")
#print(train_set.head())

#print("\nProcessed Test Set:")
#print(test_set.head())

In [None]:
# Summary statistics, followed by how often each bathroom and bedroom count occurs.
print(
    train_set.loc[:, [
        'latitude', 'longitude', 'accommodates', 'bathrooms', 'bedrooms',
        'beds', 'price', 'number_of_reviews', 'review_scores_rating',
    ]].describe()
)
print('Unique bathrooms count:\n', train_set.loc[:, 'bathrooms'].value_counts())
print('Unique bedrooms count:\n', train_set.loc[:, 'bedrooms'].value_counts())

import seaborn as sns
import matplotlib.pyplot as plt

# Columns that each get their own boxplot
columns = [
    'latitude', 'longitude', 'accommodates', 'bathrooms', 'bedrooms',
    'beds', 'price', 'number_of_reviews', 'review_scores_rating',
]

# One figure per column, drawn on an explicit Axes
for col in columns:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(x=train_set[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    plt.show()


In [None]:
# Function to cap outliers manually for specific columns
def manual_cap_outliers(df, column, upper_limit):
    """Replace every value of ``df[column]`` above ``upper_limit`` with ``upper_limit``.

    The column is overwritten on ``df`` itself and the same ``df`` is returned.
    """
    current = df[column]
    exceeds_limit = current > upper_limit
    df[column] = np.where(exceeds_limit, upper_limit, current)
    return df

# Hand-picked upper limits: bathrooms are capped at 6 and bedrooms at 10.
# The same limits are applied to the training and the test split.
for capped_column, ceiling in (('bathrooms', 6), ('bedrooms', 10)):
    train_set = manual_cap_outliers(train_set, capped_column, ceiling)
    test_set = manual_cap_outliers(test_set, capped_column, ceiling)

# Verify the result
print(train_set.loc[:, ['bathrooms', 'bedrooms']].describe())

# Function to cap outliers using IQR (based on training set)
def cap_outliers(df, column, lower_bound, upper_bound):
    """Limit ``df[column]`` to the range [lower_bound, upper_bound].

    Values below ``lower_bound`` are raised to it first, then values above
    ``upper_bound`` are lowered to it. The column is overwritten on ``df`` itself
    and the same ``df`` is returned.
    """
    floored = np.where(df[column] < lower_bound, lower_bound, df[column])
    df[column] = np.where(floored > upper_bound, upper_bound, floored)
    return df

# Columns that receive IQR-based capping, with bounds taken from the training set
numerical_columns = ['price', 'beds']  # Add more relevant columns as needed

# Iterate over `numerical_columns`. The loop previously ran over `columns`, the
# boxplot list defined in another cell, which also capped latitude, longitude,
# review scores and the manually capped bathrooms/bedrooms.
# NOTE(review): 'price' is later used as the prediction target; capping it in the
# test split changes the values the models are evaluated against.
for col in numerical_columns:
    Q1 = train_set[col].quantile(0.25)
    Q3 = train_set[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Apply capping to both train and test sets using the bounds from the training set
    train_set = cap_outliers(train_set, col, lower_bound, upper_bound)
    test_set = cap_outliers(test_set, col, lower_bound, upper_bound)

# Verify the result
print(train_set[numerical_columns].describe())

In [None]:
from sklearn.preprocessing import StandardScaler

# Continuous features that are standardised
continuous_cols = ['latitude', 'longitude', 'accommodates', 'beds', 'number_of_reviews', 'review_scores_rating']

# Fit the scaler on the training split only, then reuse the fitted scaler for the
# test split. (The fit_transform line used to be indented, which is an
# IndentationError at top level.)
scaler_cont = StandardScaler()
train_set[continuous_cols] = scaler_cont.fit_transform(train_set[continuous_cols])
test_set[continuous_cols] = scaler_cont.transform(test_set[continuous_cols])

# No scaling for bathrooms and bedrooms, use them as is
print(train_set[['bathrooms', 'bedrooms']].head())


In [None]:
# Pairwise correlations on the training split, then each column's correlation
# with 'price' from highest to lowest.
corr_matrix = train_set.corr()
print(corr_matrix.loc[:, 'price'].sort_values(ascending=False))


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import matplotlib.pyplot as plt

# Assuming train_set and test_set have already been prepared

# 1. Features are every column except the target 'price'
X_train, y_train = train_set.drop(columns='price'), train_set['price']
X_test, y_test = test_set.drop(columns='price'), test_set['price']

# 2. Fit an ordinary linear regression on the training split
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# 3. Predict prices for the test split
y_pred = lin_reg.predict(X_test)

# 4. Error metrics on the test split
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(lin_reg.coef_)
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R² Score: {r2}")

# 5. Residuals (actual minus predicted) against the actual price
residuals = y_test - y_pred
fig, ax = plt.subplots()
ax.scatter(y_test, residuals)
ax.set_xlabel('Actual Price')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
plt.show()

# 6. Actual vs predicted price; the dashed red diagonal marks perfect predictions
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.5)
price_range = [y_test.min(), y_test.max()]
ax.plot(price_range, price_range, 'r--', lw=2)
ax.set_xlabel('Actual Price')
ax.set_ylabel('Predicted Price')
ax.set_title('Actual vs Predicted Price')
plt.show()


In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

# Wrap both splits in DMatrix objects for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# XGBoost parameters
params = dict(
    objective='reg:squarederror',  # squared-error regression objective
    max_depth=6,                   # max depth of each tree
    eta=0.1,                       # learning rate
    subsample=0.8,                 # row subsampling ratio per tree
    colsample_bytree=0.8,          # feature subsampling ratio per tree
    eval_metric='rmse',            # root mean squared error as evaluation metric
    seed=42,                       # random seed for reproducibility
)

# Train for 100 boosting rounds
xgb_model = xgb.train(params, dtrain, num_boost_round=100)

# Predict prices for the test split
y_pred = xgb_model.predict(dtest)

# Error metrics on the test split
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R² Score: {r2}")


# Actual vs predicted price; the dashed red diagonal marks perfect predictions
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.5)
price_range = [y_test.min(), y_test.max()]
ax.plot(price_range, price_range, 'r--', lw=2)
ax.set_xlabel('Actual Price')
ax.set_ylabel('Predicted Price')
ax.set_title('Actual vs Predicted Price')
plt.show()

import matplotlib.pyplot as plt

# Residuals (actual minus predicted) of the XGBoost model against the actual price
residuals = y_test - y_pred
fig, ax = plt.subplots()
ax.scatter(y_test, residuals)
ax.set_xlabel('Actual Price')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
plt.show()
