In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [4]:
df_host = pd.read_csv('host_data.csv')
print(df_host.columns)

df_loc = pd.read_csv('location_data.csv')

df_pric = pd.read_csv('pricing_data.csv')

df_prop = pd.read_csv('property_data.csv')

df_rev = pd.read_csv('review_data.csv')

Index(['customer_id', 'host_since', 'host_response_rate',
       'host_has_profile_pic', 'host_identity_verified'],
      dtype='object')


In [10]:
# Merge df_host and df_loc
merge1 = pd.merge(df_host, df_loc, on='customer_id')

# Merge the result with df_pric
merge2 = pd.merge(merge1, df_pric, on='customer_id')

# Merge the result with df_prop
merge3 = pd.merge(merge2, df_prop, on='customer_id')

# Merge the result with df_rev
df_merged = pd.merge(merge3, df_rev, on='customer_id', how='right')  # Use 'right' join as originally intended

In [12]:
#Info on all columns of the merged dataset
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110466 entries, 0 to 110465
Data columns (total 23 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   customer_id             110466 non-null  object 
 1   host_since              110083 non-null  object 
 2   host_response_rate      83463 non-null   object 
 3   host_has_profile_pic    110083 non-null  object 
 4   host_identity_verified  110083 non-null  object 
 5   neighbourhood           95456 non-null   object 
 6   latitude                110466 non-null  float64
 7   longitude               110466 non-null  float64
 8   city                    110466 non-null  object 
 9   zipcode                 109032 non-null  object 
 10  log_price               110466 non-null  float64
 11  cleaning_fee            110466 non-null  bool   
 12  cancellation_policy     110466 non-null  object 
 13  property_type           110466 non-null  object 
 14  room_type           

In [None]:
CUSTOMER ID

In [14]:
df_host.columns

Index(['customer_id', 'host_since', 'host_response_rate',
       'host_has_profile_pic', 'host_identity_verified'],
      dtype='object')

In [16]:
# Check for missing values in the customer_id column
missing_customer_id = df_host['customer_id'].isnull().sum()

# Display the result
print(f"Number of missing values in 'customer_id': {missing_customer_id}")

Number of missing values in 'customer_id': 0


In [26]:
# Total number of rows in the original dataset
total_rows = len(df_host)
print(f"Total rows in the original dataset: {total_rows}")

# Check for duplicate rows
duplicate_rows = df_host.duplicated().sum()
print(f"Number of duplicate rows in the dataset: {duplicate_rows}")

Total rows in the original dataset: 69546
Number of duplicate rows in the dataset: 1364


In [28]:
print(df_host[df_host.duplicated(keep=False)].nunique())

customer_id               1364
host_since                1007
host_response_rate          43
host_has_profile_pic         2
host_identity_verified       2
dtype: int64


In [30]:
# Identify all exact duplicate rows
exact_duplicates = df_host[df_host.duplicated(keep=False)]

# Check if all columns are identical in these rows
are_exact_duplicates = (exact_duplicates.nunique() == 1).all()

# Display the result
print("Are all duplicates exact duplicates?:", are_exact_duplicates)

Are all duplicates exact duplicates?: False


In [32]:
# Step 1: Find all exact duplicates (including all columns)
exact_duplicates = df_host[df_host.duplicated(keep=False)]

# Step 2: Count exact duplicate rows
exact_duplicates_count = exact_duplicates.shape[0]

# Step 3: Drop duplicates and find the unique rows
unique_rows = df_host.drop_duplicates(keep=False)

# Step 4: Display results
print(f"Number of exact duplicate rows: {exact_duplicates_count}")
print(f"Number of completely unique rows: {unique_rows.shape[0]}")

Number of exact duplicate rows: 2728
Number of completely unique rows: 66818


In [None]:
host_since, host_year/month/day/duration

In [36]:
#convert to datetime object
df_host['host_since'] = pd.to_datetime(df_host['host_since'], errors='coerce')

#Add Feature for experience of host.
df_host['host_duration'] = (pd.Timestamp.now() - df_host['host_since']).dt.days

df_host['host_year'] = df_host['host_since'].dt.year
df_host['host_month'] = df_host['host_since'].dt.month
df_host['host_day'] = df_host['host_since'].dt.day

In [38]:
# Check for missing values in the relevant columns
missing_values = df_host[['host_since', 'host_year', 'host_month', 'host_day', 'host_duration']].isnull().sum()

# Display the result
print("Missing values in each column:")
print(missing_values)

Missing values in each column:
host_since       173
host_year        173
host_month       173
host_day         173
host_duration    173
dtype: int64


In [40]:
# Filter rows where host_since, host_year, host_month, host_day, and host_duration are all missing
missing_rows = df_host[
    df_host[['host_since', 'host_year', 'host_month', 'host_day', 'host_duration']].isnull().all(axis=1)
]

# Display the rows
print("Rows where all the columns are missing:")
print(missing_rows)

Rows where all the columns are missing:
                                             customer_id host_since  \
471    n1pcV5tqZnJdiP07RrdvGGWf9jmCTFOsI9qXP36Dm8e4Tj...        NaT   
1340   Jq8GcSZEBy10PDl6cQalRyAC8z9A5cRV9jkGuwZ1OZMzjm...        NaT   
1426   57D1gD5LQC6P8CQFzPRUySBgQYd1Vor3lRaf7MWM6FjBqo...        NaT   
2205   msazL1DAKw4IPDC46wxWLBGLak8WyHjvZtgdt7LQ4Ph3uJ...        NaT   
3152   l0RFQ3elJmwn93xpFf0HSjGkLRgjKkiyNo54pwhhoj0jYr...        NaT   
...                                                  ...        ...   
67920  ndUAGXG2P0DLID8gGKEpl29YEW89gmvGSv43E0dcirqh96...        NaT   
68354  JaTW5rNRIUrkCiBxeegNU8mokA1JUNoVd22qlQV3AydySe...        NaT   
68932  B4vaxeueFJEeJreMMRk9xpxaGkxPvqgnl32yWckpZ5fz3m...        NaT   
68933  DYE37JTStXxxqQDJ6JXizKfNjFPNEXDjKLMfIP4ijMgMsa...        NaT   
69349  HGcCot3i7ZRJ5AY4lpkykyXcqTVVZqB48X4H53TOSBC55E...        NaT   

      host_response_rate host_has_profile_pic host_identity_verified  \
471                  NaN           

In [42]:
# Check other columns for patterns in rows with missing host_since
missing_rows = df_host[df_host['host_since'].isnull()]
print(missing_rows.describe())

##Probably these hosts have not verified their profile nor updated their picture. 
##Validate hypothesis by checking for other patterns with listings or reviews for example.

      host_since  host_duration  host_year  host_month  host_day
count          0            0.0        0.0         0.0       0.0
mean         NaT            NaN        NaN         NaN       NaN
min          NaT            NaN        NaN         NaN       NaN
25%          NaT            NaN        NaN         NaN       NaN
50%          NaT            NaN        NaN         NaN       NaN
75%          NaT            NaN        NaN         NaN       NaN
max          NaT            NaN        NaN         NaN       NaN
std          NaN            NaN        NaN         NaN       NaN


In [44]:
df_host = df_host.dropna(subset=['host_since'])
print(f"Remaining rows: {len(df_host)}")

Remaining rows: 69373


In [None]:
host_response_rate

In [54]:
# Step 1: Ensure the column is a string before stripping '%'
df_host.loc[:, 'host_response_rate'] = df_host['host_response_rate'].astype(str).str.rstrip('%')

# Step 2: Remove '%' and convert to numeric
df_host.loc[:, 'host_response_rate'] = pd.to_numeric(df_host['host_response_rate'], errors='coerce')

# Step 3: Check for missing values
missing_response_rate = df_host['host_response_rate'].isnull().sum()
print(f"Missing values in 'host_response_rate': {missing_response_rate}")

# Step 4: Impute missing values with median
median_response_rate = df_host['host_response_rate'].median()
df_host['host_response_rate'].fillna(median_response_rate, inplace=True)

# Step 5: Verify cleaning
print(df_host['host_response_rate'].describe())

Missing values in 'host_response_rate': 0
count     69373.0
unique       79.0
top         100.0
freq      57608.0
Name: host_response_rate, dtype: float64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_host['host_response_rate'].fillna(median_response_rate, inplace=True)


In [48]:
# Check total number of rows in df_host
total_rows = len(df_host)
print(f"Total rows in the dataset: {total_rows}")

# Check the percentage of missing values in 'host_response_rate'
missing_percentage = (17021 / total_rows) * 100
print(f"Percentage of missing values in 'host_response_rate': {missing_percentage:.2f}%")

Total rows in the dataset: 69373
Percentage of missing values in 'host_response_rate': 24.54%


In [None]:
host_has_profile_pic

In [56]:
# Step 1: Check for missing values
missing_profile_pic = df_host['host_has_profile_pic'].isnull().sum()
print(f"Missing values in 'host_has_profile_pic': {missing_profile_pic}")

# Step 2: Verify binary encoding
print("Unique values in 'host_has_profile_pic':")
print(df_host['host_has_profile_pic'].unique())

# Step 3: Analyze the distribution
print("Distribution of 'host_has_profile_pic':")
print(df_host['host_has_profile_pic'].value_counts())

# Impute missing value with the mode (most common value)
mode_profile_pic = df_host['host_has_profile_pic'].mode()[0]
df_host['host_has_profile_pic'].fillna(mode_profile_pic, inplace=True)

# Verify that there are no missing values left
print(df_host['host_has_profile_pic'].isnull().sum())

Missing values in 'host_has_profile_pic': 0
Unique values in 'host_has_profile_pic':
['t' 'f']
Distribution of 'host_has_profile_pic':
host_has_profile_pic
t    69150
f      223
Name: count, dtype: int64
0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_host['host_has_profile_pic'].fillna(mode_profile_pic, inplace=True)


In [60]:
#Change t/f to True and False
df_host['host_has_profile_pic'] = df_host['host_has_profile_pic'].map({'t': True, 'f': False}).astype(bool)

In [None]:
host_identity_verified

In [62]:
# Step 1: Check for missing values
missing_identity_verified = df_host['host_identity_verified'].isnull().sum()
print(f"Missing values in 'host_identity_verified': {missing_identity_verified}")

# Step 2: Verify binary encoding
print("Unique values in 'host_identity_verified':")
print(df_host['host_identity_verified'].unique())

# Step 3: Analyze the distribution
print("Distribution of 'host_identity_verified':")
print(df_host['host_identity_verified'].value_counts())

# Step 4: Impute the missing value with the mode (most common value)
mode_identity_verified = df_host['host_identity_verified'].mode()[0]
df_host['host_identity_verified'].fillna(mode_identity_verified, inplace=True)

# Step 5: Verify that there are no missing values left
print(f"Missing values in 'host_identity_verified': {df_host['host_identity_verified'].isnull().sum()}")

Missing values in 'host_identity_verified': 0
Unique values in 'host_identity_verified':
['t' 'f']
Distribution of 'host_identity_verified':
host_identity_verified
t    46676
f    22697
Name: count, dtype: int64
Missing values in 'host_identity_verified': 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_host['host_identity_verified'].fillna(mode_identity_verified, inplace=True)


In [68]:
#Change t/f to True and False
df_host['host_identity_verified'] = df_host['host_identity_verified'].map({'t': True, 'f': False}).astype(bool)

In [None]:
Summarize

In [70]:
print(df_host.info())
print(df_host.describe())

<class 'pandas.core.frame.DataFrame'>
Index: 69373 entries, 0 to 69545
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   customer_id             69373 non-null  object        
 1   host_since              69373 non-null  datetime64[ns]
 2   host_response_rate      69373 non-null  object        
 3   host_has_profile_pic    69373 non-null  bool          
 4   host_identity_verified  69373 non-null  bool          
 5   host_duration           69373 non-null  float64       
 6   host_year               69373 non-null  float64       
 7   host_month              69373 non-null  float64       
 8   host_day                69373 non-null  float64       
dtypes: bool(2), datetime64[ns](1), float64(4), object(2)
memory usage: 4.4+ MB
None
                          host_since  host_duration     host_year  \
count                          69373   69373.000000  69373.000000   
mean   2015-09