# Collaborative Filtering and Attribute Modelling

## 1. Prerequisites

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
listings = pd.read_csv('listings_metadata (1).csv')
user_analysis = pd.read_csv('user_41399220_analysis (1).csv')
listings.head()

Unnamed: 0,id,listing_id,listing_status,listing_user_id,superhost,price,currency,city,lat,lng,...,Unnamed: 213,Unnamed: 214,Unnamed: 215,Unnamed: 216,Unnamed: 217,Unnamed: 218,Unnamed: 219,Unnamed: 220,Unnamed: 221,Unnamed: 222
0,397371,35873806,,269825819,True,50,,Scottsville,37.94958,-78.4652,...,,,,,,,,,,
1,397538,39962641,,150543717,False,200,,Charlottesville,37.99846,-78.48698,...,,,,,,,,,,
2,397705,17027835,,50937853,False,388,,Charlottesville,37.97931,-78.42316,...,,,,,,,,,,
3,397764,15581613,,41399220,True,967,,Charlottesville,37.99289,-78.43639,...,,,,,,,,,,
4,4163522,5997063,,703086,True,141,,Stanardsville,38.26894,-78.52328,...,,,,,,,,,,


In [8]:
user_analysis.head()

Unnamed: 0,listing_id,m_revenue,status_amenities,score_amenities,score_star_rating,score_guest_favorite,score_review_count,score_listing_title,score_listing_description,score_review_summary,...,property_manager_count,property_manager_cat,lat,lng,bedrooms,l_revenue,user_listings,features_to_review,listing_age,extracted_review_count
0,24076293,16997.7,SUCCESS,A,A,B,A,B,B,A,...,1,Individual,38.04735,-78.50238,5,0,False,[],1628,14
1,48088408,16997.7,SUCCESS,A,,B,C,B,B,A,...,4,Small,38.04851,-78.495,5,32930,False,['score_review_count'],1300,2
2,53544297,16997.7,SUCCESS,A,A,A,A,B,B,A,...,3,Small,38.04159,-78.49038,5,7936,False,[],733,45
3,45073863,16997.7,SUCCESS,A,A,A,A,B,B,A,...,2,Small,38.02811,-78.51718,5,17413,False,[],1420,122
4,887637685970978714,16997.7,SUCCESS,A,A,A,A,B,B,A,...,3,Small,38.01961,-78.5156,5,12699,False,[],479,74


## 2. Data Cleaning and Preprocessing 

In [11]:
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1209 entries, 0 to 1208
Columns: 223 entries, id to Unnamed: 222
dtypes: float64(10), object(213)
memory usage: 2.1+ MB


In [13]:
user_analysis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1129 entries, 0 to 1128
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   listing_id                 1129 non-null   int64  
 1   m_revenue                  1129 non-null   float64
 2   status_amenities           1129 non-null   object 
 3   score_amenities            1129 non-null   object 
 4   score_star_rating          1073 non-null   object 
 5   score_guest_favorite       1129 non-null   object 
 6   score_review_count         1129 non-null   object 
 7   score_listing_title        1129 non-null   object 
 8   score_listing_description  1129 non-null   object 
 9   score_review_summary       1129 non-null   object 
 10  score_image_score          1129 non-null   object 
 11  property_manager_count     1129 non-null   int64  
 12  property_manager_cat       1129 non-null   object 
 13  lat                        1129 non-null   float

In [22]:

# Select the attributes that are relevant 
list_select = listings[['id','listing_id', 'superhost', 'price', 'city', 'lat', 'lng']]

# merging datasets on listing_id - using correct variable name 'list_select'
merged_df = user_analysis.merge(list_select, on='listing_id', how='left')

# dropping the unnamed columns, if any
merged_df = merged.loc[:, ~merged_df.columns.str.contains('^Unnamed')]

# Display cleaned dataset info
merged_df.info()
merged_df.head()

ValueError: You are trying to merge on int64 and object columns. If you wish to proceed you should use pd.concat

In [24]:
# Converting listing_id to string in both datasets for consistency
listings["listing_id"] = listings["listing_id"].astype(str)
user_analysis["listing_id"] = user_analysis["listing_id"].astype(str)

# Selecting relevant columns from listings dataset
listings_selected = listings[['id', 'listing_id', 'superhost', 'price', 'city', 'lat', 'lng']]

# Merging datasets on listing_id
merged_df = user_analysis.merge(listings_selected, on='listing_id', how='left')

# Dropping unnamed columns if any
merged_df = merged_df.loc[:, ~merged_df.columns.str.contains('^Unnamed')]

# Display cleaned dataset info
merged_df.info(), merged_df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1129 entries, 0 to 1128
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   listing_id                 1129 non-null   object 
 1   m_revenue                  1129 non-null   float64
 2   status_amenities           1129 non-null   object 
 3   score_amenities            1129 non-null   object 
 4   score_star_rating          1073 non-null   object 
 5   score_guest_favorite       1129 non-null   object 
 6   score_review_count         1129 non-null   object 
 7   score_listing_title        1129 non-null   object 
 8   score_listing_description  1129 non-null   object 
 9   score_review_summary       1129 non-null   object 
 10  score_image_score          1129 non-null   object 
 11  property_manager_count     1129 non-null   int64  
 12  property_manager_cat       1129 non-null   object 
 13  lat_x                      1129 non-null   float

(None,
            listing_id  m_revenue status_amenities score_amenities  \
 0            24076293    16997.7          SUCCESS               A   
 1            48088408    16997.7          SUCCESS               A   
 2            53544297    16997.7          SUCCESS               A   
 3            45073863    16997.7          SUCCESS               A   
 4  887637685970978714    16997.7          SUCCESS               A   
 
   score_star_rating score_guest_favorite score_review_count  \
 0                 A                    B                  A   
 1               NaN                    B                  C   
 2                 A                    A                  A   
 3                 A                    A                  A   
 4                 A                    A                  A   
 
   score_listing_title score_listing_description score_review_summary  ...  \
 0                   B                         B                    A  ...   
 1                   B       

## 3. Collaborative Filtering

In [27]:
# Converting 'price' to numeric, forcing errors to NaN
merged_df['price'] = pd.to_numeric(merged_df['price'], errors='coerce')

# Converting 'lng_y' to numeric (fixing possible formatting issues)
merged_df['lng_y'] = pd.to_numeric(merged_df['lng_y'], errors='coerce')

# Verifying data type corrections
merged_df.dtypes[['price', 'lng_y']]


price    float64
lng_y    float64
dtype: object

In [29]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Selecting numeric features for similarity calculation
cf_features = ['price', 'score_amenities', 'score_star_rating', 'listing_age']
cf_df = merged_df.dropna(subset=cf_features)  # Dropping rows with missing values

# Computing cosine similarity between listings
listing_similarity = cosine_similarity(cf_df[cf_features])

# Creating a similarity matrix DataFrame
similarity_df = pd.DataFrame(listing_similarity, index=cf_df['listing_id'], columns=cf_df['listing_id'])

# Display top 5 most similar listings for a sample listing
sample_listing = similarity_df.iloc[0].sort_values(ascending=False).index[1:6]  # Top 5 excluding itself
sample_listing


ValueError: could not convert string to float: 'A'

In [31]:
# Checking for non-numeric values in selected columns
non_numeric_entries = cf_df[cf_features].applymap(lambda x: isinstance(x, str)).sum()
non_numeric_entries


price                   0
score_amenities      1072
score_star_rating    1072
listing_age             0
dtype: int64

In [33]:
# Converting score_amenities and score_star_rating to numeric, forcing errors to NaN
merged_df['score_amenities'] = pd.to_numeric(merged_df['score_amenities'], errors='coerce')
merged_df['score_star_rating'] = pd.to_numeric(merged_df['score_star_rating'], errors='coerce')

# Checking again for non-numeric values
merged_df.dtypes[['score_amenities', 'score_star_rating']]


score_amenities      float64
score_star_rating    float64
dtype: object

In [35]:
# Re-selecting numeric features for similarity calculation
cf_features = ['price', 'score_amenities', 'score_star_rating', 'listing_age']
cf_df = merged_df.dropna(subset=cf_features)  # Dropping rows with missing values

# Computing cosine similarity between listings
listing_similarity = cosine_similarity(cf_df[cf_features])

# Creating a similarity matrix DataFrame
similarity_df = pd.DataFrame(listing_similarity, index=cf_df['listing_id'], columns=cf_df['listing_id'])

# Display top 5 most similar listings for a sample listing
sample_listing = similarity_df.iloc[0].sort_values(ascending=False).index[1:6]  # Top 5 excluding itself
sample_listing


ValueError: Found array with 0 sample(s) (shape=(0, 4)) while a minimum of 1 is required by check_pairwise_arrays.

In [37]:
# Checking missing value counts in selected columns
merged_df[cf_features].isnull().sum()


price                   1
score_amenities      1129
score_star_rating    1129
listing_age             0
dtype: int64

In [39]:
# Using only available numeric features for similarity calculation
cf_features = ['price', 'listing_age']
cf_df = merged_df.dropna(subset=cf_features)  # Dropping rows with missing values

# Computing cosine similarity between listings
listing_similarity = cosine_similarity(cf_df[cf_features])

# Creating a similarity matrix DataFrame
similarity_df = pd.DataFrame(listing_similarity, index=cf_df['listing_id'], columns=cf_df['listing_id'])

# Display top 5 most similar listings for a sample listing
sample_listing = similarity_df.iloc[0].sort_values(ascending=False).index[1:6]  # Top 5 excluding itself
sample_listing


Index(['1174039367328281192', '1128550901045718893', '549402827309964514',
       '723981062304581833', '907921492539010529'],
      dtype='object', name='listing_id')

In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load datasets
listings_df = pd.read_csv('listings_metadata (1).csv')
user_data_df = pd.read_csv('user_41399220_analysis (1).csv')

# Ensure 'listing_id' is of the same type in both datasets
listings_df['listing_id'] = listings_df['listing_id'].astype(str)
user_data_df['listing_id'] = user_data_df['listing_id'].astype(str)

# Merge datasets on listing_id
merged_df = pd.merge(listings_df, user_data_df, on='listing_id', how='inner')

# Convert necessary columns to numeric
numeric_cols = ['price', 'listing_age']
for col in numeric_cols:
    merged_df[col] = pd.to_numeric(merged_df[col], errors='coerce')

# Select features for Collaborative Filtering (excluding missing ones)
cf_df = merged_df.dropna(subset=numeric_cols)  # Remove rows with missing values

# Compute cosine similarity between listings
listing_similarity = cosine_similarity(cf_df[numeric_cols])

# Create similarity matrix DataFrame
similarity_df = pd.DataFrame(listing_similarity, index=cf_df['listing_id'], columns=cf_df['listing_id'])

# Display top 5 most similar listings for a sample listing
sample_listing = similarity_df.iloc[0].sort_values(ascending=False).index[1:6]
print("Top 5 similar listings:", sample_listing)

# Attribution Modeling Placeholder (to be expanded)
def attribution_model(merged_df):
    # Example: Assigning weights based on feature importance (placeholder)
    merged_df['impact_score'] = merged_df['price'] * 0.4 + merged_df['listing_age'] * 0.6
    return merged_df[['listing_id', 'impact_score']]

# Apply attribution modeling
attribution_results = attribution_model(merged_df)
print(attribution_results.head())

# SHAP Analysis Placeholder (to be expanded)
def shap_analysis(merged_df):
    print("SHAP analysis would be applied here using a trained ML model.")

shap_analysis(merged_df)


Top 5 similar listings: Index(['35873806', '30445193', '30448133', '30444903', '11557016'], dtype='object', name='listing_id')
  listing_id  impact_score
0   35873806        1061.6
1   35873806        1061.6
2   39962641        1121.6
3   17027835        1196.8
4   15581613        1428.4
SHAP analysis would be applied here using a trained ML model.
