In [5]:
import pandas as pd

# Load the dataset
file_path = 'house_sales.csv'  # Adjust the path as needed
df = pd.read_csv(file_path)

# Convert 'date' column to datetime
df['date'] = pd.to_datetime(df['date'])
null_values = df.isnull().sum()

duplicates = df.duplicated().sum()

df = df.drop_duplicates()

df = df.dropna()


In [6]:
from sklearn.preprocessing import StandardScaler

# Select relevant features for the house vectors
features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 
            'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'lat', 'long']

# Extract the house feature vectors
house_vectors = df[features]

# Standardize the feature values
scaler = StandardScaler()
house_vectors_standardized = scaler.fit_transform(house_vectors)

# Convert the standardized data back to a DataFrame
house_vectors_standardized_df = pd.DataFrame(house_vectors_standardized, columns=features)

house_vectors_standardized_df.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long
0,-0.402903,-1.451073,-0.980652,-0.228182,-0.915552,-0.087205,-0.305715,-0.629986,-0.5608,-0.735251,-0.659031,-0.544678,-0.21021,-0.352894,-0.305695
1,-0.402903,0.174486,0.533369,-0.189739,0.937431,-0.087205,-0.305715,-0.629986,-0.5608,0.460776,0.244602,-0.68085,4.744842,1.161376,-0.746286
2,-1.482493,-1.451073,-1.427234,-0.123139,-0.915552,-0.087205,-0.305715,-0.629986,-1.41319,-1.230575,-0.659031,-1.293626,-0.21021,1.283355,-0.135144
3,0.676687,1.149821,-0.131057,-0.243878,-0.915552,-0.087205,-0.305715,2.444427,-0.5608,-0.892305,1.396734,-0.204247,-0.21021,-0.283604,-1.272151
4,-0.402903,-0.150626,-0.43604,-0.169503,-0.915552,-0.087205,-0.305715,-0.629986,0.291589,-0.131197,-0.659031,0.5447,-0.21021,0.409293,1.20084


In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity matrix
similarity_matrix = cosine_similarity(house_vectors_standardized_df)

# Convert the similarity matrix to a DataFrame for better readability
similarity_matrix_df = pd.DataFrame(similarity_matrix, index=df.index, columns=df.index)

similarity_matrix_df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21603,21604,21605,21606,21607,21608,21609,21610,21611,21612
0,1.0,-0.144706,0.80523,-0.177236,0.341733,-0.741042,0.188475,0.905206,0.736699,-0.067935,...,-0.310803,-0.118597,-0.490855,-0.732198,0.049556,-0.234506,-0.315422,0.556919,-0.124843,0.561258
1,-0.144706,1.0,0.038718,-0.123622,-0.241672,-0.007448,-0.110534,-0.198724,-0.068094,-0.178454,...,-0.120915,0.052763,-0.10896,0.072649,-0.001907,0.13727,-0.021717,-0.016783,-0.149408,-0.015625
2,0.80523,0.038718,1.0,-0.137661,0.336525,-0.666751,-0.126526,0.607315,0.586338,-0.227178,...,-0.277288,-0.111427,-0.701056,-0.756778,0.024063,-0.077116,-0.538598,0.54116,-0.135142,0.54622
3,-0.177236,-0.123622,-0.137661,1.0,-0.477892,0.008325,-0.140218,-0.012118,0.141323,-0.40376,...,-0.587366,-0.361314,-0.422284,0.136584,-0.031616,-0.270104,-0.18071,-0.345972,-0.499671,-0.346467
4,0.341733,-0.241672,0.336525,-0.477892,1.0,-0.00154,-0.172708,0.206418,-0.029388,0.309495,...,0.5551,-0.101824,0.158012,-0.387746,-0.057586,-0.140525,-0.173166,0.222748,0.509791,0.220945


In [8]:
def recommend_zipcodes(zipcode, n_recommendations=5):
    houses_in_zipcode = df[df['zipcode'] == zipcode].index

    mean_similarity_scores = similarity_matrix_df.loc[houses_in_zipcode].mean(axis=0)

    mean_similarity_scores = mean_similarity_scores.drop(houses_in_zipcode)

    top_n_similar_houses = mean_similarity_scores.nlargest(n_recommendations).index

    recommended_zipcodes = df.loc[top_n_similar_houses, 'zipcode'].unique()

    return recommended_zipcodes

zipcode_to_recommend = 98052
recommended_zipcodes = recommend_zipcodes(zipcode_to_recommend, n_recommendations=5)

print(f"Zipcodes recommended for zipcode {zipcode_to_recommend}:\n", recommended_zipcodes)


Zipcodes recommended for zipcode 98052:
 [98019]
