## Modeling

For background on this project, please see the [README](../README.md).

**Notebooks**
- [Data Acquisition & Cleaning](./01_data_acq_clean.ipynb)
- [Exploratory Data Analysis](./02_eda.ipynb)
- Modeling (this notebook)
- [Results and Recommendations](./04_results.ipynb)

**In this notebook, you'll find:**
- Classification models using content-based filtering
- Brief analyses for each model

In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

from scipy import sparse, spatial
import sys
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity, cosine_distances

from numpy import dot
from numpy.linalg import norm

---
#### **Importing All-Feature Dataframe**

This is the aggregated dataframe of all of our datasets.

In [2]:
model_df = pd.read_csv('../datasets/cleaned_data/all_features.csv')

model_df.head()

Unnamed: 0,place,state,price,auto_accss_idx,pct_no_vehicle,pct_one_or_more_vehicles,transit_accss_idx,walkability_idx,cost_of_living,grocery_cost_idx,...,nov_avg_rain,dec_avg_rain,winter_avg_temp,spring_avg_temp,summer_avg_temp,autumn_avg_temp,winter_avg_rain,spring_avg_rain,summer_avg_rain,autumn_avg_rain
0,Asheville,NC,225.830385,0.580614,0.053756,0.946244,0.14985,9.382035,105.7,103.1,...,3.11,3.69,40.166667,53.333333,70.2,55.333333,3.71,4.16,4.713333,3.423333
1,Austin,TX,436.273137,0.718323,0.059187,0.940813,0.286952,11.516379,129.4,96.7,...,2.52,2.42,53.4,67.433333,82.333333,70.366667,2.243333,3.256667,2.386667,3.026667
2,Boston,MA,272.879054,0.759096,0.301252,0.698748,0.409262,14.936791,153.4,111.4,...,4.04,4.01,33.1,49.7,72.233333,56.0,3.81,3.79,3.463333,3.703333
3,Cambridge,MA,246.035242,0.563985,0.102769,0.897231,0.160956,12.458517,173.7,115.5,...,4.0,3.85,31.1,49.166667,71.633333,54.533333,3.57,3.706667,3.523333,3.676667
4,Chicago,IL,245.904608,0.636256,0.169022,0.830978,0.310866,13.635529,107.4,97.2,...,2.44,2.04,28.133333,50.833333,73.933333,55.766667,1.76,3.223333,3.643333,2.853333


In [3]:
model_df.columns

Index(['place', 'state', 'price', 'auto_accss_idx', 'pct_no_vehicle',
       'pct_one_or_more_vehicles', 'transit_accss_idx', 'walkability_idx',
       'cost_of_living', 'grocery_cost_idx', 'transportation_cost_idx',
       'miscellaneous_cost_idx', 'population', 'median_age',
       'summer_comfort_index', 'winter_comfort_index',
       'perc_growth_since_2020', 'political_left_lean', 'political_right_lean',
       'chain_ratio', 'jan_avg_temp', 'feb_avg_temp', 'march_avg_temp',
       'april_avg_temp', 'may_avg_temp', 'june_avg_temp', 'july_avg_temp',
       'aug_avg_temp', 'sept_avg_temp', 'oct_avg_temp', 'nov_avg_temp',
       'dec_avg_temp', 'jan_avg_rain', 'feb_avg_rain', 'mar_avg_rain',
       'apr_avg_rain', 'may_avg_rain', 'jun_avg_rain', 'jul_avg_rain',
       'aug_avg_rain', 'sep_avg_rain', 'oct_avg_rain', 'nov_avg_rain',
       'dec_avg_rain', 'winter_avg_temp', 'spring_avg_temp', 'summer_avg_temp',
       'autumn_avg_temp', 'winter_avg_rain', 'spring_avg_rain',
       'sum

In [4]:
model_df['miscellaneous_cost_idx'].mean()

129.34484848484846

In [5]:
model_df[['place', 'winter_avg_temp', 'spring_avg_temp', 'summer_avg_temp', 'autumn_avg_temp']]

Unnamed: 0,place,winter_avg_temp,spring_avg_temp,summer_avg_temp,autumn_avg_temp
0,Asheville,40.166667,53.333333,70.2,55.333333
1,Austin,53.4,67.433333,82.333333,70.366667
2,Boston,33.1,49.7,72.233333,56.0
3,Cambridge,31.1,49.166667,71.633333,54.533333
4,Chicago,28.133333,50.833333,73.933333,55.766667
5,Columbus,32.866667,52.8,74.1,56.733333
6,Dallas,49.566667,64.8,82.666667,68.9
7,Denver,33.166667,47.2,73.466667,55.6
8,Fort Lauderdale,70.3,75.933333,82.533333,77.633333
9,Fort Worth,49.166667,64.1,82.133333,68.533333


In [6]:
model_df['place'] = model_df[['place', 'state']].apply(lambda x: ', '.join(x), axis=1)
model_df.drop(columns='state', inplace=True)
model_df.head()
# model_df = model_df[['']]

Unnamed: 0,place,price,auto_accss_idx,pct_no_vehicle,pct_one_or_more_vehicles,transit_accss_idx,walkability_idx,cost_of_living,grocery_cost_idx,transportation_cost_idx,...,nov_avg_rain,dec_avg_rain,winter_avg_temp,spring_avg_temp,summer_avg_temp,autumn_avg_temp,winter_avg_rain,spring_avg_rain,summer_avg_rain,autumn_avg_rain
0,"Asheville, NC",225.830385,0.580614,0.053756,0.946244,0.14985,9.382035,105.7,103.1,82.8,...,3.11,3.69,40.166667,53.333333,70.2,55.333333,3.71,4.16,4.713333,3.423333
1,"Austin, TX",436.273137,0.718323,0.059187,0.940813,0.286952,11.516379,129.4,96.7,109.9,...,2.52,2.42,53.4,67.433333,82.333333,70.366667,2.243333,3.256667,2.386667,3.026667
2,"Boston, MA",272.879054,0.759096,0.301252,0.698748,0.409262,14.936791,153.4,111.4,131.8,...,4.04,4.01,33.1,49.7,72.233333,56.0,3.81,3.79,3.463333,3.703333
3,"Cambridge, MA",246.035242,0.563985,0.102769,0.897231,0.160956,12.458517,173.7,115.5,95.7,...,4.0,3.85,31.1,49.166667,71.633333,54.533333,3.57,3.706667,3.523333,3.676667
4,"Chicago, IL",245.904608,0.636256,0.169022,0.830978,0.310866,13.635529,107.4,97.2,138.5,...,2.44,2.04,28.133333,50.833333,73.933333,55.766667,1.76,3.223333,3.643333,2.853333


In [7]:
trial_model_df = model_df[['place', 'winter_avg_temp', 'summer_avg_temp', 'chain_ratio', 'walkability_idx', 'political_left_lean', 'political_right_lean', 'miscellaneous_cost_idx', 'price']]

In [8]:
trial_model_df.head()

Unnamed: 0,place,winter_avg_temp,summer_avg_temp,walkability_idx,chain_ratio,political_left_lean,political_right_lean,miscellaneous_cost_idx,price
0,"Asheville, NC",40.166667,70.2,9.382035,0.3516,0.597,0.386,111.2,225.830385
1,"Austin, TX",53.4,82.333333,11.516379,0.3814,0.714,0.264,105.9,436.273137
2,"Boston, MA",33.1,72.233333,14.936791,0.2796,0.806,0.175,157.9,272.879054
3,"Cambridge, MA",31.1,71.633333,12.458517,0.3277,0.715,0.263,176.7,246.035242
4,"Chicago, IL",28.133333,73.933333,13.635529,0.3717,0.742,0.24,139.8,245.904608


In [9]:
# Converting price column to monthly cost with Airbnb's estimated 30+day-stay discount
trial_model_df['avg_monthly_price'] = (trial_model_df['price'] * 30) - ((trial_model_df['price'] * 30) * 0.25)
trial_model_df.drop(columns=['price'], inplace=True)
trial_model_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trial_model_df['avg_monthly_price'] = (trial_model_df['price'] * 30) - ((trial_model_df['price'] * 30) * 0.25)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trial_model_df.drop(columns=['price'], inplace=True)


Unnamed: 0,place,winter_avg_temp,summer_avg_temp,walkability_idx,chain_ratio,political_left_lean,political_right_lean,miscellaneous_cost_idx,avg_monthly_price
0,"Asheville, NC",40.166667,70.2,9.382035,0.3516,0.597,0.386,111.2,5081.183654
1,"Austin, TX",53.4,82.333333,11.516379,0.3814,0.714,0.264,105.9,9816.145585
2,"Boston, MA",33.1,72.233333,14.936791,0.2796,0.806,0.175,157.9,6139.778716
3,"Cambridge, MA",31.1,71.633333,12.458517,0.3277,0.715,0.263,176.7,5535.792952
4,"Chicago, IL",28.133333,73.933333,13.635529,0.3717,0.742,0.24,139.8,5532.853672


In [10]:
trial_model_df

Unnamed: 0,place,winter_avg_temp,summer_avg_temp,walkability_idx,chain_ratio,political_left_lean,political_right_lean,miscellaneous_cost_idx,avg_monthly_price
0,"Asheville, NC",40.166667,70.2,9.382035,0.3516,0.597,0.386,111.2,5081.183654
1,"Austin, TX",53.4,82.333333,11.516379,0.3814,0.714,0.264,105.9,9816.145585
2,"Boston, MA",33.1,72.233333,14.936791,0.2796,0.806,0.175,157.9,6139.778716
3,"Cambridge, MA",31.1,71.633333,12.458517,0.3277,0.715,0.263,176.7,5535.792952
4,"Chicago, IL",28.133333,73.933333,13.635529,0.3717,0.742,0.24,139.8,5532.853672
5,"Columbus, OH",32.866667,74.1,11.556746,0.5012,0.647,0.334,107.5,3835.510118
6,"Dallas, TX",49.566667,82.666667,11.899341,0.4783,0.649,0.333,107.8,5658.053279
7,"Denver, CO",33.166667,73.466667,14.462231,0.3274,0.796,0.182,152.2,4313.462492
8,"Fort Lauderdale, FL",70.3,82.533333,12.445745,0.3837,0.645,0.347,107.6,5663.896761
9,"Fort Worth, TX",49.166667,82.133333,9.585953,0.5595,0.493,0.491,109.2,4079.534543


In [11]:
walk_mean = trial_model_df['walkability_idx'].mean()
walk_std = trial_model_df['walkability_idx'].std()

In [12]:
walk_input_options = ['Not important at all', 'Not very important', 'Neutral', 'A little important', 'Very important']

In [13]:
walk_input_index = [walk_mean - (2*walk_std), walk_mean - (1*walk_std), walk_mean, walk_mean + (1*walk_std), walk_mean + (2*walk_std)]

In [14]:
walk_input_index

[7.077144672571502,
 9.576295608226786,
 12.07544654388207,
 14.574597479537353,
 17.073748415192636]

In [53]:
walkability_mapper = dict(zip(walk_input_options, walk_input_index))
walkability_mapper

{'Not important at all': 7.077144672571502,
 'Not very important': 9.576295608226786,
 'Neutral': 12.07544654388207,
 'A little important': 14.574597479537353,
 'Very important': 17.073748415192636}

In [16]:
trial_model_df.describe()

Unnamed: 0,winter_avg_temp,summer_avg_temp,walkability_idx,chain_ratio,political_left_lean,political_right_lean,miscellaneous_cost_idx,avg_monthly_price
count,33.0,33.0,33.0,33.0,33.0,33.0,33.0,33.0
mean,46.471082,74.791385,12.075447,0.328676,0.7012,0.27773,129.344848,6387.315367
std,14.479006,6.471081,2.499151,0.100857,0.100124,0.100504,25.397655,2347.730358
min,17.985714,62.2,4.98645,0.1136,0.489,0.054,92.9,3437.97496
25%,35.533333,70.9,11.519265,0.2848,0.645,0.202,107.6,4606.418306
50%,47.033333,75.133333,12.445745,0.3135,0.714,0.2665,124.0,5663.896761
75%,51.333333,80.166667,13.635529,0.3837,0.779,0.346,152.9,7618.616379
max,78.333333,87.2,15.957258,0.5595,0.921,0.491,176.7,14795.287657


In [64]:
trial_model_df[['place', 'miscellaneous_cost_idx']].sort_values(by='miscellaneous_cost_idx', ascending=False)

Unnamed: 0,place,miscellaneous_cost_idx
3,"Cambridge, MA",176.7
32,"Washington DC, DC",168.0
26,"San Francisco, CA",162.6
27,"San Mateo, CA",161.2
21,"Oakland, CA",158.5
28,"Santa Clara, CA",158.4
2,"Boston, MA",157.9
29,"Santa Cruz, CA",155.6
25,"San Diego, CA",152.9
7,"Denver, CO",152.2


In [17]:
user_input = ['User Input', 65, 75, 'I like to try new restaurants', 'Neutral', 'Moderate left lean preferred', 5, 6000]

# can we turn this into a dictionary?

In [18]:
user_input[5]

'Moderate left lean preferred'

In [19]:
for i in walkability_mapper:
    print(i[0])

Not important at all
Not very important
Neutral
A little important
Very important


In [20]:
pol_pref_options = ['Strong left lean preferred', 'Moderate left lean preferred', 'Moderate right lean preferred', 'Strong right lean preferred']
pol_pref_nums = [.9, .8, .6, .5]

pol_dem_mapper = list(zip(pol_pref_options, pol_pref_nums))
pol_dem_mapper

[('Strong left lean preferred', 0.9),
 ('Moderate left lean preferred', 0.8),
 ('Moderate right lean preferred', 0.6),
 ('Strong right lean preferred', 0.5)]

In [21]:
trial_model_df['chain_ratio'].std()

0.10085718873208491

In [22]:
[i[1] for i in pol_dem_mapper if user_input[5] == i[0]] 

[0.8]

In [23]:
for i in walkability_mapper:
    if user_input[4] == i[0]:
        print(i[1])
        user_walk_index = i[1]

12.07544654388207


In [54]:
walkability_mapper

{'Not important at all': 7.077144672571502,
 'Not very important': 9.576295608226786,
 'Neutral': 12.07544654388207,
 'A little important': 14.574597479537353,
 'Very important': 17.073748415192636}

In [62]:
for k, v in walkability_mapper.items():
    print(k)
    print(v)

Not important at all
7.077144672571502
Not very important
9.576295608226786
Neutral
12.07544654388207
A little important
14.574597479537353
Very important
17.073748415192636


In [24]:
if user_input[5] == 'No':
    user_pol_dem = .7
else:
    user_pol_dem = [i[1] for i in pol_dem_mapper if user_input[5] == i[0]]   

print(user_pol_dem[0])

0.8


In [25]:
user_input = ['User Input', 65, 75, 12.07544654388207, 0.25, 0.8, 0.2, 155, 6000]

In [26]:
len(user_input)

9

In [27]:
user_input_2 = ['User Input_2', 70, 80, 12.07544654388207, 0.30, 0.7, 0.3, 140, 6000]

In [28]:
user_input_3 = ['User Input_3', 65, 75, 14.574597479537353, 0.2, 0.85, 0.15, 135, 5500]

In [29]:
trial_model_df.loc[len(trial_model_df)] = user_input_3

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trial_model_df.loc[len(trial_model_df)] = user_input_3


In [30]:
trial_model_df.describe()

Unnamed: 0,winter_avg_temp,summer_avg_temp,walkability_idx,chain_ratio,political_left_lean,political_right_lean,miscellaneous_cost_idx,avg_monthly_price
count,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0
mean,47.01605,74.797521,12.148951,0.324891,0.705576,0.273974,129.511176,6361.217857
std,14.607755,6.372381,2.498037,0.101739,0.101844,0.101364,25.028679,2316.88782
min,17.985714,62.2,4.98645,0.1136,0.489,0.054,92.9,3437.97496
25%,35.883333,71.083333,11.528635,0.2809,0.645,0.18925,107.65,4636.817211
50%,47.733333,75.066667,12.452131,0.31315,0.7145,0.26525,128.6,5660.97502
75%,52.508333,79.616667,13.757222,0.383125,0.78275,0.343,152.725,7490.267552
max,78.333333,87.2,15.957258,0.5595,0.921,0.491,176.7,14795.287657


In [31]:
trial_model_df

Unnamed: 0,place,winter_avg_temp,summer_avg_temp,walkability_idx,chain_ratio,political_left_lean,political_right_lean,miscellaneous_cost_idx,avg_monthly_price
0,"Asheville, NC",40.166667,70.2,9.382035,0.3516,0.597,0.386,111.2,5081.183654
1,"Austin, TX",53.4,82.333333,11.516379,0.3814,0.714,0.264,105.9,9816.145585
2,"Boston, MA",33.1,72.233333,14.936791,0.2796,0.806,0.175,157.9,6139.778716
3,"Cambridge, MA",31.1,71.633333,12.458517,0.3277,0.715,0.263,176.7,5535.792952
4,"Chicago, IL",28.133333,73.933333,13.635529,0.3717,0.742,0.24,139.8,5532.853672
5,"Columbus, OH",32.866667,74.1,11.556746,0.5012,0.647,0.334,107.5,3835.510118
6,"Dallas, TX",49.566667,82.666667,11.899341,0.4783,0.649,0.333,107.8,5658.053279
7,"Denver, CO",33.166667,73.466667,14.462231,0.3274,0.796,0.182,152.2,4313.462492
8,"Fort Lauderdale, FL",70.3,82.533333,12.445745,0.3837,0.645,0.347,107.6,5663.896761
9,"Fort Worth, TX",49.166667,82.133333,9.585953,0.5595,0.493,0.491,109.2,4079.534543


In [32]:
test = model_df[['place', 'price', 'pct_no_vehicle', 'cost_of_living', 'walkability_idx', 'chain_ratio']]

In [33]:
trial_model_df.columns

Index(['place', 'winter_avg_temp', 'summer_avg_temp', 'walkability_idx',
       'chain_ratio', 'political_left_lean', 'political_right_lean',
       'miscellaneous_cost_idx', 'avg_monthly_price'],
      dtype='object')

In [34]:
feature_cols = ['winter_avg_temp', 'summer_avg_temp', 'walkability_idx',
       'chain_ratio', 'political_left_lean', 'political_right_lean',
       'miscellaneous_cost_idx', 'avg_monthly_price']

sc = MinMaxScaler()
scaled_trial = sc.fit_transform(trial_model_df[feature_cols])

print(scaled_trial[:2])

[[0.36755307 0.32       0.40066191 0.53375196 0.25       0.7597254
  0.21837709 0.14468288]
 [0.58683816 0.80533333 0.59520954 0.60058309 0.52083333 0.4805492
  0.15513126 0.56159153]]


In [35]:
indices = pd.Series(trial_model_df.index, index=trial_model_df['place'])

cosine = cosine_similarity(scaled_trial)

def place_recommender(place_name, model=cosine):
    index = indices[place_name]

    scores = list(enumerate(model[index]))

    sim_scores = sorted(scores, key= lambda x:x[1], reverse=True)
    sim_scores = sim_scores[1:11]

    top_recs_index = [i[0] for i in sim_scores]

    top_places = trial_model_df['place'].iloc[top_recs_index]

    return top_places

In [65]:
indices

place
Asheville, NC           0
Austin, TX              1
Boston, MA              2
Cambridge, MA           3
Chicago, IL             4
Columbus, OH            5
Dallas, TX              6
Denver, CO              7
Fort Lauderdale, FL     8
Fort Worth, TX          9
Hawaii, HI             10
Jersey City, NJ        11
Kauai, HI              12
Las Vegas, NV          13
Los Angeles, CA        14
Maui, HI               15
Nashville, TN          16
New Orleans, LA        17
New York City, NY      18
Newark, NJ             19
Oahu, HI               20
Oakland, CA            21
Portland, OR           22
Rhode Island, RI       23
Salem, OR              24
San Diego, CA          25
San Francisco, CA      26
San Mateo, CA          27
Santa Clara, CA        28
Santa Cruz, CA         29
Seattle, WA            30
Twin Cities, MN        31
Washington DC, DC      32
User Input_3           33
dtype: int64

In [36]:
print('Recommended Places:')
print(place_recommender('User Input_3', cosine).values)

Recommended Places:
['Oakland, CA' 'New Orleans, LA' 'Portland, OR' 'New York City, NY'
 'Washington DC, DC' 'Boston, MA' 'Newark, NJ' 'San Francisco, CA'
 'Los Angeles, CA' 'Denver, CO']


In [38]:
similarity_scores = sorted(list(enumerate(cosine[indices['San Francisco, CA']])), key= lambda x:x[1], reverse=True)

In [39]:
[i[0] for i in similarity_scores]

[26,
 29,
 22,
 21,
 30,
 2,
 33,
 27,
 32,
 7,
 28,
 3,
 18,
 4,
 14,
 19,
 11,
 25,
 17,
 31,
 23,
 20,
 1,
 8,
 0,
 6,
 16,
 5,
 10,
 24,
 13,
 15,
 12,
 9]

In [41]:
trial_model_df['place'].iloc[[i[0] for i in similarity_scores]]

26      San Francisco, CA
29         Santa Cruz, CA
22           Portland, OR
21            Oakland, CA
30            Seattle, WA
2              Boston, MA
33           User Input_3
27          San Mateo, CA
32      Washington DC, DC
7              Denver, CO
28        Santa Clara, CA
3           Cambridge, MA
18      New York City, NY
4             Chicago, IL
14        Los Angeles, CA
19             Newark, NJ
11        Jersey City, NJ
25          San Diego, CA
17        New Orleans, LA
31        Twin Cities, MN
23       Rhode Island, RI
20               Oahu, HI
1              Austin, TX
8     Fort Lauderdale, FL
0           Asheville, NC
6              Dallas, TX
16          Nashville, TN
5            Columbus, OH
10             Hawaii, HI
24              Salem, OR
13          Las Vegas, NV
15               Maui, HI
12              Kauai, HI
9          Fort Worth, TX
Name: place, dtype: object

In [44]:
list(enumerate(cosine[indices['San Francisco, CA']]))

[(0, 0.6193912742600769),
 (1, 0.6663204108168225),
 (2, 0.9357412021223088),
 (3, 0.8657516959803817),
 (4, 0.8395501792898792),
 (5, 0.5982463256055518),
 (6, 0.6021465000932469),
 (7, 0.9031081772719182),
 (8, 0.6470090012206144),
 (9, 0.39795593080947744),
 (10, 0.508528500018575),
 (11, 0.8026399584354277),
 (12, 0.4351730226961039),
 (13, 0.4970843593989582),
 (14, 0.8213301133178844),
 (15, 0.48973877309779135),
 (16, 0.5999687496506284),
 (17, 0.7483638945425544),
 (18, 0.8631268510337504),
 (19, 0.8089178836193812),
 (20, 0.7030865468392643),
 (21, 0.9416791504997746),
 (22, 0.9422408807616666),
 (23, 0.7171311974160652),
 (24, 0.5064231347577338),
 (25, 0.755175841221089),
 (26, 1.0000000000000002),
 (27, 0.9174870444316416),
 (28, 0.8899594919509353),
 (29, 0.9448331426029035),
 (30, 0.938926867318185),
 (31, 0.7249665443563678),
 (32, 0.905169599864383),
 (33, 0.9184636375596953)]

In [42]:
indices

place
Asheville, NC           0
Austin, TX              1
Boston, MA              2
Cambridge, MA           3
Chicago, IL             4
Columbus, OH            5
Dallas, TX              6
Denver, CO              7
Fort Lauderdale, FL     8
Fort Worth, TX          9
Hawaii, HI             10
Jersey City, NJ        11
Kauai, HI              12
Las Vegas, NV          13
Los Angeles, CA        14
Maui, HI               15
Nashville, TN          16
New Orleans, LA        17
New York City, NY      18
Newark, NJ             19
Oahu, HI               20
Oakland, CA            21
Portland, OR           22
Rhode Island, RI       23
Salem, OR              24
San Diego, CA          25
San Francisco, CA      26
San Mateo, CA          27
Santa Clara, CA        28
Santa Cruz, CA         29
Seattle, WA            30
Twin Cities, MN        31
Washington DC, DC      32
User Input_3           33
dtype: int64

In [45]:
cosine_distances(model_df.iloc[:, 2:])

array([[0.00000000e+00, 4.29554703e-06, 3.87045136e-06, ...,
        4.06008214e-06, 4.11825903e-06, 3.83185116e-06],
       [4.29554703e-06, 0.00000000e+00, 1.85363105e-08, ...,
        1.21248582e-08, 8.32707103e-09, 1.91021039e-08],
       [3.87045136e-06, 1.85363105e-08, 0.00000000e+00, ...,
        3.12543202e-09, 6.59206745e-09, 3.81653376e-10],
       ...,
       [4.06008214e-06, 1.21248582e-08, 3.12543202e-09, ...,
        0.00000000e+00, 5.97219252e-09, 4.69485928e-09],
       [4.11825903e-06, 8.32707103e-09, 6.59206745e-09, ...,
        5.97219252e-09, 0.00000000e+00, 7.30497374e-09],
       [3.83185116e-06, 1.91021039e-08, 3.81653376e-10, ...,
        4.69485928e-09, 7.30497374e-09, 0.00000000e+00]])

In [46]:
pairwise_distances(model_df.iloc[:, 2:], metric='cosine')

array([[4.44089210e-16, 4.29554703e-06, 3.87045136e-06, ...,
        4.06008214e-06, 4.11825903e-06, 3.83185116e-06],
       [4.29554703e-06, 4.44089210e-16, 1.85363105e-08, ...,
        1.21248582e-08, 8.32707103e-09, 1.91021039e-08],
       [3.87045136e-06, 1.85363105e-08, 0.00000000e+00, ...,
        3.12543202e-09, 6.59206745e-09, 3.81653376e-10],
       ...,
       [4.06008214e-06, 1.21248582e-08, 3.12543202e-09, ...,
        5.55111512e-16, 5.97219252e-09, 4.69485928e-09],
       [4.11825903e-06, 8.32707103e-09, 6.59206745e-09, ...,
        5.97219252e-09, 0.00000000e+00, 7.30497374e-09],
       [3.83185116e-06, 1.91021038e-08, 3.81653376e-10, ...,
        4.69485939e-09, 7.30497351e-09, 0.00000000e+00]])

In [47]:
similarity_matrix= cosine_similarity(model_df.iloc[:, 2:])

sys.getsizeof(similarity_matrix)

8840

In [48]:
similarity_matrix

array([[1.        , 0.9999957 , 0.99999613, ..., 0.99999594, 0.99999588,
        0.99999617],
       [0.9999957 , 1.        , 0.99999998, ..., 0.99999999, 0.99999999,
        0.99999998],
       [0.99999613, 0.99999998, 1.        , ..., 1.        , 0.99999999,
        1.        ],
       ...,
       [0.99999594, 0.99999999, 1.        , ..., 1.        , 0.99999999,
        1.        ],
       [0.99999588, 0.99999999, 0.99999999, ..., 0.99999999, 1.        ,
        0.99999999],
       [0.99999617, 0.99999998, 1.        , ..., 1.        , 0.99999999,
        1.        ]])

In [49]:
pd.DataFrame(cosine_similarity(model_df.iloc[:, 2:]), index = model_df['place'], columns=model_df['place'])

place,"Asheville, NC","Austin, TX","Boston, MA","Cambridge, MA","Chicago, IL","Columbus, OH","Dallas, TX","Denver, CO","Fort Lauderdale, FL","Fort Worth, TX",...,"Rhode Island, RI","Salem, OR","San Diego, CA","San Francisco, CA","San Mateo, CA","Santa Clara, CA","Santa Cruz, CA","Seattle, WA","Twin Cities, MN","Washington DC, DC"
place,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Asheville, NC",1.0,0.999996,0.999996,1.0,0.999995,0.999996,0.999995,0.999996,0.999999,0.999996,...,0.999996,0.999999,0.999995,0.999996,0.999996,0.999995,0.999998,0.999996,0.999996,0.999996
"Austin, TX",0.999996,1.0,1.0,0.999996,1.0,1.0,1.0,1.0,0.999998,1.0,...,1.0,0.999999,1.0,1.0,1.0,1.0,0.999999,1.0,1.0,1.0
"Boston, MA",0.999996,1.0,1.0,0.999997,1.0,1.0,1.0,1.0,0.999999,1.0,...,1.0,0.999999,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"Cambridge, MA",1.0,0.999996,0.999997,1.0,0.999996,0.999996,0.999996,0.999997,0.999999,0.999996,...,0.999996,0.999999,0.999996,0.999997,0.999997,0.999996,0.999999,0.999997,0.999997,0.999997
"Chicago, IL",0.999995,1.0,1.0,0.999996,1.0,1.0,1.0,1.0,0.999998,1.0,...,1.0,0.999999,1.0,1.0,1.0,1.0,0.999999,1.0,1.0,1.0
"Columbus, OH",0.999996,1.0,1.0,0.999996,1.0,1.0,1.0,1.0,0.999998,1.0,...,1.0,0.999999,1.0,1.0,1.0,1.0,0.999999,1.0,1.0,1.0
"Dallas, TX",0.999995,1.0,1.0,0.999996,1.0,1.0,1.0,1.0,0.999998,1.0,...,1.0,0.999999,1.0,1.0,1.0,1.0,0.999999,1.0,1.0,1.0
"Denver, CO",0.999996,1.0,1.0,0.999997,1.0,1.0,1.0,1.0,0.999999,1.0,...,1.0,0.999999,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"Fort Lauderdale, FL",0.999999,0.999998,0.999999,0.999999,0.999998,0.999998,0.999998,0.999999,1.0,0.999998,...,0.999998,1.0,0.999998,0.999999,0.999999,0.999998,1.0,0.999999,0.999999,0.999999
"Fort Worth, TX",0.999996,1.0,1.0,0.999996,1.0,1.0,1.0,1.0,0.999998,1.0,...,1.0,0.999999,1.0,1.0,1.0,1.0,0.999999,1.0,1.0,1.0


In [50]:
model_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 51 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   place                     33 non-null     object 
 1   price                     33 non-null     float64
 2   auto_accss_idx            33 non-null     float64
 3   pct_no_vehicle            33 non-null     float64
 4   pct_one_or_more_vehicles  33 non-null     float64
 5   transit_accss_idx         33 non-null     float64
 6   walkability_idx           33 non-null     float64
 7   cost_of_living            33 non-null     float64
 8   grocery_cost_idx          33 non-null     float64
 9   transportation_cost_idx   33 non-null     float64
 10  miscellaneous_cost_idx    33 non-null     float64
 11  population                33 non-null     int64  
 12  median_age                33 non-null     float64
 13  summer_comfort_index      33 non-null     float64
 14  winter_comfo

In [51]:
# We're seeing the most nulls in the neighborhood group column

model_df.isnull().sum().sum()

0

In [52]:
# Fortunately, we don't have much use for the columns that have null values, so we'll be dropping these and a few others
# Neighborhood group included here because we'll only need the neighborhood column for our analysis

airbnb.drop(columns=['name', 'host_id', 'host_name', 'neighbourhood_group', 'minimum_nights'], inplace=True) 
old_shape = airbnb.shape
old_shape

NameError: name 'airbnb' is not defined

In [None]:
# Checking for and removing any duplicate listings in the data
# Keeping none as I don't feel that we could accurately/fairly determine which city to leave the listing in for

airbnb.drop_duplicates(subset=['id'], keep=False, inplace=True)
print(f'Total listings dropped: {old_shape[0] - airbnb.shape[0]}')

Total listings dropped: 10151


In [None]:
airbnb['room_type'].value_counts()

Entire home/apt    202199
Private room        62695
Shared room          2765
Hotel room           1184
Name: room_type, dtype: int64