In [None]:
# https://www.sbert.net/docs/pretrained_models.html

In [1]:
import pandas as pd
import os
from langdetect import detect
from sentence_transformers import SentenceTransformer
import sklearn
import numpy as np
import re
import pickle 
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def detect_eng(text):
    try:
        if detect(text) == 'en':
            return True
        else:
            return False
    except:
        return False

In [3]:
def clean_verbatim(text):
    
    text = re.sub("<br />", "", text)
    text = re.sub("<br/>", "", text)
    text = re.sub("<b>", "", text)
    text = re.sub("</b>", "", text)    
    return text

In [4]:
def perform_pca(X, output_dim):
    
    X = StandardScaler().fit_transform(X)
    pca = PCA(n_components = output_dim)
    
    return pca.fit_transform(X)

In [42]:
def get_most_similar_10listings(sim_matrix, df):
    
    result = pd.DataFrame()
    
    for i in range(sim_matrix.shape[0]):
        #print(i)
        top10sim_index = list(cossim[i].argsort()[-11:-1])
        top10sim_listing_id = [df.iloc[index,]['listing_id'] for index in top10sim_index]
        top10sim_val = [cossim[i][index] for index in top10sim_index]
        
        top10sim = pd.DataFrame({'listing_id': df.iloc[i,]['listing_id'],
                                 'sim_listing_id': top10sim_listing_id,
                                 'similarity': top10sim_val})
        
        result = pd.concat( [result, top10sim], axis = 0 )
    return result

In [5]:
path = os.getcwd()
path = path.split('CIS550_Group_Project')[0]

In [6]:
review = pd.read_csv(path + '/data/reviews.csv', keep_default_na = False, na_values=['_'])

In [7]:
listing = pd.read_csv(path + '/data/listing_removehost_cleaned.csv', keep_default_na = False, na_values = ['_'])

# process Listing Description 

In [8]:
listing['desc_overview'] = listing['description'] + listing['neighborhood_overview']

In [9]:
listing['eng_ind'] = listing['desc_overview'].apply(detect_eng)

In [10]:
listing2 = listing[listing['eng_ind'] == True]

In [11]:
listing2['desc_overview_cleaned'] = listing2['desc_overview'].astype(str).apply(clean_verbatim)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listing2['desc_overview_cleaned'] = listing2['desc_overview'].astype(str).apply(clean_verbatim)


In [19]:
listing.shape

(36724, 33)

In [18]:
listing2.shape

(35205, 34)

#### get embeddings for listing description + neighborhood overview

In [12]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [13]:
desc_overview_embd = model.encode(list(listing2['desc_overview_cleaned']))

In [17]:
desc_overview_embd.shape

(35205, 384)

#### perform PCA to reduce dimension

In [20]:
desc_overview_embd200 = perform_pca(desc_overview_embd, 200)

In [21]:
desc_overview_embd200.shape

(35205, 200)

In [22]:
pd.DataFrame(desc_overview_embd200).to_csv("desc_overview_embd200.csv", index = False)

In [5]:
#desc_overview_embd100 = pd.read_csv("desc_overview_embd100.csv").to_numpy()

### cosine similarity

In [23]:
cossim = cosine_similarity(desc_overview_embd200)

In [25]:
cossim.shape

(35205, 35205)

In [43]:
desc_overview_sim_listing = get_most_similar_10listings(cossim, listing2)

In [45]:
desc_overview_sim_listing.to_csv("desc_overview_sim_listing.csv", index = False)