Baseline Recommender System Notebook

In [117]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml import recommendation
import pandas as pd

In [4]:
df = pd.read_csv('../data/housing-data-new-test.csv')

In [5]:
df_fav = pd.read_csv('../data/favorites_test.csv')

In [2]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class Recommending:
    def __init__(self, n_clusters=30):
        '''Initializes the TFIDF Vectorizer and KMeans Obj'''

        self.n_clusters = n_clusters
        self.tfidf = TfidfVectorizer(stop_words='english', max_features=200)
        self.km = KMeans(n_clusters=self.n_clusters)
        self.results = {}


    def fit_transform(self, X):
        '''Fits and transforms TFIDF and fits KMeans.

        Params:
            X (array): Array of the descriptions of houses

        '''
        self.tfidf.fit(X)
        desc_tfidf = self.tfidf.transform(X)
        self.km.fit(desc_tfidf.todense())
        return desc_tfidf

    def cosine_sim(self,tfidf):
        ''' Creates a dictionary of the houses to consider based on cosine similarity.
        
        Params:
            tfidf (Tfidf object): object created in fit method
        '''
        
        cosine_similarities = cosine_similarity(tfidf,tfidf)
        for idx, row in df.iterrows():
            if idx < 2822:
                similar_indices = cosine_similarities[idx].argsort()[:-5:-1]
                similar_items = [(cosine_similarities[idx][i], i) for i in similar_indices]
                self.results[row['ID']] = similar_items[1:]
                
    def item(id):
        ''' Helper method for returning item in dataframe when looking for recommendations.
        
        Params:
            id (int): id of recommended house
        
        '''
        
        return df.loc[df.index == id][['ADDRESS', 'URL']].tolist()[0]

    
    def recommend(id, num):
        ''' Prints the recommendations for that house.
        
        Params:
            id (int): id of house that needs recommendations
            num (int): num of recommendations
        '''
        
        try:
            if (num == 0):
                print("Unable to recommend any house")
            elif (num==1):
                print("Recommending " + str(num) + " house similar to " + item(id))
            else :
                print("Recommending " + str(num) + " houses similar to " + item(id))

            print("----------------------------------------------------------")
            recs = results[id][:num]
            for rec in recs:
                print("You may also like to look at: " + item(rec[1]) + " (score:" + str(rec[0]) + ")")
        except IndexError:
            print('These are the houses most similar to your house')
            pass


    def result(self, df):
        '''Takes the df and builds a column with the labels for each house.

        Params:
            df (DataFrame): dataframe with all the housing data


        Returns:
            df (DataFrame): dataframe including new column for label

        '''
        df['LABEL'] = pd.Series(self.km.labels_)
        return df

    def predictions(self, df):
        '''Returns houses that are in the same clusters as their favorites.

        Params:
            df (DataFrame): entire dataframe with the favorites and the cluster labels

        Returns:
            pos (DataFrame): dataframe of houses that have similar descriptions 
                to those that they favorited

        '''
        list_of_rows = []
        possible_clusters = df[df['FAVORITED'] == 'Y']['LABEL'].unique()
        for idx, row in df.iterrows():
            if row['LABEL'] in possible_clusters and row['FAVORITED'] == 'N':
                list_of_rows.append(row)
        return pd.concat(list_of_rows)

def get_data(file, fave_file=None):
    '''Takes in a filename and returns it as a dataframe.


    Params:
        file (csv): file in csv format

    Returns:
        df (DataFrame): pandas dataframe of data from file
    '''
    df = pd.read_csv(file)
    df['FAVORITE'] = 'N'    
    if fave_file != None:
        df_faves = pd.read_csv(fave_file)
        for idx, row in df.iterrows():
            if row['ADDRESS'] in list(df_faves['ADDRESS']):
                df.loc[idx,'FAVORITE'] = 'Y'
    df.rename(columns={'$/SQUARE FEET': 'PRICE/SQUAREFT'})
    df['DESC'] = df['DESC'].fillna('No Description')
    df = df.fillna(0)
    df.drop(df[df['STATE'] != 'WA'].index, inplace=True)
    if 'Unnamed: 0' in df.columns:
        df.drop('Unnamed: 0', inplace=True, axis=1)
    df.drop_duplicates(inplace=True)
    return df

In [201]:
df = get_data('../data/housing-data-new-test.csv', '../data/favorites_test.csv')
cluster = Recommending()
tfidf = cluster.fit_transform(df.DESC.values)
df = cluster.result(df)
# preds = cluster.predictions(df)

In [203]:
df['DESC'].value_counts()

No Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          

In [190]:
tfidf_matrix = pd.concat([pd.DataFrame(tfidf.todense()),
                          df['PRICE'] / 1000, df['PRICE/SQUAREFT'] / 10], axis=1)

In [191]:
import numpy as np
tfidf_matrix = tfidf_matrix.fillna(0)

In [192]:
tfidf_matrix[tfidf_matrix.index == 20]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,192,193,194,195,196,197,198,199,PRICE,PRICE/SQUAREFT
20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.138766,0.0,0.0,0.0,0.0,0.0,0.0,0.0,325.0,29.0


Recommender Based on TFIDF Matrix and Cosine Similarity

In [193]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [194]:
df['ID'] = df.index
results = {}
for idx, row in df.iterrows():
    if idx < 2822:
        similar_indices = cosine_similarities[idx].argsort()[:-5:-1]
        similar_items = [(cosine_similarities[idx][i], i) for i in similar_indices]
        results[row['ID']] = similar_items[1:]

In [195]:
def item(id):
    return df.loc[df.index == id]['ADDRESS'].tolist()[0]
def recommend(id, num):
    try:
        if (num == 0):
            print("Unable to recommend any house")
        elif (num==1):
            print("Recommending " + str(num) + " house similar to " + item(id))
        else :
            print("Recommending " + str(num) + " houses similar to " + item(id))

        print("----------------------------------------------------------")
        recs = results[id][:num]
        for rec in recs:
            print("You may also like to look at: " + item(rec[1]) + " (score:" + str(rec[0]) + ")")
    except IndexError:
        print('These are the houses most similar to your house')
        pass

In [196]:
recommend(1905, 1)

These are the houses most similar to your house


Recommendations Based on Other Users

In [126]:
import numpy as np

In [123]:
users = pd.DataFrame()

fave = pd.read_csv('../data/favorites_test.csv')

fave['user_id'] = 1

df.index.rename('house_id', inplace=True)

fave['rating'] = 1

users = pd.concat([fave['user_id'], pd.Series(df[df['FAVORITE'] == 'Y'].index), fave['rating']], axis=1)

users = users.fillna(1)

users.to_csv('../data/users.csv')

In [127]:
def build_user_matrix(users, new_user):
    ''' Adds new user data to users dataframe
    
    Params:
        users (file): existing file for the users data
        new_user (file): df with favorited col updated for user
    '''
    users = pd.read_csv(users)
    df = get_data('../data/housing-data.csv', new_user)
    house_id = list(users['house_id'])
    ratings = list(users['rating'])
    user_id = list(users['user_id'])
    df['rating'] = df['FAVORITE'].apply(lambda x: 1 if x == 'Y' else 0)
    df['user_id'] = df['FAVORITE'].apply(lambda x: (users.user_id.max() + 1) if x == 'Y' else 0)
    df = df.drop(df[df['user_id'] == 0].index)
    user_id.extend(list(df['user_id']))
    house_id.extend(list(df[df['FAVORITE'] == 'Y'].index))
    ratings.extend(list(df['rating']))
    users = pd.concat([pd.Series(user_id), pd.Series(house_id), pd.Series(ratings)], axis=1)
    users = users.rename(columns={0:'user_id', 1:'house_id', 2:'rating'})
    users.to_csv('../data/users.csv')
    return users

In [128]:
build_user_matrix('../data/users.csv', '../data/redfin-favorites_erepp.csv')
build_user_matrix('../data/users.csv', '../data/redfin-favorites_repp-el.csv')
build_user_matrix('../data/users.csv', '../data/redfin-favorites_travels.csv')

Unnamed: 0,user_id,house_id,rating
0,1.0,840,1.0
1,1.0,999,1.0
2,1.0,1003,1.0
3,1.0,1043,1.0
4,1.0,1273,1.0
5,1.0,1331,1.0
6,1.0,1345,1.0
7,1.0,1463,1.0
8,2.0,231,1.0
9,2.0,238,1.0


In [129]:
users = pd.read_csv('../data/users.csv')

In [149]:
R_df = users.pivot(index = 'user_id', columns ='house_id', values = 'rating').fillna(0)

In [310]:
# Setup a SparkSession
spark = SparkSession.builder.getOrCreate()

spark_df = spark.createDataFrame(users)

df = spark_df.toPandas()

train, test = spark_df.randomSplit([0.8, 0.2], seed=427471138)

als_model = recommendation.ALS(
    itemCol='house_id',
    userCol='user_id',
    ratingCol='rating',
    nonnegative=True,    
    regParam=0.1,
    rank=10) 

recommender = als_model.fit(train)

In [311]:
rec = recommender.recommendForAllUsers(1)

In [312]:
item_features = recommender.itemFactors.toPandas()

user_features = recommender.userFactors.toPandas()

In [313]:
predictions = recommender.transform(train)

In [315]:
predictions.toPandas()

Unnamed: 0,user_id,house_id,rating,prediction
0,11.0,471,1.0,0.901173
1,14.0,471,1.0,0.901173
2,22.0,1829,1.0,0.900920
3,19.0,1829,1.0,0.900920
4,23.0,1829,1.0,0.900920
5,22.0,2142,1.0,0.900920
6,23.0,2142,1.0,0.900920
7,31.0,2366,1.0,0.900494
8,28.0,2366,1.0,0.900494
9,29.0,2366,1.0,0.900494
