Baseline Recommender System Notebook

In [3]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml import recommendation
import pandas as pd

In [77]:
spark_df = (spark.read.format("csv").options(header="true")
    .load("../data/housing-data-new-test.csv"))

In [78]:
df = spark_df.toPandas()

In [11]:
train, test = spark_df.randomSplit([0.8, 0.2], seed=427471138)

In [89]:
als_model = recommendation.ALS(
#     itemCol='1',
#     userCol='',
#     ratingCol='2',
    nonnegative=True,    
    regParam=0.1,
    rank=10) 

In [224]:
recommender = als_model.fit(train)

IllegalArgumentException: 'requirement failed: Column LABEL must be of type NumericType but was actually of type StringType.'

In [14]:
df = pd.read_csv('../data/housing-data-new-test.csv')

In [15]:
df_fav = pd.read_csv('../data/favorites_test.csv')

In [49]:
tfidf = TfidfVectorizer(stop_words='english', max_features=500)
km = KMeans(n_clusters=50)

tfidf.fit(df.DESC.values)
desc_tfidf = tfidf.transform(df.DESC.values)
km.fit(desc_tfidf.todense())

df['LABEL'] = pd.Series(km.labels_)

In [16]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

class Clustering:
    def __init__(self, n_clusters=30):
        '''Initializes the TFIDF Vectorizer and KMeans Obj'''

        self.n_clusters = n_clusters
        self.tfidf = TfidfVectorizer(stop_words='english', max_features=500)
        self.km = KMeans(n_clusters=self.n_clusters)


    def fit_transform(self, X):
        '''Fits and transforms TFIDF and fits KMeans.

        Params:
            X (array): Array of the descriptions of houses

        '''
        self.tfidf.fit(X)
        desc_tfidf = self.tfidf.transform(X)
        self.km.fit(desc_tfidf.todense())
        return desc_tfidf

    def result(self, df):
        '''Takes the df and builds a column with the labels for each house.

        Params:
            df (DataFrame): dataframe with all the housing data


        Returns:
            df (DataFrame): dataframe including new column for label

        '''
        df['LABEL'] = pd.Series(self.km.labels_)
        return df

    def predictions(self, df):
        '''Returns houses that are in the same clusters as their favorites.

        Params:
            df (DataFrame): entire dataframe with the favorites and the cluster labels

        Returns:
            pos (DataFrame): dataframe of houses that have similar descriptions 
                to those that they favorited

        '''
        list_of_rows = []
        possible_clusters = df[df['FAVORITED'] == 'Y']['LABEL'].unique()
        for idx, row in df.iterrows():
            if row['LABEL'] in possible_clusters and row['FAVORITED'] == 'N':
                list_of_rows.append(row)
        return pd.concat(list_of_rows)

def get_data(file, fave_file=None):
    '''Takes in a filename and returns it as a dataframe.


    Params:
        file (csv): file in csv format

    Returns:
        df (DataFrame): pandas dataframe of data from file
    '''
    df = pd.read_csv(file)
    df['FAVORITE'] = 'N'    
    if fave_file != None:
        df_faves = pd.read_csv(fave_file)
        for idx, row in df.iterrows():
            if row['ADDRESS'] in list(df_faves['ADDRESS']):
                df.loc[idx,'FAVORITE'] = 'Y'
    df.rename(columns={'$/SQUARE FEET': 'PRICE/SQUAREFT'})
    df['DESC'] = df['DESC'].fillna('No Description')
    df = df.fillna(0)
    df.drop(df[df['STATE'] != 'WA'].index, inplace=True)
    if 'Unnamed: 0' in df.columns:
        df.drop('Unnamed: 0', inplace=True, axis=1)
    df.drop_duplicates(inplace=True)
    return df

In [43]:
df = get_data('../data/housing-data.csv', '../data/favorites_test.csv')
cluster = Clustering()
tfidf = cluster.fit_transform(df.DESC.values)
df = cluster.result(df)
# preds = cluster.predictions(df)

AttributeError: 'KMeans' object has no attribute 'labels_'

In [18]:
matrix = pd.DataFrame(tfidf.todense())

In [123]:
users = pd.DataFrame()

fave = pd.read_csv('../data/favorites_test.csv')

fave['user_id'] = 1

df.index.rename('house_id', inplace=True)

fave['rating'] = 1

users = pd.concat([fave['user_id'], pd.Series(df[df['FAVORITE'] == 'Y'].index), fave['rating']], axis=1)

users = users.fillna(1)

users.to_csv('../data/users.csv')

In [126]:
import numpy as np

In [127]:
def build_user_matrix(users, new_user):
    ''' Adds new user data to users dataframe
    
    Params:
        users (file): existing file for the users data
        new_user (file): df with favorited col updated for user
    '''
    users = pd.read_csv(users)
    df = get_data('../data/housing-data.csv', new_user)
    house_id = list(users['house_id'])
    ratings = list(users['rating'])
    user_id = list(users['user_id'])
    df['rating'] = df['FAVORITE'].apply(lambda x: 1 if x == 'Y' else 0)
    df['user_id'] = df['FAVORITE'].apply(lambda x: (users.user_id.max() + 1) if x == 'Y' else 0)
    df = df.drop(df[df['user_id'] == 0].index)
    user_id.extend(list(df['user_id']))
    house_id.extend(list(df[df['FAVORITE'] == 'Y'].index))
    ratings.extend(list(df['rating']))
    users = pd.concat([pd.Series(user_id), pd.Series(house_id), pd.Series(ratings)], axis=1)
    users = users.rename(columns={0:'user_id', 1:'house_id', 2:'rating'})
    users.to_csv('../data/users.csv')
    return users

In [128]:
build_user_matrix('../data/users.csv', '../data/redfin-favorites_erepp.csv')
build_user_matrix('../data/users.csv', '../data/redfin-favorites_repp-el.csv')
build_user_matrix('../data/users.csv', '../data/redfin-favorites_travels.csv')

Unnamed: 0,user_id,house_id,rating
0,1.0,840,1.0
1,1.0,999,1.0
2,1.0,1003,1.0
3,1.0,1043,1.0
4,1.0,1273,1.0
5,1.0,1331,1.0
6,1.0,1345,1.0
7,1.0,1463,1.0
8,2.0,231,1.0
9,2.0,238,1.0


In [129]:
users = pd.read_csv('../data/users.csv')

In [149]:
R_df = users.pivot(index = 'user_id', columns ='house_id', values = 'rating').fillna(0)

In [150]:
R_df

house_id,1,17,23,35,36,57,60,64,80,82,...,2810,2811,2816,2819,2821,2828,2841,2862,2867,2885
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [153]:
# Setup a SparkSession
spark = SparkSession.builder.getOrCreate()

spark_df = spark.createDataFrame(users)

df = spark_df.toPandas()

train, test = spark_df.randomSplit([0.8, 0.2], seed=427471138)

als_model = recommendation.ALS(
    itemCol='house_id',
    userCol='user_id',
    ratingCol='rating',
    nonnegative=True,    
    regParam=0.1,
    rank=10) 

recommender = als_model.fit(train)

In [147]:
rec = recommender.recommendForAllUsers(1)

In [154]:
item_features = recommender.itemFactors.toPandas()

user_features = recommender.userFactors.toPandas()

In [157]:
predictions = recommender.transform(train)

In [138]:
price_sorted = df.sort_values(by=['PRICE', 'LABEL'], ascending=[False, False])

price_sorted['LABEL'] = price_sorted['LABEL'].fillna(0)

price_sorted.shape[0]/30

one = price_sorted[0:200].sample(150)
two = price_sorted[200:400].sample(150)
three = price_sorted[400:600].sample(150)
four = price_sorted[600:800].sample(150)
five = price_sorted[800:1000].sample(150)
six = price_sorted[1000:1200].sample(150)
seven = price_sorted[1200:1400].sample(150)
eight = price_sorted[1400:1600].sample(150)
nine = price_sorted[1600:1800].sample(150)
ten = price_sorted[1800:2000].sample(150)
el = price_sorted[2000:].sample(150)

import matplotlib.pyplot as plt
%matplotlib inline
fig, ax = plt.subplots(6)
ax[0]=df.plot.scatter('PRICE', 'LABEL')
ax[1]=df.plot.scatter('LATITUDE', 'LABEL')
ax[2]=df.plot.scatter('LONGITUDE', 'LABEL')
ax[3]=df.plot.scatter('BATHS', 'LABEL')
ax[4]=df.plot.scatter('BEDS', 'LABEL')
ax[5]=df[df['YEAR BUILT']!=0].plot.scatter('YEAR BUILT', 'LABEL')

samples = [one, two, three, four, five, six, seven, eight, nine, ten, el]

for i, item in enumerate(samples):
    for j in range(3):
        for idx in list(item.sample(frac=.3).index):
            item.loc[idx, 'FAVORITE'] = 'Y'
        item = item.drop(item[item['FAVORITE'] != 'Y'].index)
        item.to_csv('../data/favorites-{}-{}.csv'.format(i, j))

import os
list_of_files = []
directory = os.fsencode('/Users/elisereppond/galvanize/capstone/housing-recommender/data/')
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.startswith('favorites-'):
        list_of_files.append(filename)

for file in list_of_files:
    users = build_user_matrix('../data/users.csv', '../data/{}'.format(file))