Baseline Recommender System Notebook

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml import recommendation
import pandas as pd

In [77]:
spark_df = (spark.read.format("csv").options(header="true")
    .load("../data/housing-data-new-test.csv"))

In [78]:
df = spark_df.toPandas()

In [11]:
train, test = spark_df.randomSplit([0.8, 0.2], seed=427471138)

In [89]:
als_model = recommendation.ALS(
#     itemCol='1',
#     userCol='',
#     ratingCol='2',
    nonnegative=True,    
    regParam=0.1,
    rank=10) 

In [224]:
recommender = als_model.fit(train)

IllegalArgumentException: 'requirement failed: Column LABEL must be of type NumericType but was actually of type StringType.'

In [2]:
df = pd.read_csv('../data/housing-data-new-test.csv')

In [3]:
df_fav = pd.read_csv('../data/favorites_test.csv')

In [105]:
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer



class Clustering:
    def __init__(self, n_clusters=30):
        '''Initializes the TFIDF Vectorizer and KMeans Obj'''

        self.n_clusters = n_clusters
        self.tfidf = TfidfVectorizer(stop_words='english', max_features=500)
        self.km = KMeans(n_clusters=self.n_clusters)


    def fit_transform(self, X):
        '''Fits and transforms TFIDF and fits KMeans.

        Params:
            X (array): Array of the descriptions of houses

        '''
        self.tfidf.fit(X)
        desc_tfidf = self.tfidf.transform(X)
        return desc_tfidf
#         self.km.fit(desc_tfidf.todense())


    def result(self, df):
        '''Takes the df and builds a column with the labels for each house.

        Params:
            df (DataFrame): dataframe with all the housing data


        Returns:
            df (DataFrame): dataframe including new column for label

        '''
        df['LABEL'] = pd.Series(self.km.labels_)
        return df

    def predictions(self, df):
        '''Returns houses that are in the same clusters as their favorites.

        Params:
            df (DataFrame): entire dataframe with the favorites and the cluster labels

        Returns:
            pos (DataFrame): dataframe of houses that have similar descriptions 
                to those that they favorited

        '''
        list_of_rows = []
        possible_clusters = df[df['FAVORITED'] == 'Y']['LABEL'].unique()
        for idx, row in df.iterrows():
            if row['LABEL'] in possible_clusters and row['FAVORITED'] == 'N':
                list_of_rows.append(row)
        return pd.concat(list_of_rows)



def get_data(file, fave_file=None):
    '''Takes in a filename and returns it as a dataframe.


    Params:
        file (csv): file in csv format

    Returns:
        df (DataFrame): pandas dataframe of data from file
    '''
    df = pd.read_csv(file)
    df['FAVORITE'] = 'N'    
    if fave_file != None:
        df_faves = pd.read_csv(fave_file)
        for idx, row in df.iterrows():
            if row['ADDRESS'] in list(df_faves['ADDRESS']):
                df.loc[idx,'FAVORITE'] = 'Y'
    df.rename(columns={'$/SQUARE FEET': 'PRICE/SQUAREFT'})
    df['DESC'] = df['DESC'].fillna('No Description')
    df = df.fillna(0)
    df.drop(df[df['STATE'] != 'WA'].index, inplace=True)
    if 'Unnamed: 0' in df.columns:
        df.drop('Unnamed: 0', inplace=True, axis=1)
    df.drop_duplicates(inplace=True)
    return df

In [93]:
df['SALE TYPE'] = df['SALE TYPE'].apply(lambda x: 1 if x == 'MLS Listing' or x == 'For-Sale-by-Owner Listing' else 0)

In [94]:
df['SALE TYPE'].value_counts()

1    2590
0     232
Name: SALE TYPE, dtype: int64

In [90]:
df.columns

Index(['SALE TYPE', 'SOLD DATE', 'PROPERTY TYPE', 'ADDRESS', 'CITY', 'STATE',
       'ZIP', 'PRICE', 'BEDS', 'BATHS', 'LOCATION', 'SQUARE FEET', 'LOT SIZE',
       'YEAR BUILT', 'DAYS ON MARKET', '$/SQUARE FEET', 'HOA/MONTH', 'STATUS',
       'NEXT OPEN HOUSE START TIME', 'NEXT OPEN HOUSE END TIME', 'URL',
       'SOURCE', 'MLS#', 'FAVORITE', 'INTERESTED', 'LATITUDE', 'LONGITUDE',
       'DESC'],
      dtype='object')

In [106]:
df = get_data('../data/housing-data.csv', '../data/favorites_test.csv')
cluster = Clustering()
tfidf = cluster.fit_transform(df.DESC.values)
# df = cluster.result(df)
# preds = cluster.predictions(df)

In [85]:
matrix = pd.DataFrame(tfidf.todense())

In [100]:
users = pd.DataFrame()

In [101]:
fave = pd.read_csv('../data/favorites_test.csv')

In [184]:
fave['user_id'] = 1

In [107]:
fave

Unnamed: 0,SALE TYPE,HOME TYPE,ADDRESS,CITY,STATE,ZIP,LIST PRICE,BEDS,BATHS,LOCATION,...,URL (SEE http://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING),SOURCE,LISTING ID,ORIGINAL SOURCE,FAVORITE,INTERESTED,LATITUDE,LONGITUDE,IS SHORT SALE,user_id
0,MLS Listing,Single Family Residential,10336 Wallingford Ave N,Seattle,WA,98133,1499950,5,4.5,Licton Springs,...,http://www.redfin.com/WA/Seattle/10336-Walling...,Northwest Multiple Listing Service (NWMLS),1189373,"RE/MAX Metro Realty, Inc.",Y,Y,47.704363,-122.335341,False,1
1,MLS Listing,Condo/Co-op,939 N 101st St #303,Seattle,WA,98133,330000,1,1.0,Greenwood,...,http://www.redfin.com/WA/Seattle/939-N-101st-S...,Northwest Multiple Listing Service (NWMLS),1275356,Best Choice Realty LLC,Y,Y,47.701992,-122.345518,False,1
2,MLS Listing,Single Family Residential,14548 35th Ave NE,Lake Forest Park,WA,98155,684990,3,2.25,Lake Forest Park,...,http://www.redfin.com/WA/Lake-Forest-Park/1454...,Northwest Multiple Listing Service (NWMLS),1278442,Pulte Homes of Washington Inc,Y,Y,47.733993,-122.291057,False,1
3,MLS Listing,Single Family Residential,1800 NE 96th St,Seattle,WA,98115,499950,2,1.0,Maple Leaf,...,http://www.redfin.com/WA/Seattle/1800-NE-96th-...,Northwest Multiple Listing Service (NWMLS),1280565,Redfin Corp.,Y,Y,47.699028,-122.308573,False,1
4,MLS Listing,Single Family Residential,340 NE 53rd St,Seattle,WA,98105,749950,4,1.75,Wallingford,...,http://www.redfin.com/WA/Seattle/340-NE-53rd-S...,Northwest Multiple Listing Service (NWMLS),1283898,Redfin Corp.,Y,Y,47.667392,-122.323787,False,1
5,MLS Listing,Single Family Residential,345 N 101st St,Seattle,WA,98133,849000,4,3.0,Greenwood,...,http://www.redfin.com/WA/Seattle/345-N-101st-S...,Northwest Multiple Listing Service (NWMLS),1284605,Keller Williams Downtown Sea,Y,Y,47.701971,-122.353408,False,1
6,MLS Listing,Single Family Residential,805 NE 98th St,Seattle,WA,98115,775000,3,2.0,Maple Leaf,...,http://www.redfin.com/WA/Seattle/805-NE-98th-S...,Northwest Multiple Listing Service (NWMLS),1285275,RE/MAX Northwest Realtors,Y,Y,47.70018,-122.319861,False,1


In [109]:
df.index.rename('house_id', inplace=True)

In [306]:
fave['rating'] = 1

In [307]:
users = pd.concat([fave['user_id'], pd.Series(df[df['FAVORITE'] == 'Y'].index), fave['rating']], axis=1)

In [308]:
users = users.fillna(1)

In [309]:
users

Unnamed: 0,user_id,house_id,rating
0,1.0,840,1.0
1,1.0,999,1.0
2,1.0,1003,1.0
3,1.0,1043,1.0
4,1.0,1273,1.0
5,1.0,1331,1.0
6,1.0,1345,1.0
7,1.0,1463,1.0


In [310]:
users.to_csv('../data/users.csv')

In [146]:
df[df['FAVORITE'] == 'Y']

Unnamed: 0_level_0,SALE TYPE,SOLD DATE,PROPERTY TYPE,ADDRESS,CITY,STATE,ZIP,PRICE,BEDS,BATHS,...,NEXT OPEN HOUSE START TIME,NEXT OPEN HOUSE END TIME,URL,SOURCE,MLS#,FAVORITE,INTERESTED,LATITUDE,LONGITUDE,DESC
house_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
840,MLS Listing,0.0,Single Family Residential,14548 35th Ave NE,Lake Forest Park,WA,98155,684990.0,3,2.25,...,0,0,http://www.redfin.com/WA/Lake-Forest-Park/1454...,NWMLS,1278442,Y,Y,47.733993,-122.291057,Introducing 12 Degrees North a collection of 1...
999,MLS Listing,0.0,Single Family Residential,340 NE 53rd St,Seattle,WA,98105,749950.0,4,1.75,...,0,0,http://www.redfin.com/WA/Seattle/340-NE-53rd-S...,NWMLS,1283898,Y,Y,47.667392,-122.323787,"Perched above the street, enjoy Mt. Rainier vi..."
1003,MLS Listing,0.0,Single Family Residential,1800 NE 96th St,Seattle,WA,98115,499950.0,2,1.0,...,0,0,http://www.redfin.com/WA/Seattle/1800-NE-96th-...,NWMLS,1280565,Y,Y,47.699028,-122.308573,Invalid URL
1043,MLS Listing,0.0,Single Family Residential,805 NE 98th St,Seattle,WA,98115,775000.0,3,2.0,...,0,0,http://www.redfin.com/WA/Seattle/805-NE-98th-S...,NWMLS,1285275,Y,Y,47.70018,-122.319861,Welcome home to your beautiful 3 bed/2 bath ho...
1273,MLS Listing,0.0,Single Family Residential,345 N 101st St,Seattle,WA,98133,849000.0,4,3.0,...,0,0,http://www.redfin.com/WA/Seattle/345-N-101st-S...,NWMLS,1284605,Y,Y,47.701971,-122.353408,Beautifully updated 2 bed/2 bath condo in grea...
1331,MLS Listing,0.0,Single Family Residential,14548 35th Ave NE,Lake Forest Park,WA,98155,684990.0,3,2.25,...,0,0,http://www.redfin.com/WA/Lake-Forest-Park/1454...,NWMLS,1278442,Y,Y,47.733993,-122.291057,Exceptional 4 bedroom Bitter Lake one level co...
1345,MLS Listing,0.0,Condo/Co-op,939 N 101st St #303,Seattle,WA,98133,330000.0,1,1.0,...,0,0,http://www.redfin.com/WA/Seattle/939-N-101st-S...,NWMLS,1275356,Y,Y,47.701992,-122.345518,Imagine stepping onto your terrace overlooking...
1463,MLS Listing,0.0,Single Family Residential,10336 Wallingford Ave N,Seattle,WA,98133,1499950.0,5,4.5,...,0,0,http://www.redfin.com/WA/Seattle/10336-Walling...,NWMLS,1189373,Y,Y,47.704363,-122.335341,No Description


In [79]:
matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.109952,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.109242,0.0,0.0,0.0,0.133652,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.128775,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.115833,...,0.0,0.0,0.0,0.0,0.0,0.0,0.130961,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.164144,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [198]:
import numpy as np

In [158]:
users.user_id.max() + 1

2.0

In [311]:
def build_user_matrix(users, new_user):
    ''' Adds new user data to users dataframe
    
    Params:
        users (file): existing file for the users data
        new_user (file): df with favorited col updated for user
    '''
    users = pd.read_csv(users)
    df = get_data('../data/housing-data.csv', new_user)
    house_id = list(users['house_id'])
    ratings = list(users['rating'])
    user_id = list(users['user_id'])
    df['rating'] = df['FAVORITE'].apply(lambda x: 1 if x == 'Y' else 0)
    df['user_id'] = df['FAVORITE'].apply(lambda x: (users.user_id.max() + 1) if x == 'Y' else 0)
    df = df.drop(df[df['user_id'] == 0].index)
    user_id.extend(list(df['user_id']))
    house_id.extend(list(df[df['FAVORITE'] == 'Y'].index))
    ratings.extend(list(df['rating']))
    users = pd.concat([pd.Series(user_id), pd.Series(house_id), pd.Series(ratings)], axis=1)
    users = users.rename(columns={0:'user_id', 1:'house_id', 2:'rating'})
    users.to_csv('../data/users.csv')
    return users

In [314]:
build_user_matrix('../data/users.csv', '../data/redfin-favorites_travels.csv')

Unnamed: 0,user_id,house_id,rating
0,1.0,840,1.0
1,1.0,999,1.0
2,1.0,1003,1.0
3,1.0,1043,1.0
4,1.0,1273,1.0
5,1.0,1331,1.0
6,1.0,1345,1.0
7,1.0,1463,1.0
8,2.0,231,1.0
9,2.0,238,1.0


In [339]:
matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,house_id
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.109952,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.000000,...,0.000000,0.0,0.000000,0.133652,0.000000,0.000000,0.000000,0.000000,0.000000,1
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.128775,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.115833,...,0.000000,0.0,0.000000,0.000000,0.000000,0.130961,0.000000,0.000000,0.000000,3
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.164144,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,4
5,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,5
6,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.330856,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,6
7,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.000000,...,0.000000,0.0,0.000000,0.000000,0.219135,0.000000,0.000000,0.000000,0.000000,7
8,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.22249,0.000000,0.0,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.141118,0.000000,0.000000,0.000000,8
9,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.136612,...,0.000000,0.0,0.000000,0.000000,0.000000,0.154454,0.000000,0.000000,0.000000,9


In [96]:
user_matrix = df[[]]

In [99]:
'item_id' = item_matrix.index


True

In [330]:
len(item_matrix['house_id'])

1413822

In [None]:
item_matrix