In [1]:
%matplotlib inline

from __future__ import print_function
from apiclient.discovery import build
from httplib2 import Http
from oauth2client import file, client, tools
import googlemaps  # pip install -U googlemaps
from geopy.distance import geodesic
import requests
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#sklearn libraries
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor

In [174]:
user_evaluation = pd.read_csv('../survey/User_Evaluation_20180727.csv')

def user_input(userID):
    property_list = ['20174233022047', '20174151021003', '20174216015034', \
                     '20178706014038', '20174156016063', '20174339007052', \
                     '20172841038054', '20172728020017', '20177564020019', '20174230008070']
    property_list = list(map(int, property_list))
    
    input = user_evaluation.iloc[userID,17:30]
    ranks = user_evaluation.iloc[userID,30:-1]
    ranks = ranks.astype(int)
    
    property_ranks = pd.concat([pd.Series(property_list).reset_index(), \
                                ranks.reset_index()], axis=1, ignore_index=True)
    property_ranks = property_ranks.loc[:,[1,3]]
    property_ranks.columns = ['rowID', 'user_ranks']

    LA = user_evaluation.iloc[userID,-1]
    return [input, property_ranks, LA]

In [262]:
def read_data():
    # read original dataset and standardized dataset
    df_transform = pd.read_csv('Model_output/df_transform_July2016.csv')
    df = pd.read_csv('../Simple Model with Integrated Dataframe/df_integrated_0714.csv')
    property_list = ['20174233022047', '20174151021003', '20174216015034', \
                  '20178706014038', '20174156016063', '20174339007052', \
                  '20172841038054', '20172728020017', '20177564020019', '20174230008070']
    property_list = list(map(int, property_list))
    df = df[df['rowID'].isin(property_list)] 
    propertyIndex = dict(zip(property_list,range(len(property_list))))
    df['survey_order'] = df['rowID'].map(propertyIndex) + 1       
    df_transform = df_transform.iloc[df.index, :]
    df_features = ['zip_rank', 'SQFTmain', 'Units', 'Bedrooms', 'EffectiveYearBuilt', \
                    'house', 'condo', 'pud', 'pool', 'HH_Kids', 'Owner', 'lot_area', \
                    'num_school_choices', 'closest_school_rating', 'geography', 'parks', 'groceries', 'walkscore', 'transit_score']
    personal_features = ['SQFTmain', 'lot_area', 'EffectiveYearBuilt', \
                         'closest_school_rating', 'HH_Kids', 'Owner', 'crime', 'geography', \
                         'walkscore', 'transit_score',  'closest_schools', 'groceries', 'parks']
    return [df, df_transform, df_features, personal_features]

In [296]:
def model_output_ranks(model, userID, price_wt):
    user_numbers = user_input(userID)[0]
    user_numbers = list(map(int, user_numbers))
    user_scores = list(np.array(user_numbers[0:3])/sum(user_numbers[0:3]) * 100) + \
                        list(np.array(user_numbers[3:8])/sum(user_numbers[3:8]) * 100) + \
                        list(np.array(user_numbers[8:13])/sum(user_numbers[8:13]) * 100)
        
    def rf_score(price_wt, df, personal_wt):
        rf_score = price_wt * df['TotalValue']
        df_feature_score = pd.read_csv('Model_output/df_feature_score.csv')
        for feature in personal_wt.keys():
            rf_score += df_feature_score[feature] * personal_wt[feature]
        return rf_score

    def personal_listings_rf(price_wt, personal_scores):
        personal_dict = {}
        personal_wt = {}

        df, df_transform, df_features, personal_features = read_data()

        for i in range(len(personal_scores)):
            personal_dict[personal_features[i]] = personal_scores[i]
        for f in df_features:
            if f in personal_dict.keys():
                personal_wt[f] = personal_dict[f] / 100
            else:
                personal_wt[f] = 0

        rf_scores = rf_score(price_wt, df_transform, personal_wt)

        for i in df.index:
            df.loc[i, 'rf_scores'] = rf_scores[i]

        return df.sort_values(by=['rf_scores'], ascending=False)
    
    def poly_personal(data, personal_wt, row):
        poly_coef = np.loadtxt('Model_output/poly_coef.out', delimiter=',')
        total = 0
        features = data.columns
        feature_number = len(features)
        sq_indices = [feature_number + 1]
        for i in range(feature_number, 1, -1):
            sq_indices.append(sq_indices[-1] + i)
        for i in range(feature_number):
            total += poly_coef[i + 1] * data.loc[row,features[i]] * personal_wt[i]
            total += poly_coef[sq_indices[i]] * (data.loc[row,features[i]] ** 2) * (personal_wt[i] ** 2)
            for j in range(i+1, feature_number):
                total += poly_coef[sq_indices[i] + j - i] * data.loc[row,features[i]] * data.loc[row,features[j]] * personal_wt[i] * personal_wt[j]
        return total

    def personal_listings_poly(price_wt, personal_scores):
        personal_dict = {}
        personal_wt = []

        df, df_transform, df_features, personal_features = read_data()

        for i in range(len(personal_scores)):
            if personal_scores[i] != 0:
                personal_dict[personal_features[i]] = personal_scores[i]
            personal_dict[personal_features[i]] = personal_scores[i]     
        for f in df_features:
            if f in personal_dict.keys():
                personal_wt.append(personal_dict[f] / 100)
            else:
                personal_wt.append(0)

        X = df_transform[df_features]
        
        for i in df.index:
            df.loc[i, 'poly_scores'] = price_wt * df_transform.loc[i, 'TotalValue'] + 10**(-8) * poly_personal(X, np.array(personal_wt), i)
        return df.sort_values(by=['poly_scores'], ascending=False)


    def get_rf_top10(price_wt):
        personal_df = personal_listings_rf(price_wt, user_scores).reset_index()
        personal_df['model_ranks'] = personal_df['rf_scores'].rank(ascending=False).astype(int)
        return personal_df
    
    def get_poly_top10(price_wt):
        personal_df = personal_listings_poly(price_wt, user_scores).reset_index()
        personal_df['model_ranks'] = personal_df['poly_scores'].rank(ascending=False).astype(int)
        return personal_df
    
    def get_ensemble_top10(price_wt):
        df_poly = get_poly_top10(1.9)
        df_rf = get_rf_top10(price_wt)
        personal_df = pd.merge(left = df_rf, right=df_poly, left_on = 'rowID', right_on = 'rowID')
        personal_df['combined_ranks'] = personal_df['model_ranks_x'] + personal_df['model_ranks_y']
        personal_df['model_ranks'] = personal_df['combined_ranks'].rank(ascending=True).astype(int)
        personal_df['survey_order'] = personal_df['survey_order_x']
        return personal_df
    
    if model == 'rf':
        return get_rf_top10(price_wt)[['rowID', 'survey_order', 'model_ranks']]
    if model == 'poly':
        return get_poly_top10(price_wt)[['rowID', 'survey_order', 'model_ranks']]
    if model == 'ensemble':
        return get_ensemble_top10(price_wt)[['rowID', 'survey_order', 'model_ranks']]
    else:
        print ("please enter 'rf', 'poly', or 'ensemble'")

In [None]:
### Checking the property order is correct ###
# user_evaluation = pd.read_csv('../smartRE/survey/User_Evaluation_20180727.csv')
# df = pd.read_csv('../smartRE/sandbox/Simple Model with Integrated Dataframe/df_integrated_0714.csv')      
# property_list = ['20174233022047', '20174151021003', '20174216015034', \
#                      '20178706014038', '20174156016063', '20174339007052', \
#                      '20172841038054', '20172728020017', '20177564020019', '20174230008070']
# property_list = list(map(int, property_list))
# df_miniset = df[df['rowID'].isin(property_list)]  
# propertyIndex = dict(zip(property_list,range(len(property_list))))
# df_miniset['original_order'] = df_miniset['rowID'].map(propertyIndex) + 1
# df_miniset.sort_values(by = 'original_order',ascending = True)

In [265]:
def compare(model, userID, price_wt):
    df_merge = pd.merge(left = model_output_ranks(model, userID, price_wt), \
                        right = user_input(userID)[1], left_on='rowID', right_on='rowID')
    df_merge = df_merge.set_index('rowID')
    return df_merge.sort_values(by=['survey_order'], ascending=True)

In [134]:
def calculate_accuracy(model, userID, price_wt):
    df_compare = compare(model, userID, price_wt)
    correct_pair = 0
    correct_top = 0
    total_pair = 0
    rowID = sorted(list(df_compare.index))
    for i in rowID:
        for j in rowID:
            if i >= j:
                continue
            else:
                total_pair += 1
                if (df_compare.loc[i, 'model_ranks'] - df_compare.loc[j, 'model_ranks']) * \
                (df_compare.loc[i, 'user_ranks'] - df_compare.loc[j, 'user_ranks']) > 0:
                    correct_pair += 1
    pair_accuracy = correct_pair / total_pair
    
#     model_top3 = model_ranks[model_ranks.isin([1,2,3])]
#     for i in model_top3.index:
#         if (user_ranks.loc[i] <= 5).bool():
#             correct_top += 1
    user_top3 = df_compare[df_compare['user_ranks'] <= 3]
    for i in user_top3.index:
        if df_compare.loc[i, 'model_ranks'] <= 5:
            correct_top += 1
    top_accuracy = correct_top / 5
 
    return (pair_accuracy, top_accuracy)

In [None]:
# # price_wts = np.arange(0,2.0,0.5)
# price_wts = [0.01, 0.02]
# plt.figure(figsize=(10,40))
# count = 0
# for userID in tqdm(range(2,24)):
#     ax = plt.subplot(8, 3, count + 1)
#     plt.setp(ax, xticks=(price_wts), yticks=(np.arange(0,1.0,0.1)))
#     plt.plot(price_wts, calculate_accuracy('rf', userID, price_wts)[0])
#     plt.ylim([0,1.0])
#     count += 1
# plt.show()

In [62]:
def evaluate_models(model, price_wts):
    total_pair_accuracy, user_pair_accuracy, best_pair_accuracy, best_pair_wt = {}, {}, {}, {}
    total_top_accuracy, user_top_accuracy, best_top_accuracy, best_top_wt = {}, {}, {}, {}
    
    for price_wt in tqdm(price_wts):
        total_pair_accuracy[price_wt], total_top_accuracy[price_wt] = 0, 0
        
        for userID in range(2,len(user_evaluation)):
            pair_accuracy, top_accuracy = calculate_accuracy(model, userID, price_wt)
            total_pair_accuracy[price_wt] += pair_accuracy
            total_top_accuracy[price_wt] += top_accuracy
            user_pair_accuracy[(price_wt,userID)] = pair_accuracy
            user_top_accuracy[(price_wt,userID)] = top_accuracy
            
            if userID in best_pair_accuracy.keys():
                if pair_accuracy > best_pair_accuracy[userID]:
                    best_pair_accuracy[userID] = pair_accuracy
                    best_pair_wt[userID] = price_wt
            else:
                best_pair_accuracy[userID] = pair_accuracy
                best_pair_wt[userID] = price_wt
                
            if userID in best_top_accuracy.keys():
                if top_accuracy > best_top_accuracy[userID]:
                    best_top_accuracy[userID] = top_accuracy
                    best_top_wt[userID] = price_wt
            else:
                best_top_accuracy[userID] = top_accuracy
                best_top_wt[userID] = price_wt
        total_pair_accuracy[price_wt] = total_pair_accuracy[price_wt] / (len(user_evaluation) - 2)
        total_top_accuracy[price_wt] = total_top_accuracy[price_wt] / (len(user_evaluation) - 2)
        
    avg_pair_acc = max(list(total_pair_accuracy.values()))
    avg_top_acc = max(list(total_top_accuracy.values()))
    best_pair_wt_num = list(total_pair_accuracy.keys())[np.argmax(list(total_pair_accuracy.values()))]
    best_top_wt_num = list(total_top_accuracy.keys())[np.argmax(list(total_top_accuracy.values()))]

    
    LA, user_pair_acc, pair_best, pair_wt, user_top_acc, top_best, top_wt = [], [], [], [], [], [], []
    for userID in range(2,len(user_evaluation)):
        LA.append(user_input(userID)[2])
        user_pair_acc.append(user_pair_accuracy[(best_pair_wt_num, userID)])
        pair_best.append(best_pair_accuracy[userID])
        pair_wt.append(best_pair_wt[userID])
        
        user_top_acc.append(user_top_accuracy[(best_top_wt_num, userID)])
        top_best.append(best_top_accuracy[userID])
        top_wt.append(best_top_wt[userID])

    df_LA = pd.concat([pd.Series(LA), pd.Series(user_pair_acc), pd.Series(pair_best), pd.Series(pair_wt), \
                       pd.Series(user_top_acc), pd.Series(top_best), pd.Series(top_wt)], axis=1)
    df_LA.columns = ['LA', 'user_pair_accuracy', 'best_pair_accuracy', 'best_pair_wt', \
                     'user_top_accuracy', 'best_top_accuracy', 'best_top_wt']

    return [best_pair_wt_num, avg_pair_acc, best_top_wt_num, avg_top_acc, df_LA]

## EVALUATE MODELS

### Random Forest

In [182]:
price_wts = np.arange(0,2.0,0.05)

In [183]:
best_pair_wt, pair_acc, best_top_wt, top_acc, df_LA = evaluate_models('rf', price_wts)



  0%|          | 0/40 [00:00<?, ?it/s][A[A

  2%|▎         | 1/40 [00:06<04:28,  6.90s/it][A[A

  5%|▌         | 2/40 [00:13<04:10,  6.59s/it][A[A

  8%|▊         | 3/40 [00:20<04:18,  7.00s/it][A[A

 10%|█         | 4/40 [00:27<04:10,  6.96s/it][A[A

 12%|█▎        | 5/40 [00:34<04:03,  6.94s/it][A[A

 15%|█▌        | 6/40 [00:41<03:55,  6.94s/it][A[A

 18%|█▊        | 7/40 [00:48<03:46,  6.87s/it][A[A

 20%|██        | 8/40 [00:54<03:38,  6.84s/it][A[A

 22%|██▎       | 9/40 [01:01<03:32,  6.85s/it][A[A

 25%|██▌       | 10/40 [01:08<03:24,  6.82s/it][A[A

 28%|██▊       | 11/40 [01:15<03:18,  6.83s/it][A[A

 30%|███       | 12/40 [01:21<03:10,  6.81s/it][A[A

 32%|███▎      | 13/40 [01:28<03:04,  6.84s/it][A[A

 35%|███▌      | 14/40 [01:35<02:57,  6.82s/it][A[A

 38%|███▊      | 15/40 [01:41<02:49,  6.78s/it][A[A

 40%|████      | 16/40 [01:47<02:41,  6.75s/it][A[A

 42%|████▎     | 17/40 [01:54<02:34,  6.72s/it][A[A

 45%|████▌     | 18/40 [02

In [184]:
print ('best pair wt =', best_pair_wt)
print ('avg pair accuracy =', pair_acc)
print ('best top wt =', best_top_wt)
print ('avg top accuracy =', top_acc)
print ('LA local pair accuracy =', df_LA[df_LA['LA']=='Yes']['user_pair_accuracy'].mean())
print ('Non-LA   pair accuracy =', df_LA[df_LA['LA']=='No']['user_pair_accuracy'].mean())
print ('LA local top accuracy =', df_LA[df_LA['LA']=='Yes']['user_top_accuracy'].mean())
print ('Non-LA   top accuracy =', df_LA[df_LA['LA']=='No']['user_top_accuracy'].mean())

best pair wt = 1.95
avg pair accuracy = 0.5919191919191918
best top wt = 1.6
avg top accuracy = 0.40000000000000013
LA local pair accuracy = 0.7361111111111112
Non-LA   pair accuracy = 0.5095238095238094
LA local top accuracy = 0.475
Non-LA   top accuracy = 0.35714285714285715


In [185]:
df_LA

Unnamed: 0,LA,user_pair_accuracy,best_pair_accuracy,best_pair_wt,user_top_accuracy,best_top_accuracy,best_top_wt
0,No,0.577778,0.666667,0.1,0.4,0.4,0.0
1,Yes,0.8,0.822222,0.1,0.6,0.6,0.05
2,No,0.4,0.577778,0.0,0.2,0.4,0.0
3,No,0.466667,0.466667,1.95,0.2,0.2,0.0
4,Yes,0.733333,0.755556,0.15,0.4,0.4,0.0
5,No,0.666667,0.711111,0.0,0.6,0.6,0.15
6,No,0.4,0.6,0.0,0.4,0.4,0.1
7,No,0.333333,0.733333,0.0,0.2,0.4,0.0
8,No,0.644444,0.644444,1.95,0.6,0.6,0.15
9,Yes,0.866667,0.866667,1.5,0.6,0.6,0.7


### Polynomial Regression

In [252]:
price_wts = np.arange(-2.0,2.0,0.1)

In [253]:
best_pair_wt, pair_acc, best_top_wt, top_acc, df_LA = evaluate_models('poly', price_wts)

100%|██████████| 40/40 [04:20<00:00,  6.50s/it]


In [254]:
print ('best pair wt =', best_pair_wt)
print ('avg pair accuracy =', pair_acc)
print ('best top wt =', best_top_wt)
print ('avg top accuracy =', top_acc)
print ('LA local pair accuracy =', df_LA[df_LA['LA']=='Yes']['user_pair_accuracy'].mean())
print ('Non-LA   pair accuracy =', df_LA[df_LA['LA']=='No']['user_pair_accuracy'].mean())
print ('LA local top accuracy =', df_LA[df_LA['LA']=='Yes']['user_top_accuracy'].mean())
print ('Non-LA   top accuracy =', df_LA[df_LA['LA']=='No']['user_top_accuracy'].mean())

best pair wt = 1.9
avg pair accuracy = 0.5575757575757575
best top wt = 1.9
avg top accuracy = 0.34545454545454557
LA local pair accuracy = 0.6027777777777777
Non-LA   pair accuracy = 0.5317460317460316
LA local top accuracy = 0.375
Non-LA   top accuracy = 0.3285714285714286


In [255]:
df_LA

Unnamed: 0,LA,user_pair_accuracy,best_pair_accuracy,best_pair_wt,user_top_accuracy,best_top_accuracy,best_top_wt
0,No,0.577778,0.666667,-0.8,0.4,0.6,-1.3
1,Yes,0.711111,0.711111,1.6,0.6,0.6,1.5
2,No,0.266667,0.4,-2.0,0.0,0.4,-2.0
3,No,0.577778,0.666667,-0.9,0.4,0.4,0.5
4,Yes,0.333333,0.333333,0.8,0.2,0.2,0.1
5,No,0.644444,0.733333,0.1,0.6,0.6,0.1
6,No,0.488889,0.577778,-0.8,0.4,0.4,-2.0
7,No,0.422222,0.511111,-1.4,0.2,0.4,-2.0
8,No,0.622222,0.622222,1.3,0.6,0.6,0.1
9,Yes,0.711111,0.711111,1.3,0.4,0.4,0.6


### Ensemble

In [301]:
price_wts = np.arange(0,2.0,0.05)

In [302]:
best_pair_wt, pair_acc, best_top_wt, top_acc, df_LA = evaluate_models('ensemble', price_wts)

100%|██████████| 40/40 [08:58<00:00, 13.46s/it]


In [303]:
print ('best pair wt =', best_pair_wt)
print ('avg pair accuracy =', pair_acc)
print ('best top wt =', best_top_wt)
print ('avg top accuracy =', top_acc)
print ('LA local pair accuracy =', df_LA[df_LA['LA']=='Yes']['user_pair_accuracy'].mean())
print ('Non-LA   pair accuracy =', df_LA[df_LA['LA']=='No']['user_pair_accuracy'].mean())
print ('LA local top accuracy =', df_LA[df_LA['LA']=='Yes']['user_top_accuracy'].mean())
print ('Non-LA   top accuracy =', df_LA[df_LA['LA']=='No']['user_top_accuracy'].mean())

best pair wt = 0.85
avg pair accuracy = 0.51010101010101
best top wt = 1.1
avg top accuracy = 0.3545454545454547
LA local pair accuracy = 0.5583333333333333
Non-LA   pair accuracy = 0.48253968253968255
LA local top accuracy = 0.325
Non-LA   top accuracy = 0.3714285714285715


In [304]:
df_LA

Unnamed: 0,LA,user_pair_accuracy,best_pair_accuracy,best_pair_wt,user_top_accuracy,best_top_accuracy,best_top_wt
0,No,0.577778,0.666667,0.4,0.4,0.4,0.0
1,Yes,0.533333,0.577778,1.3,0.4,0.4,0.3
2,No,0.288889,0.422222,0.0,0.2,0.4,0.1
3,No,0.444444,0.577778,0.15,0.4,0.4,0.15
4,Yes,0.488889,0.511111,1.55,0.2,0.2,0.15
5,No,0.577778,0.6,0.0,0.6,0.6,0.2
6,No,0.488889,0.533333,0.0,0.4,0.4,0.35
7,No,0.311111,0.577778,0.2,0.2,0.4,0.0
8,No,0.533333,0.555556,0.2,0.6,0.6,0.0
9,Yes,0.711111,0.711111,0.85,0.4,0.4,0.15


### Visualize Ranks

In [311]:
pd.concat([compare('rf', 2, 1.6), compare('rf', 10, 1.6)], axis = 1)

Unnamed: 0_level_0,survey_order,model_ranks,user_ranks,survey_order,model_ranks,user_ranks
rowID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
20174233022047,1,6,7,1,6,8
20174151021003,2,3,9,2,3,9
20174216015034,3,8,3,3,8,4
20178706014038,4,4,2,4,4,10
20174156016063,5,5,1,5,5,2
20174339007052,6,2,5,6,2,1
20172841038054,7,10,4,7,10,7
20172728020017,8,1,6,8,1,3
20177564020019,9,9,8,9,9,6
20174230008070,10,7,10,10,7,5
