In [1]:
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
plt.style.use('seaborn-whitegrid')


### Classic Collaborative Filtering

The classic collaborative filtering technique finds similar users (those giving similar ratings) using measures such as cosine similarity: 

$$ cos(\vec{\mathbf{x}}, \vec{\mathbf{y}}) = \frac{\vec{\mathbf{x}} \cdot \vec{\mathbf{y}}}{\|\vec{\mathbf{x}}\| \|\vec{\mathbf{y}}\|} = \frac{\sum_{i=1}^n x_i y_i}{\sqrt{\sum_{i=1}^n x_i^2} \sqrt{\sum_{i=1}^n y_i^2}}  $$

Write a function that computes the pairwise similarities between all users. If $n$ is the number of users, create an $n \times n$ matrix $S \in \mathbb{R}^{n \times n}$ whose $(i,j)^{th}$ entry denotes the similarity between users $i$ and $j$.

In [2]:
# --------------------------------
# - Try e new algorithme  --------
# --------------------------------

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
import dvfdata
df=dvfdata.loadDVF_Maisons(departement='77',refresh_force=False,add_commune=False)
df_prepared=dvfdata.prepare_df(df,remove_categories=False)

Read Mutations
Read Local
Read Parcelle
Read Adresse
Make the join for DVF
Filter data:
Final Calculations
Prepare : filter extrem values
Prepare : drop geo categories
Prepare : update categories


In [5]:
df_prepared.dropna(axis=0, subset=['geolong'], inplace=True) # remove records which we do not know the geolong
df_prepared.dropna(axis=0, subset=['geolat'], inplace=True) # remove records which we do not know the geolat


In [6]:
df_prepared.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47413 entries, 1468388 to 1516439
Data columns (total 6 columns):
valeurfonc    47413 non-null int64
sterr         47413 non-null int64
geolong       47413 non-null float64
geolat        47413 non-null float64
nbpprinc      47413 non-null float64
sbati         47413 non-null float64
dtypes: float64(4), int64(2)
memory usage: 2.5 MB


In [7]:
X_df = df_prepared.drop(columns='valeurfonc')
y = df_prepared['valeurfonc']

columns = np.array(X_df.columns)
columns_geo = columns[(columns == 'geolong') | (columns == 'geolat')]
columns_notgeo=X_df.columns.drop(columns_geo) 

# Split data Train & Test
X_train, X_test, y_train, y_test = train_test_split(X_df, y, random_state=42)

In [8]:
print(columns_geo)
print(columns_notgeo)

['geolong' 'geolat']
Index(['sterr', 'nbpprinc', 'sbati'], dtype='object')


In [9]:
y_train.shape

(35559,)

In [10]:
#from sklearn.neighbors import KDTree
Geo_train=X_train[columns_geo]
Geo_test=X_test[columns_geo]
NotGeo_train=X_train[columns_notgeo]
NotGeo_test=X_test[columns_notgeo]

#tree = KDTree(Geo_train, leaf_size=30, metric='euclidean')
#Neighbors_dist, Neighbors_idx =tree.query(Geo_test, k=10, return_distance=True)

In [11]:
print(X_test.shape)

(11854, 5)


In [12]:
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
Similarities=cosine_similarity(X=NotGeo_test, Y=NotGeo_train, dense_output=True)
Similarities.shape

(11854, 35559)

In [32]:
from sklearn.metrics import pairwise_distances
Distances=pairwise_distances(X=Geo_test, Y=Geo_train
                            , metric='euclidean', n_jobs=-1)

In [33]:
Distances.shape

(11854, 35559)

In [34]:
Distances[1]

array([0.42819814, 0.3960114 , 0.44087986, ..., 0.24434465, 0.00860603,
       0.35758911])

In [36]:
print(Distances[1].sort())

None


In [18]:
Similarities[1]

array([0.98979886, 0.99808337, 0.99529517, ..., 0.88739048, 0.99181848,
       0.9978675 ])

In [19]:
Distances[1]+1

array([1.42819814, 1.3960114 , 1.44087986, ..., 1.24434465, 1.00860603,
       1.35758911])

In [21]:
np.multiply(Similarities[1],1/Distances[1]+1)

array([  3.30134289,   3.51842339,   3.25281533, ...,   4.519107  ,
       116.23868776,   3.78840972])

In [24]:
from tqdm.notebook import tqdm
from sklearn.neighbors import KDTree

from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity

class ChristopheGeoRegressor():
    def __init__(self,col_latitude="",col_longitude="",dist_weight=0.5,k=20):     
        self.col_latitude = col_latitude
        self.col_longitude = col_longitude
        self.k=k # How many Similar Neighbors we keep
        self.dist_weight=dist_weight # Weight 0<=w<=1 of the distance in the score to select similar neighbors

    def fit(self,X, y): # Train phase
        # From X, get columns lists
        self.columns = np.array(X.columns)
        self.columns_geo = self.columns[(self.columns == self.col_latitude) | (self.columns == self.col_longitude)]
        self.columns_notgeo=X.columns.drop(self.columns_geo) 
        self.X_train=X
        self.y_train=y
        return self
    
    def predict(self,X,loops):
        X_test=X
        print("start predict:")
        print("- compute Similarities matrix")
        self.Similarities=cosine_similarity(X=X_test[self.columns_notgeo]
                            , Y=self.X_train[self.columns_notgeo], dense_output=True)
        print("- compute Distances matrix")
        self.Distances=pairwise_distances(X=X_test[self.columns_geo], Y=self.X_train[self.columns_geo]
                            , metric='euclidean', n_jobs=-1)
        print("- compute Scores matrix")
        
        #self.Scores=np.multiply(np.add(  np.multiply(self.Similarities,1-self.dist_weight) \
        #                               , np.multiply(self.Distances, self.dist_weight) ) \
        #                        ,.5)
        self.Scores=np.multiply(self.Similarities,np.divide(1,(np.add(self.Distances,1))))
        
        print("- select best scores")
        test_records=X.shape[0]
        
        y_hat = np.zeros(test_records)
        selected_indexes=np.zeros((test_records,self.k))
        
        for i in tqdm(range(test_records), desc='Predict loop'):
            df_scores=pd.DataFrame(self.Scores[i])
            df_scores.columns = ['score']
            selected_scores=df_scores.sort_values(by=['score'], ascending=False)[:self.k]
            selected_metrics=y_train.iloc[selected_scores.index]
            
            selected_indexes[i]=selected_scores.index
            y_hat[i]=selected_metrics.mean()
            
        self.selected_indexes_=selected_indexes
        print("Predict:Done")
        
        return y_hat

In [25]:
my_model=ChristopheGeoRegressor(col_latitude="geolat",col_longitude="geolong",dist_weight=0.9,k=20)

In [26]:
my_model.fit(X_train,y_train)

<__main__.ChristopheGeoRegressor at 0x1b6c1e2ad0>

In [27]:
y_test_predict=my_model.predict(X_test,1)

start predict:
- compute Similarities matrix
- compute Distances matrix
- compute Scores matrix
- select best scores


HBox(children=(FloatProgress(value=0.0, description='Predict loop', max=11854.0, style=ProgressStyle(descripti…


Predict:Done


In [28]:
y_test_predict.shape

(11854,)

In [29]:
mae,mae_std,mape, mape_std,mse,mse_std,rmse,rmse_std = dvfdata.get_predict_errors(y=y_test, y_pred=y_test_predict)
print("------------ Scoring ------------------")
#print("Cross-Validation Accuracy: %0.2f (+/- %0.2f)" % (-cross_val_scores.mean(), cross_val_scores.std() * 2))
print("Price diff error MAE: %0.2f (+/- %0.2f)" % (mae, mae_std * 2))
print("Percent of Price error MAPE: %0.2f (+/- %0.2f)" % (mape, mape_std * 2))
print("Price error RMSE: %0.2f (+/- %0.2f)" % (rmse, rmse * 2))
print("---------------------------------------")

------------ Scoring ------------------
Price diff error MAE: 54852.76 (+/- 115676.77)
Percent of Price error MAPE: 43.48 (+/- 1337.69)
Price error RMSE: 79710.87 (+/- 159421.73)
---------------------------------------


In [40]:
mae,mae_std,mape, mape_std,mse,mse_std,rmse,rmse_std = dvfdata.get_predict_errors(y=y_test, y_pred=y_test_predict)
print("------------ Scoring ------------------")
#print("Cross-Validation Accuracy: %0.2f (+/- %0.2f)" % (-cross_val_scores.mean(), cross_val_scores.std() * 2))
print("Price diff error MAE: %0.2f (+/- %0.2f)" % (mae, mae_std * 2))
print("Percent of Price error MAPE: %0.2f (+/- %0.2f)" % (mape, mape_std * 2))
print("Price error RMSE: %0.2f (+/- %0.2f)" % (rmse, rmse * 2))
print("---------------------------------------")

------------ Scoring ------------------
Price diff error MAE: 127257.21 (+/- 191089.14)
Percent of Price error MAPE: 58.60 (+/- 707.49)
Price error RMSE: 159130.11 (+/- 318260.23)
---------------------------------------


In [35]:
mae,mae_std,mape, mape_std,mse,mse_std,rmse,rmse_std = dvfdata.get_predict_errors(y=y_test, y_pred=y_test_predict)
print("------------ Scoring ------------------")
#print("Cross-Validation Accuracy: %0.2f (+/- %0.2f)" % (-cross_val_scores.mean(), cross_val_scores.std() * 2))
print("Price diff error MAE: %0.2f (+/- %0.2f)" % (mae, mae_std * 2))
print("Percent of Price error MAPE: %0.2f (+/- %0.2f)" % (mape, mape_std * 2))
print("Price error RMSE: %0.2f (+/- %0.2f)" % (rmse, rmse * 2))
print("---------------------------------------")

------------ Scoring ------------------
Price diff error MAE: 126331.82 (+/- 187716.19)
Percent of Price error MAPE: 59.49 (+/- 780.79)
Price error RMSE: 157379.57 (+/- 314759.14)
---------------------------------------


In [30]:
mae,mae_std,mape, mape_std,mse,mse_std,rmse,rmse_std = dvfdata.get_predict_errors(y=y_test, y_pred=y_test_predict)
print("------------ Scoring ------------------")
#print("Cross-Validation Accuracy: %0.2f (+/- %0.2f)" % (-cross_val_scores.mean(), cross_val_scores.std() * 2))
print("Price diff error MAE: %0.2f (+/- %0.2f)" % (mae, mae_std * 2))
print("Percent of Price error MAPE: %0.2f (+/- %0.2f)" % (mape, mape_std * 2))
print("Price error RMSE: %0.2f (+/- %0.2f)" % (rmse, rmse * 2))
print("---------------------------------------")


------------ Scoring ------------------
Price diff error MAE: 127558.66 (+/- 189896.94)
Percent of Price error MAPE: 58.71 (+/- 705.58)
Price error RMSE: 159014.66 (+/- 318029.32)
---------------------------------------


In [24]:
scores[1]

NameError: name 'scores' is not defined

In [59]:
df=pd.DataFrame(scores[1])
df.columns = ['score']

In [60]:
print(df)

          score
0      0.354499
1      0.348524
2      0.359044
3      0.343679
4      0.268915
...         ...
35554  0.320567
35555  0.327195
35556  0.282934
35557  0.250106
35558  0.338864

[35559 rows x 1 columns]


In [66]:
selected_scores=df.sort_values(by=['score'], ascending=False)[:20]
#selected_scores.reset_index
selected_scores

Unnamed: 0,score
17775,0.424455
5120,0.423834
22401,0.423806
23886,0.423761
29114,0.423563
18574,0.423485
31254,0.423384
7857,0.423262
24391,0.423235
35552,0.423174


In [81]:
selected_metrics=y_train.iloc[selected_scores.index]

print("mean=",selected_metrics.mean())
print("std=",selected_metrics.std())


mean= 140120.0
std= 62912.584298834


In [83]:
y_test.iloc[1]

245400