In [None]:
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
# The seaborn style sheets were renamed 'seaborn-v0_8-*' in matplotlib 3.6 and the old
# names were removed afterwards, so use whichever name this installation provides
# (falling back to the default style if neither exists).
plt.style.use(next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default',
))


### Classic Collaborative Filtering

The classic collaborative filtering technique finds similar users (those giving similar ratings) using measures such as cosine similarity: 

$$ \cos(\vec{\mathbf{x}}, \vec{\mathbf{y}}) = \frac{\vec{\mathbf{x}} \cdot \vec{\mathbf{y}}}{\|\vec{\mathbf{x}}\| \|\vec{\mathbf{y}}\|} = \frac{\sum_{i=1}^n x_i y_i}{\sqrt{\sum_{i=1}^n x_i^2} \sqrt{\sum_{i=1}^n y_i^2}} $$

Write a function that computes the pairwise similarities between all users. If $n$ is the number of users, create an $n \times n$ matrix $S \in \mathbb{R}^{n \times n}$ whose $(i,j)^{th}$ entry denotes the similarity between users $i$ and $j$.

In [None]:
# --------------------------------
# - Try a new algorithm  ---------
# --------------------------------

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
import dvfdata
# Load the DVF records through the project's dvfdata helper, then run its preparation
# step on them.
# NOTE(review): the meaning of the flags (all departments, no forced refresh, no commune
# column, categories kept) is assumed from their names - confirm against dvfdata.
df = dvfdata.loadDVF_Maisons(
    departement='All',
    refresh_force=False,
    add_commune=False,
)
df_prepared = dvfdata.prepare_df(df, remove_categories=False)

In [None]:
# Keep only the records whose longitude AND latitude are both known. A single pass over
# both columns, rebinding the name instead of mutating the frame in place, so the cell
# stays idempotent and cannot trigger a SettingWithCopyWarning.
df_prepared = df_prepared.dropna(axis=0, subset=['geolong', 'geolat'])


In [None]:
# Target: the 'valeurfonc' column; features: every other column.
y = df_prepared['valeurfonc']
X_df = df_prepared.drop(columns='valeurfonc')

# Partition the feature names into the two coordinate columns and the rest.
columns = np.array(X_df.columns)
is_geo_column = np.isin(columns, ['geolong', 'geolat'])
columns_geo = columns[is_geo_column]
columns_notgeo = X_df.columns.drop(columns_geo)

# Hold out a test set (fixed seed so the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(
    X_df,
    y,
    random_state=42,
)

In [None]:
df_prepared.info()

In [None]:
print(columns_geo)
print(columns_notgeo)

In [None]:
y_train.shape

In [None]:
# Split both partitions into their coordinate columns and their remaining feature columns.
# (A KDTree over Geo_train - leaf_size=30, euclidean metric, k=10 query on Geo_test - was
# considered here as an alternative to the full pairwise distance matrix.)
Geo_train, Geo_test = X_train[columns_geo], X_test[columns_geo]
NotGeo_train, NotGeo_test = X_train[columns_notgeo], X_test[columns_notgeo]

In [None]:
print(X_test.shape)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Cosine similarity of every test record against every training record, computed on the
# non-geographic columns: rows follow NotGeo_test, columns follow NotGeo_train.
Similarities = cosine_similarity(
    X=NotGeo_test,
    Y=NotGeo_train,
    dense_output=True,
)
Similarities.shape

In [None]:
from sklearn.metrics import pairwise_distances
# Euclidean distance between the coordinates of every test record and every training
# record: rows follow Geo_test, columns follow Geo_train. n_jobs=-1 uses all cores.
Distances = pairwise_distances(
    X=Geo_test,
    Y=Geo_train,
    metric='euclidean',
    n_jobs=-1,
)

In [None]:
Distances.shape

In [None]:
Distances[1]

In [None]:
# np.sort returns a sorted copy. ndarray.sort() sorts in place and returns None, so the
# original printed "None" and reordered this row of Distances, breaking its column
# alignment with Similarities and the training records for every later cell.
print(np.sort(Distances[1]))

In [None]:
Similarities[1]

In [None]:
Distances[1]+1

In [None]:
# Score = similarity / (1 + distance). The parentheses matter: `1/Distances[1]+1` is
# evaluated as (1/distance) + 1, which divides by zero for co-located records and grows
# (instead of shrinking) the weight of near neighbours without bound.
np.divide(Similarities[1], 1 + Distances[1])

In [None]:
from tqdm.notebook import tqdm
from sklearn.neighbors import KDTree

from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity

class ChristopheGeoRegressor():
    """Neighbour-based regressor mixing feature similarity and geographic distance.

    Every record to predict is scored against every training record with

        score = (1 - dist_weight) * cosine_similarity - dist_weight * normalized_distance

    where the cosine similarity is computed on the non-geographic columns and the
    Euclidean distance on the latitude/longitude columns, divided by the largest
    distance of the batch so that both terms live on comparable scales. The prediction
    is the median target of the ``k`` best-scoring training records.

    Parameters
    ----------
    col_latitude, col_longitude : str
        Names of the coordinate columns in the frames given to ``fit`` and ``predict``.
    dist_weight : float
        Weight (0 <= w <= 1) of the distance term in the score.
    k : int
        How many best-scoring training records are kept for each prediction.
    """

    def __init__(self,col_latitude="",col_longitude="",dist_weight=0.5,k=20):
        self.col_latitude = col_latitude
        self.col_longitude = col_longitude
        self.k=k # How many similar neighbors we keep
        self.dist_weight=dist_weight # Weight 0<=w<=1 of the distance in the score to select similar neighbors

    def fit(self,X, y): # Train phase
        """Memorise the training data and split its columns into geo / non-geo.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features; must contain the two coordinate columns.
        y : array-like
            Training targets, aligned by position with the rows of ``X``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If a coordinate column is missing from ``X`` or if ``X`` and ``y`` do not
            have the same number of records.
        """
        missing = [name for name in (self.col_latitude, self.col_longitude)
                   if name not in X.columns]
        if missing:
            raise ValueError("coordinate column(s) not found in X: %s" % missing)
        if len(X) != len(y):
            raise ValueError("X and y must have the same number of records")

        # From X, get columns lists
        self.columns = np.array(X.columns)
        self.columns_geo = self.columns[(self.columns == self.col_latitude) | (self.columns == self.col_longitude)]
        self.columns_notgeo=X.columns.drop(self.columns_geo)
        self.X_train=X
        self.y_train=y
        return self

    def predict(self,X,loops=1):
        """Predict the target of every row of ``X``.

        Stores the intermediate matrices on the instance (``Similarities``,
        ``Distances``, ``Distance_Norm``, ``Scores``, each of shape
        ``(len(X), n_train)``) and the positions of the selected training records in
        ``selected_indexes_``.

        Parameters
        ----------
        X : pandas.DataFrame
            Records to predict, with the same columns as the training frame.
        loops : int, optional
            Unused; kept so that existing ``predict(X, 1)`` calls keep working.

        Returns
        -------
        numpy.ndarray
            One prediction per row of ``X``: the median target of the selected
            training records (missing targets are ignored).
        """
        test_records=X.shape[0]
        y_values = np.asarray(self.y_train, dtype=float)
        # Never ask for more neighbours than there are training records.
        n_neighbors = min(self.k, y_values.shape[0])
        if test_records == 0:
            # Nothing to score: np.nanmax below would fail on an empty matrix.
            self.selected_indexes_ = np.zeros((0, n_neighbors), dtype=int)
            return np.zeros(0)

        print("start predict:")
        print("- compute Similarities matrix")
        self.Similarities=cosine_similarity(X=X[self.columns_notgeo]
                            , Y=self.X_train[self.columns_notgeo], dense_output=True)
        print("- compute Distances matrix")
        self.Distances=pairwise_distances(X=X[self.columns_geo], Y=self.X_train[self.columns_geo]
                            , metric='euclidean', n_jobs=-1)
        print("- compute Normalized Distance (divide by the maximum value)")
        max_distance=np.nanmax(self.Distances)
        if max_distance > 0:
            self.Distance_Norm=np.divide(self.Distances,max_distance)
        else:
            # Every record sits at the same place: avoid a 0/0 division.
            self.Distance_Norm=self.Distances.copy()

        print("- compute Scores matrix")
        # Use the NORMALIZED distance so that it is on a scale comparable with the cosine
        # similarity; with raw distances dist_weight would not behave as a 0..1 weight.
        self.Scores=np.subtract(  np.multiply(self.Similarities,1-self.dist_weight) \
                                , np.multiply(self.Distance_Norm, self.dist_weight) )

        print("- select best scores")
        y_hat = np.zeros(test_records)
        selected_indexes=np.zeros((test_records,n_neighbors), dtype=int)

        # Report progress roughly every percent; at least 1 so small batches do not
        # trigger a modulo by zero.
        progress_step=max(1, test_records//100)

        for i in range(test_records):
            if i % progress_step ==0:
                print(100*i//test_records, end=" ")
            # Positions of the best scores, highest first (stable sort: ties keep the
            # training order, NaN scores go last).
            best=np.argsort(-self.Scores[i], kind="stable")[:n_neighbors]
            selected_indexes[i]=best
            # Targets come from the data given to fit(), not from a notebook global.
            y_hat[i]=np.nanmedian(y_values[best])
        print()

        self.selected_indexes_=selected_indexes
        print("Predict:Done")

        return y_hat

In [None]:
my_model=ChristopheGeoRegressor(col_latitude="geolat",col_longitude="geolong",dist_weight=0.5,k=20)

In [None]:
my_model.fit(X_train,y_train)

In [None]:
y_test_predict=my_model.predict(X_test,1)

In [None]:
my_model.Distance_Norm.shape

In [None]:
my_model.Similarities.shape

In [None]:
# Error report for the test-set predictions made above (model built with k=20, prediction
# = median target of the selected neighbours).
# NOTE(review): the *_std values are assumed to be standard deviations of each error
# metric - confirm against dvfdata.get_predict_errors.
mae,mae_std,mape, mape_std,mse,mse_std,rmse,rmse_std = dvfdata.get_predict_errors(y=y_test, y_pred=y_test_predict)
print("------------ Scoring ------------------")
print("Price diff error MAE: %0.2f (+/- %0.2f)" % (mae, mae_std * 2))
print("Percent of Price error MAPE: %0.2f (+/- %0.2f)" % (mape, mape_std * 2))
# The spread of the RMSE is rmse_std, like the other metrics; `rmse * 2` was a typo.
print("Price error RMSE: %0.2f (+/- %0.2f)" % (rmse, rmse_std * 2))
print("---------------------------------------")

In [None]:
# Scatter plot of the predicted target against the true target for the test set,
# with the headline error metrics in the title.
model_name = "Christophe Regressor"
plot_title = '%s, MAE=%.2f, RMSE=%.2f' % (model_name, mae, rmse)

f, ax0 = plt.subplots(nrows=1, ncols=1, sharey=True)
ax0.scatter(y_test, y_test_predict, s=0.5)
ax0.set_xlabel('True Target')
ax0.set_ylabel('Target predicted')
ax0.set_title(plot_title)

In [None]:
y_test_predict.shape

In [None]:
scores[1]

In [None]:
df=pd.DataFrame(scores[1])
df.columns = ['score']

In [None]:
print(df)

In [None]:
selected_scores=df.sort_values(by=['score'], ascending=False)[:20]
#selected_scores.reset_index
selected_scores

In [None]:
selected_metrics=y_train.iloc[selected_scores.index]

print("mean=",selected_metrics.mean())
print("std=",selected_metrics.std())


In [None]:
y_test.iloc[1]