## INFOH515 Pyspark code
## Author: Gianluca Bontempi
## PySpark implementation of the nearest-neighbour recommendation in the INFOH515 slides "Map-reduce analytics"

In [2]:
import numpy as np
import pwd
import getpass
import os
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, sum

# Build the SparkSession (or pick up the one that is already running) and
# keep a handle on its SparkContext, which the RDD cells below rely on.
spark = (
    SparkSession.builder
    .appName('s.com')
    .getOrCreate()
)
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/19 16:27:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:



## Seed NumPy's global generator so that Restart & Run All regenerates the
## same ratings (and therefore the same nearest neighbours) on every run.
np.random.seed(515)

U=1000  ## number of users
P=150  ## number of products

## Ratings[i, j] is the rate given by user i to product j, drawn uniformly
## from 0..5; rate=0 corresponds to missing data
Ratings = np.random.choice(np.arange(6), size=(U, P), replace=True)

## The new user rates every product with a value in 1..5 ...
NewUser=np.random.choice(np.arange(1,6), size=(1, P), replace=True).flatten()

## ... except product 2, which is blanked out: this is the rate to predict
NewUser[2]=0

## Broadcast variable through which the Spark tasks read the new user's ratings
broadcastNewUser = sc.broadcast(NewUser)


In [None]:
# One RDD record per user: (user index, that user's row of ratings).
rdd = sc.parallelize(list(enumerate(Ratings)))

In [None]:
# Peek at the first two (user index, ratings row) records of the RDD.
rdd.take(2)

## User NN

In [None]:
def distance(u1,u2):
    """Mean absolute rating difference between a stored user and the query user.

    Parameters
    ----------
    u1 : array-like
        Ratings of a stored user; entries <= 0 are treated as missing.
    u2 : array-like
        Ratings of the query user. Its first 0 entry marks the product whose
        rating is to be predicted; that product is excluded from the distance.

    Returns
    -------
    float
        Mean of |u1 - u2| over the products rated by ``u1`` other than the
        query product, or ``inf`` when there is no such product (so that a
        user with nothing in common can never be chosen as nearest neighbour).

    Raises
    ------
    IndexError
        If ``u2`` contains no 0 entry.
    """
    u1 = np.asarray(u1).ravel()
    u2 = np.asarray(u2).ravel()
    # flatnonzero returns a 1-D index array, so [0] is a genuine scalar and
    # int() does not depend on NumPy's deprecated 1-element-array conversion.
    query = int(np.flatnonzero(u2 == 0)[0])
    rated = u1 > 0          # drop all missing rates of the stored user
    rated[query] = False    # never compare on the product being predicted
    if not rated.any():
        # np.mean of an empty selection would be NaN, and NaN compares False
        # against everything, which can let it win a min-reduction.
        return float('inf')
    # Subtract as floats so unsigned integer ratings cannot wrap around.
    return np.mean(np.abs(u1[rated].astype(float) - u2[rated].astype(float)))

# Index of the product the new user has not rated (the one to predict).
# flatnonzero(...)[0] is a genuine scalar, so int() does not rely on NumPy's
# deprecated conversion of a 1-element array.
q=int(np.flatnonzero(broadcastNewUser.value==0)[0])
# Only users who actually rated product q can supply a prediction.
rddf=rdd.filter(lambda u: u[1][q]>0)
# Each record becomes (distance to the new user, this user's rate for q, user index).
rdd2=rddf.map(lambda u: ( distance(u[1],broadcastNewUser.value), u[1][q], u[0] ))


In [None]:
def redf(x,y):
    """Keep the closer of two ``(distance, rating, user)`` triples.

    Used as the binary operator of an RDD ``reduce``, which requires it to be
    commutative and associative. Ties on the distance are therefore broken by
    the smaller user index, so the result does not depend on the order in
    which Spark combines partitions. A NaN distance is ranked as infinitely
    far, so it never beats a real distance.

    Parameters
    ----------
    x, y : tuple
        ``(distance, rating, user)`` triples.

    Returns
    -------
    tuple
        A new 3-tuple copied from whichever argument wins.
    """
    def _rank(t):
        d = t[0]
        # d != d is True only for NaN
        return (float('inf') if d != d else d, t[2])

    if _rank(x) <= _rank(y):
        return (x[0],x[1],x[2])
    return (y[0],y[1],y[2])
    
# Fold every (distance, rating, user) triple down to the single closest one.
NN = rdd2.reduce(redf)

print("Nearest neighbour user=", NN[2], "distance=", NN[0], "pred=", NN[1])

### Check with in-memory operations

In [None]:
# Recompute the user nearest neighbour with plain NumPy to cross-check Spark.
# Users that are not eligible keep an infinite distance so they cannot win.
dist=np.full((U,1), np.inf)
for i in range(U):
    # The Spark pipeline filters out users who did not rate product q;
    # apply the same rule here, otherwise the check could pick a neighbour
    # that Spark never considered and report pred=0 (a missing rate).
    if not Ratings[i,q]>0:
        continue
    I=np.flatnonzero(Ratings[i,:]!=0)  ## products rated by user i
    I=I[I!=q]                          ## never compare on the query product
    if I.size:
        dist[i]=np.mean(np.abs(Ratings[i,I]-NewUser[I]))
print("Nearest neighbour user=",np.argmin(dist),"distance=",np.min(dist),"pred=",Ratings[np.argmin(dist),q] )

In [None]:
# Show the selected neighbour's ratings next to the new user's, and
# recompute their distance locally as a sanity check.
u = NN[2]
print(
    'NN=', u,
    'Ratings NN=', Ratings[u, :],
    'Ratings NewUser=', NewUser,
    distance(Ratings[u, :], NewUser),
)


## Product NN

In [106]:
def mapf(x,u):
    """Emit, for one user, the per-product gaps to the query product.

    Parameters
    ----------
    x : tuple
        ``(user index, ratings)`` record; a rating of 0 means "missing".
    u : array-like
        Ratings of the new user; its first 0 entry identifies the query
        product (the one whose rating is to be predicted).

    Returns
    -------
    list
        One ``(j, (|ratings[j] - ratings[query]|, 1))`` pair for every
        product ``j`` other than the query that this user rated. The trailing
        1 is a counter so that a later sum-by-key yields (total, count).

    Raises
    ------
    IndexError
        If ``u`` contains no 0 entry.
    """
    # The number of products comes from the record itself rather than from a
    # notebook-level global, so the function works on any ratings length.
    ratings = np.asarray(x[1]).ravel()
    # flatnonzero(...)[0] is a genuine scalar; int() on it avoids NumPy's
    # deprecated conversion of a 1-element array.
    query = int(np.flatnonzero(np.asarray(u).ravel() == 0)[0])
    query_rating = ratings[query]
    # NOTE(review): users whose own rate for the query product is 0 (missing)
    # still contribute here; the in-memory check behaves the same way.
    # Confirm whether that is intended.
    return [
        (j, (np.abs(ratings[j] - query_rating), 1))
        for j in np.flatnonzero(ratings != 0)
        if j != query
    ]

In [107]:
# Expand every user record into its (product, (gap, 1)) pairs.
rdd3 = rdd.flatMap(
    lambda record: mapf(record, broadcastNewUser.value)
)

In [108]:
# Per product, add up the gaps and the counters component-wise.
rdd4 = rdd3.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)


In [111]:
def redf(x,y):
    """Keep the closer of two ``(product, distance)`` pairs.

    Used as the binary operator of an RDD ``reduce``, which requires it to be
    commutative and associative. Ties on the distance are therefore broken by
    the smaller product index, so the result does not depend on the order in
    which Spark combines partitions.

    Parameters
    ----------
    x, y : tuple
        ``(product, distance)`` pairs.

    Returns
    -------
    tuple
        A new 2-tuple copied from whichever argument wins.
    """
    # NOTE: this reuses the name of the reducer defined for the user-based
    # search; re-running that earlier cell after this one would pick up this
    # version instead.
    if (x[1], x[0]) <= (y[1], y[0]):
        return (x[0],x[1])
    return (y[0],y[1])

# Turn each product's (total, count) into a mean gap, then keep the closest product.
sol = rdd4.mapValues(lambda acc: acc[0]/acc[1]).reduce(redf)

print("Nearest neighbour product=", sol[0], 'distance=', sol[1], 'prediction=', NewUser[sol[0]])


Nearest neighbour product= 766 distance= 1.7579963789981894


### Check with in-memory operations

In [112]:
# Recompute the product nearest neighbour with plain NumPy to cross-check Spark.
# Every entry starts at the sentinel 100; the query product keeps it.
dist = np.full((P, 1), 100.0)
for j in range(P):
    if j == q:
        continue
    # users that gave product j a non-missing rate
    raters = np.flatnonzero(Ratings[:, j] != 0)
    dist[j] = np.mean(np.abs(Ratings[raters, j] - Ratings[raters, q]))

print("Nearest neighbour product=", np.argmin(dist), 'distance=', np.min(dist), 'prediction=', NewUser[np.argmin(dist)])

Nearest neighbour product= 766 distance= 1.7579963789981894
