In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
from scipy.spatial import distance
from scipy.stats import stats

from sklearn.model_selection import train_test_split


In [None]:
# Load the "::"-delimited ratings file.  Columns 0, 1 and 2 are used below as
# the two id columns and the rating column.
# NOTE(review): `files` is not defined in any visible cell; it must already
# exist in the kernel -- confirm where it comes from.
# NOTE(review): no `header=None` is passed, so pandas consumes the first line
# of the file as column names -- confirm the file really has a header row.
# A multi-character separator is only supported by the python engine; naming
# it explicitly avoids the ParserWarning raised by the implicit fallback.
data = pd.read_csv(files[1], delimiter="::", engine="python").to_numpy()
# Fixed seed so the train/test split (and everything derived from it) is
# reproducible across kernel restarts.
trX, tX, trY, tY = train_test_split(data[:, :2], data[:, 2], random_state=42)
# The largest id in each column is used as the matrix dimension; cast to a
# plain int so it is always a valid array shape.
users = int(np.max(data[:, 0]))
items = int(np.max(data[:, 1]))

In [None]:
# Dense (users x items) matrix of the training ratings; cells that are never
# assigned keep their initial value of 0.
dataset = np.zeros((users, items))
for (user_id, item_id), rating in zip(trX, trY):
    # ids are shifted down by one to become row / column indices
    dataset[user_id - 1, item_id - 1] = rating

In [None]:
class pip:
    """Pairwise row similarity built from proximity, impact and popularity factors.

    ``data`` is a 2-D array.  ``simi`` compares every pair of rows
    (``data.shape[0]`` of them); the column means are kept in ``r_avg_items``.
    ``r_max`` and ``r_min`` are taken over every cell of ``data``.

    NOTE(review): if missing ratings are stored as 0 they take part in
    ``r_min``/``r_med`` and in every per-column factor -- confirm this is
    intended.
    """

    def __init__(self, data):
        self.data = data
        self.r_max = np.max(data)
        self.r_min = np.min(data)
        self.r_range = self.r_max - self.r_min
        # Midpoint of the observed value range; used to decide agreement.
        self.r_med = (self.r_max + self.r_min) / 2
        self.r_avg_items = np.mean(data, axis=0)
        # Element-wise versions of the scalar factor methods.  The output dtype
        # is pinned explicitly: without ``otypes`` np.vectorize infers it from
        # the result for the first element only, so an int first result (e.g.
        # popularity returning 1) would truncate every later float result.
        self.agg = np.vectorize(self.agreement, otypes=[int])
        self.dist = np.vectorize(self.distance, otypes=[float])
        self.prox = np.vectorize(self.proximity, otypes=[float])
        self.pop = np.vectorize(self.popularity, otypes=[float])
        self.im = np.vectorize(self.impact, otypes=[float])

    def agreement(self, r1, r2):
        """Return 1 if both ratings lie strictly on the same side of ``r_med``, else 0."""
        both_above = r1 > self.r_med and r2 > self.r_med
        both_below = r1 < self.r_med and r2 < self.r_med
        return 1 if (both_above or both_below) else 0

    def distance(self, r1, r2, k):
        """Absolute rating difference; doubled when the ratings disagree (``k`` falsy)."""
        if k:
            return abs(r1 - r2)
        return abs(2 * (r1 - r2))

    def proximity(self, r1, r2, d, k):
        """Proximity factor ``((2 * r_range + 1) - d) ** 2`` for a distance ``d``.

        ``r1``, ``r2`` and ``k`` are accepted so the signature matches the other
        factors; only ``d`` is used.
        """
        # The whole difference is squared (not just ``d``), so the factor is
        # never negative and shrinks as the distance grows.
        return ((2 * self.r_range + 1) - d) ** 2

    def impact(self, r1, r2, k):
        """Impact factor: product of the (offset) distances of both ratings from ``r_med``.

        Returned as-is when the ratings agree (``k`` truthy) and as its
        reciprocal when they disagree.
        """
        strength = (abs(r1 - self.r_med) + 1) * (abs(r2 - self.r_med) + 1)
        if k:
            return strength
        return 1 / strength

    def popularity(self, r1, r2, r_avg_i, k):
        """Popularity factor relative to the column average ``r_avg_i``.

        ``1 + (mean(r1, r2) - r_avg_i) ** 2`` when ``k`` is truthy, else 1.
        """
        if k:
            return 1 + (((r1 + r2) / 2) - r_avg_i) ** 2
        return 1

    def PIP(self, u1, u2):
        """Return the similarity of two rows: sum over columns of proximity * impact * popularity."""
        k = self.agg(u1, u2)
        d = self.dist(u1, u2, k)
        px = self.prox(u1, u2, d, k)
        i = self.im(u1, u2, k)
        pp = self.pop(u1, u2, self.r_avg_items, k)
        return np.sum(px * i * pp)

    def simi(self):
        """Fill ``self.sim`` with the symmetric row-by-row similarity matrix and return it."""
        users = self.data.shape[0]
        self.sim = np.zeros((users, users))
        for i in range(users):
            # Only the upper triangle is computed; the result is mirrored.
            for j in range(i, users):
                self.sim[i, j] = self.PIP(self.data[i, :], self.data[j, :])
                self.sim[j, i] = self.sim[i, j]
        return self.sim


In [None]:
class mpip:
    """MPIP similarity between the rows of ``data``.

    For every column a proximity, an impact and a popularity factor are
    multiplied, and the products are summed over the columns (see ``MPIP``).
    ``simi`` applies this to every pair of rows.

    NOTE(review): ``median``, ``rmax`` and ``rmin`` are computed over every
    cell of ``data``; if missing ratings are stored as 0 they are included --
    confirm this is intended.
    """

    def __init__(self, data):
        self.data = data
        self.median = np.median(data)
        self.rmax = np.max(data)
        self.rmin = np.min(data)
        # Medians of the values strictly above / strictly below the overall
        # median.  (Appending each selection to itself, as was done before,
        # does not change its median.)
        # NOTE(review): if no value lies on one side, that median is NaN and
        # propagates into ``proximity``.
        self.med_p = np.median(data[data > self.median])
        self.med_m = np.median(data[data < self.median])
        self.r_avg_items = np.mean(data, axis=0)
        # Element-wise versions of the scalar factor methods.
        self.dist = np.vectorize(self.distance)
        self.prox = np.vectorize(self.proximity)
        self.pop = np.vectorize(self.popularity)
        self.im = np.vectorize(self.impact)
        self.agg = np.vectorize(self.agreement)
        # Midpoint of the observed value range; used to decide agreement.
        self.r_med = (self.rmax + self.rmin) / 2

    def agreement(self, r1, r2):
        """Return 1 if both ratings lie strictly on the same side of ``r_med``, else 0."""
        both_above = r1 > self.r_med and r2 > self.r_med
        both_below = r1 < self.r_med and r2 < self.r_med
        return 1 if (both_above or both_below) else 0

    def distance(self, r1, r2):
        """Absolute difference of the two ratings."""
        return abs(r1 - r2)

    def proximity(self, k, d):
        """Proximity factor for agreement flag ``k`` and distance ``d``.

        With agreement the distance is compared with the mean of ``med_m`` and
        ``med_p``; otherwise the inverse distance is scaled by 0.75 / 0.5 / 0.25
        depending on whether ``d`` is above, equal to or below ``median``.
        """
        span = self.rmax - self.rmin
        if k:
            return ((d - ((self.med_m + self.med_p) / 2)) / span) ** 2
        # NOTE(review): the 1/d term below divides by zero when k is falsy and
        # d == 0 (both ratings equal to r_med) -- confirm that cannot occur.
        if d > self.median:
            weight = 0.75
        elif d == self.median:
            weight = 0.5
        else:
            weight = 0.25
        return weight * (((1 / d) / span) ** 2)

    def impact(self, r1, r2, k):
        """Impact factor from the (offset) distances of both ratings to ``r_med``.

        ``exp(-1 / strength)`` with agreement, ``1 / strength`` otherwise.
        """
        strength = (abs(r1 - self.r_med) + 1) * (abs(r2 - self.r_med) + 1)
        if k:
            return math.e ** -(1 / strength)
        return 1 / strength

    def popularity(self, r1, r2, k, rI):
        """Popularity factor relative to the column average ``rI``.

        ``log10(2 + (mean(r1, r2) - rI) ** 2)`` with agreement, else the
        constant 0.3010.
        """
        if k:
            return math.log10(2 + (((r1 + r2) / 2) - rI) ** 2)
        return 0.3010

    def MPIP(self, u1, u2):
        """Return the similarity of two rows: sum over columns of proximity * impact * popularity."""
        k = self.agg(u1, u2)
        d = self.dist(u1, u2)
        px = self.prox(k, d)
        i = self.im(u1, u2, k)
        pp = self.pop(u1, u2, k, self.r_avg_items)
        return np.sum(px * i * pp)

    def simi(self):
        """Fill ``self.sim`` with the symmetric row-by-row similarity matrix and return it."""
        users = self.data.shape[0]
        self.sim = np.zeros((users, users))
        for i in range(users):
            # Only the upper triangle is computed; the result is mirrored.
            for j in range(i, users):
                self.sim[i, j] = self.MPIP(self.data[i], self.data[j])
                self.sim[j, i] = self.sim[i, j]
        # Returned as well as stored, matching pip.simi.
        return self.sim

In [None]:
class Cosine:
    """Cosine similarity between every pair of rows of ``data``, stored in ``sim``."""

    def __init__(self, data):
        # pairwise_distances yields cosine *distance*; similarity is 1 minus it.
        cosine_distance = pairwise_distances(data, metric="cosine")
        self.sim = 1 - cosine_distance
class Jaccard:
    """Jaccard similarity between every pair of rows of ``data``, stored in ``sim``.

    Each row is reduced to the set of columns holding a non-zero value; the
    similarity of two rows is |intersection| / |union| of those sets.
    """

    def __init__(self, data):
        n_rows = data.shape[0]
        # Compare presence masks rather than raw values: scipy's handling of
        # non-boolean input to jaccard differs between versions.
        present = np.asarray(data) != 0
        self.sim = np.zeros([n_rows, n_rows])
        for i in range(n_rows):
            # Only the upper triangle is computed; the result is mirrored.
            for j in range(i, n_rows):
                # distance.jaccard is a dissimilarity; store 1 - distance so
                # that larger values mean "more similar", as in Cosine.sim.
                self.sim[i, j] = 1 - distance.jaccard(present[i], present[j])
                self.sim[j, i] = self.sim[i, j]
class pearson:
    """Pearson correlation between every pair of rows of ``data``, stored in ``sim``."""

    def __init__(self, data):
        # Correlate the data rows themselves (the loop indices were being
        # passed to pearsonr before).  np.corrcoef treats each row as one
        # variable and returns the full symmetric matrix in a single call;
        # atleast_2d keeps the result a matrix when ``data`` has one row.
        # Rows with zero variance yield NaN entries.
        self.sim = np.atleast_2d(np.corrcoef(data))
        

In [None]:
class rating_pred:
    """Predict test ratings from a rating matrix and precomputed user similarities.

    Parameters
    ----------
    matrix : 2-D array
        Rating matrix indexed ``[user, item]``; a value of 0 is treated as
        "not rated".
    test : iterable of rows
        Each row starts with a 1-based user id and a 1-based item id.
    pip, mpip, cosine, jaccard, pearson : 2-D array or 0
        User-by-user similarity matrices; only the ones whose prediction
        method (``p``, ``mp``, ``co``, ``ja``, ``pe``) is called are needed.
    """

    def __init__(self,
                 matrix,
                 test,
                 pip=0,
                 mpip=0,
                 cosine=0,
                 jaccard=0,
                 pearson=0):
        self.test = test
        self.matrix = matrix
        self.items = matrix.shape[1]
        self.pip = pip
        self.mpip = mpip
        self.cosine = cosine
        self.jaccard = jaccard
        self.pearson = pearson
        # Prediction vectors, filled by p / mp / co / ja / pe.
        self.pip_pred = []
        self.mpip_pred = []
        self.cosine_pred = []
        self.jaccard_pred = []
        self.pearson_pred = []

    def _user_mean(self, user):
        """Mean of the non-zero entries in row ``user`` of the matrix (0.0 if none)."""
        row = np.asarray(self.matrix[user])
        rated = row[row != 0]
        return float(rated.mean()) if rated.size else 0.0

    def _predict_all(self, similarity):
        """Run ``PredRating`` for every test row and return the predictions as an array."""
        return np.array([
            self.PredRating(int(row[0]) - 1, int(row[1]) - 1, similarity)
            for row in self.test
        ])

    def PredRating(self, user, item, similarity, n_neighbors=99):
        """Predict the rating of ``user`` (0-based) for ``item`` (0-based).

        The ``n_neighbors`` users most similar to ``user`` are taken, those who
        have rated ``item`` are kept, and the prediction is the user's own mean
        rating plus the similarity-weighted average of (neighbour mean - user
        mean), rounded to a whole number.

        Returns the user's mean rating when ``similarity`` has no row for the
        user or when no neighbour rated the item, and 0 when the neighbour
        weights sum to zero.
        """
        try:
            sim_row = np.asarray(similarity[user])
        except IndexError:
            # No similarity row for this user: fall back to the user's mean.
            return self._user_mean(user)

        # Rank users from most to least similar (NaN sorts last) and drop the
        # user explicitly rather than assuming it sits first in the ranking.
        ranked = np.argsort(-sim_row, kind="stable")
        ranked = ranked[ranked != user]
        neighbours = [v for v in ranked[:n_neighbors] if self.matrix[v, item] != 0]

        avgU = self._user_mean(user)
        if not neighbours:
            return avgU

        weights = np.nan_to_num(sim_row[neighbours].astype(float))
        neighbour_means = np.array([self._user_mean(v) for v in neighbours])
        den = weights.sum()
        if den == 0:
            # numpy floats do not raise ZeroDivisionError, so test explicitly.
            return 0
        num = (weights * (neighbour_means - avgU)).sum()
        return round(avgU + num / den, 0)

    def co(self):
        """Fill ``cosine_pred`` using the cosine similarity matrix."""
        self.cosine_pred = self._predict_all(self.cosine)

    def ja(self):
        """Fill ``jaccard_pred`` using the Jaccard similarity matrix."""
        self.jaccard_pred = self._predict_all(self.jaccard)

    def pe(self):
        """Fill ``pearson_pred`` using the Pearson similarity matrix."""
        self.pearson_pred = self._predict_all(self.pearson)

    def p(self):
        """Fill ``pip_pred`` using the PIP similarity matrix."""
        self.pip_pred = self._predict_all(self.pip)

    def mp(self):
        """Fill ``mpip_pred`` using the MPIP similarity matrix."""
        self.mpip_pred = self._predict_all(self.mpip)
        

In [None]:
# Re-attach the held-out ratings to their id pairs: one row per test rating.
test_dataset = np.column_stack((tX, tY))

# PIP run: build the similarity matrix, predict every test row, save to CSV.
pip_obj = pip(dataset)
pip_obj.simi()  # fills pip_obj.sim
pred_obj = rating_pred(matrix=dataset, test=test_dataset, pip=pip_obj.sim)
pred_obj.p()
pd.DataFrame(pred_obj.pip_pred).to_csv("../../results/ml-100k/pip.csv")
# Drop the references to the similarity and prediction objects.
pip_obj = pred_obj = None

In [None]:
# MPIP run: build the similarity matrix, predict every test row, save to CSV.
mpip_obj = mpip(dataset)
mpip_obj.simi()  # fills mpip_obj.sim
pred_obj = rating_pred(matrix=dataset, test=test_dataset, mpip=mpip_obj.sim)
pred_obj.mp()
pd.DataFrame(pred_obj.mpip_pred).to_csv("../../results/ml-100k/mpip.csv")
# Drop the references to the similarity and prediction objects.
mpip_obj = pred_obj = None

In [None]:
# Cosine run: build the similarity matrix, predict every test row, save to CSV.
cosine_obj = Cosine(dataset)
pred_obj = rating_pred(dataset, test_dataset, cosine=cosine_obj.sim)
pred_obj.co()
# co() stores its result in cosine_pred; mpip_pred is never filled in this
# cell, so writing it would save an empty frame.
pd.DataFrame(pred_obj.cosine_pred).to_csv("../../results/ml-100k/cosine.csv")
# Drop the references to the similarity and prediction objects.
cosine_obj = None
pred_obj = None


In [None]:
# Jaccard run: build the similarity matrix, predict every test row, save to CSV.
jaccard_obj = Jaccard(dataset)
pred_obj = rating_pred(dataset, test_dataset, jaccard=jaccard_obj.sim)
pred_obj.ja()
# ja() stores its result in jaccard_pred; mpip_pred is never filled in this
# cell, so writing it would save an empty frame.
pd.DataFrame(pred_obj.jaccard_pred).to_csv("../../results/ml-100k/jaccard.csv")
# Drop the references to the similarity and prediction objects.
jaccard_obj = None
pred_obj = None


In [None]:
# Pearson run: build the similarity matrix, predict every test row, save to CSV.
pearson_obj = pearson(dataset)
# The matrix lives on the instance; the class itself has no `sim` attribute.
pred_obj = rating_pred(dataset, test_dataset, pearson=pearson_obj.sim)
pred_obj.pe()
# pe() stores its result in pearson_pred; mpip_pred is never filled in this
# cell, so writing it would save an empty frame.
# NOTE(review): this is the only cell writing under results/ml-1M while the
# others write under results/ml-100k -- confirm the intended directory.
pd.DataFrame(pred_obj.pearson_pred).to_csv("../../results/ml-1M/pearson.csv")
# Release the instance, not the class: rebinding `pearson` to None would make
# this cell fail when it is run again.
pearson_obj = None
pred_obj = None