In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
from scipy.spatial import distance
from scipy.stats import stats

from sklearn.model_selection import train_test_split


In [None]:
# Load the raw ratings file and hold out a test split.
# NOTE(review): `files` is not defined in any visible cell - confirm it is set
# before this cell runs, otherwise Restart & Run All fails here.
# NOTE(review): no `header=` is passed, so the first line of the file is used
# as the header row - confirm the file really has one.
# A multi-character separator is only supported by the python parser engine;
# requesting it explicitly avoids pandas' ParserWarning fallback.
data = pd.read_csv(files[1], delimiter="::", engine="python").to_numpy()
# Columns 0 and 1 are the features, column 2 is the target.
# A fixed seed keeps the train/test split identical across kernel restarts.
trX, tX, trY, tY = train_test_split(data[:, :2], data[:, 2], random_state=42)
# The largest values of columns 0 and 1 are used as matrix dimensions below,
# so they must be plain integers even when `data` is a float array.
users = int(np.max(data[:, 0]))
items = int(np.max(data[:, 1]))

In [None]:
# Dense training matrix of shape (users, items); cells never assigned stay 0.
dataset = np.zeros((int(users), int(items)))
# Ids are shifted by one to obtain row/column positions (presumably 1-based
# ids - confirm).  The explicit integer cast keeps the indexing valid even when
# the split arrays are floats; a single fancy-index assignment replaces the
# per-element Python loop.
row_idx = trX[:, 0].astype(int) - 1
col_idx = trX[:, 1].astype(int) - 1
dataset[row_idx, col_idx] = trY

In [None]:
class pip:
    """PIP-style (proximity * impact * popularity) similarity between rows of ``data``.

    ``data`` is a 2-D array with one row per user and one column per item.
    Every cell takes part in the computation, including cells that hold 0.

    NOTE(review): if 0 encodes "not rated", those cells still influence
    ``r_min``, ``r_med``, the item averages and every pairwise term - confirm
    that this is intended.
    NOTE(review): ``proximity`` computes ``(2*range + 1) - d**2``; confirm
    against the reference formula whether the whole difference should be
    squared instead.
    """
    def __init__(self,data):
        self.data = data
        self.r_max = np.max(data)
        self.r_min = np.min(data)
        self.r_range = self.r_max - self.r_min
        self.r_med = (self.r_max+self.r_min)/2
        self.r_avg_items = np.mean(data,axis=0)
        # Explicit output types: without them np.vectorize infers the dtype
        # from the result of the FIRST element only.  `popularity` returns the
        # integer 1 for disagreeing ratings, which silently truncated every
        # fractional value for that user pair whenever item 0 disagreed.
        self.agg = np.vectorize(self.agreement, otypes=[int])
        self.dist = np.vectorize(self.distance, otypes=[float])
        self.prox = np.vectorize(self.proximity, otypes=[float])
        self.pop = np.vectorize(self.popularity, otypes=[float])
        self.im = np.vectorize(self.impact, otypes=[float])
    def agreement(self,r1,r2):
        """Return 1 when both ratings lie strictly on the same side of ``r_med``, else 0."""
        if (r1>self.r_med and r2>self.r_med) or (r1<self.r_med and r2<self.r_med):
            return 1
        else :
            return 0
    def distance(self,r1,r2,k):
        """Absolute rating difference; doubled when the ratings disagree (``k`` falsy)."""
        if k:
            return abs(r1-r2)
        return abs(2*(r1-r2))
    def proximity(self,r1,r2,d,k):
        """Proximity term derived from the distance ``d`` (``r1``, ``r2``, ``k`` are unused)."""
        return( 2*(self.r_range)+1)-d**2
    def impact(self,r1,r2,k):
        """Product of the ratings' offsets from ``r_med``; inverted when they disagree."""
        if k:
            return (abs(r1-self.r_med)+1)*(abs(r2-self.r_med)+1)
        return 1/ ((abs(r1-self.r_med)+1)*(abs(r2-self.r_med)+1))
    def popularity(self,r1,r2,r_avg_i,k):
        """1 plus the squared gap between the pair's mean and the item average when agreeing, else 1."""
        if k :
            return 1 + (((r1+r2)/2)-r_avg_i)**2
        return 1.0
    def PIP(self,u1,u2):
        """Sum over all items of proximity * impact * popularity for two rating vectors."""
        k = self.agg(u1,u2)
        d = self.dist(u1,u2,k)
        px = self.prox(u1,u2,d,k)
        i = self.im(u1,u2,k)
        pp = self.pop(u1,u2,self.r_avg_items,k)
        return np.sum(px*i*pp)
    def simi(self):
        """Fill and return ``self.sim``, the symmetric user x user PIP matrix."""
        users = self.data.shape[0]
        self.sim = np.zeros((users,users))
        for i in range(users):
            # The measure is symmetric, so only the upper triangle is computed.
            for j in range(i,users):
                self.sim[i,j] = self.PIP(self.data[i,:],self.data[j,:])
                self.sim[j,i] = self.sim[i,j]
        return self.sim


In [None]:
class mpip:
    """Modified PIP similarity between the rows of ``data`` (users x items).

    NOTE(review): every cell of ``data`` is used, including zeros.  When more
    than half of the cells are 0 the global median is 0, ``data < median`` is
    empty and ``med_m`` becomes NaN, which turns every agreeing proximity term
    (and therefore every similarity) into NaN - confirm whether unrated cells
    should be excluded from these statistics.
    """
    def __init__(self,data):
        self.data = data
        self.median = np.median(data)
        self.rmax = np.max(data)
        self.rmin = np.min(data)
        # Medians of the values above / below the global median.  (The original
        # appended each selection to itself first, which leaves the median unchanged.)
        self.med_p = np.median(data[data>self.median])
        self.med_m = np.median(data[data<self.median])
        self.r_avg_items = np.mean(data,axis=0)
        self.r_med = (self.rmax+self.rmin)/2
        # Explicit otypes keep the element-wise wrappers independent of the
        # value produced for the first element and let them accept empty input.
        self.dist = np.vectorize(self.distance, otypes=[float])
        self.prox = np.vectorize(self.proximity, otypes=[float])
        self.pop = np.vectorize(self.popularity, otypes=[float])
        self.im = np.vectorize(self.impact, otypes=[float])
        self.agg = np.vectorize(self.agreement, otypes=[int])
    def agreement(self,r1,r2):
        """Return 1 when both ratings lie strictly on the same side of ``r_med``, else 0."""
        if (r1>self.r_med and r2>self.r_med) or (r1<self.r_med and r2<self.r_med):
            return 1
        else :
            return 0
    def distance(self,r1,r2):
        """Absolute difference between two ratings."""
        return abs(r1-r2)
    def proximity(self,k,d):
        """Proximity term for agreement flag ``k`` and distance ``d``.

        NOTE(review): the disagreeing branches divide by ``d``; ``d == 0`` with
        ``k`` falsy (both ratings exactly at ``r_med``) raises ZeroDivisionError.
        """
        if k:
            return ((d - ((self.med_m+self.med_p)/2))/(self.rmax-self.rmin))**2
        elif d>self.median:
            return 0.75 * (((1/d)/(self.rmax-self.rmin))**2)
        elif d ==self.median:
            return 0.5 * (((1/d)/(self.rmax-self.rmin))**2)
        return 0.25 * (((1/d)/(self.rmax-self.rmin))**2)
    def impact(self,r1,r2,k):
        """Impact term; exponentially damped when the ratings agree."""
        if k:
            return math.e**-(1/ ((abs(r1-self.r_med)+1)*(abs(r2-self.r_med)+1)))
        return 1/ ((abs(r1-self.r_med)+1)*(abs(r2-self.r_med)+1))
    def popularity(self,r1,r2,k,rI):
        """Popularity term; 0.3010 (about log10(2)) when the ratings disagree."""
        if k:
            return math.log10(2+(((r1+r2)/2)-rI)**2)
        return 0.3010
    def MPIP(self,u1,u2):
        """Sum over all items of proximity * impact * popularity for two rating vectors."""
        k = self.agg(u1,u2)
        d = self.dist(u1,u2)
        px = self.prox(k,d)
        i = self.im(u1,u2,k)
        pp = self.pop(u1,u2,k,self.r_avg_items)
        return np.sum(px*i*pp)
    def simi(self):
        """Fill and return ``self.sim``, the symmetric user x user MPIP matrix."""
        users = self.data.shape[0]
        self.sim = np.zeros((users,users))
        for i in range(users):
            # Symmetric measure: compute the upper triangle and mirror it.
            for j in range(i,users):
                self.sim[i,j] = self.MPIP(self.data[i],self.data[j])
                self.sim[j,i] = self.sim[i,j]
        # Returned for parity with pip.simi(); callers may still read self.sim.
        return self.sim

In [None]:
class Cosine:
    """Cosine similarity between every pair of rows of ``data``, stored in ``sim``."""
    def __init__(self, data):
        # pairwise_distances yields cosine *distance*; similarity is its complement.
        cosine_dist = pairwise_distances(data, metric="cosine")
        self.sim = 1 - cosine_dist
class Jaccard:
    """Jaccard similarity between every pair of rows of ``data``, stored in ``sim``.

    NOTE(review): how SciPy compares non-binary entries may depend on the
    SciPy version - confirm whether ``data`` should be binarised first.
    """
    def __init__(self,data):
        n_rows = data.shape[0]
        self.sim = np.zeros([n_rows,n_rows])
        for i in range(n_rows):
            for j in range(i,n_rows):
                # scipy returns a dissimilarity (0 = identical rows); convert it
                # so that `sim` is a similarity like the Cosine/pearson classes.
                self.sim[i,j] = 1 - distance.jaccard(data[i],data[j])
                self.sim[j,i] = self.sim[i,j]
class pearson:
    """Pearson correlation between every pair of rows of ``data``, stored in ``sim``.

    The original called ``pearsonr(i, j)`` on the loop indices rather than on
    the rows, which cannot succeed.  ``np.corrcoef`` computes the full
    row-by-row correlation matrix in one call.  Rows with zero variance yield
    NaN entries.
    """
    def __init__(self,data):
        rows = np.atleast_2d(np.asarray(data, dtype=float))
        # atleast_2d: corrcoef collapses a single-row input to a scalar.
        self.sim = np.atleast_2d(np.corrcoef(rows))
        

In [None]:
class rating_pred:
    """Neighbourhood-based rating predictor driven by precomputed similarities.

    Parameters
    ----------
    matrix : 2-D array (users x items); a cell equal to 0 is treated as "not rated".
    test : iterable of rows whose first two entries are (user id, item id);
        one is subtracted from each id to obtain matrix positions.
    pip, mpip, cosine, jaccard, pearson : user x user similarity matrices
        (higher value = more similar).  Only those that will be used need to
        be supplied; the default 0 is a placeholder.
    """
    def __init__(self,
                    matrix,
                    test,
                    pip = 0,
                    mpip = 0,
                    cosine = 0,
                    jaccard = 0,
                    pearson = 0
                    ):
        self.test = test
        self.matrix = matrix
        self.items = matrix.shape[1]
        self.pip = pip
        self.mpip = mpip
        self.cosine = cosine
        self.jaccard = jaccard
        self.pearson = pearson
        self.pip_pred = []
        self.mpip_pred = []
        self.cosine_pred = []
        self.jaccard_pred = []
        self.pearson_pred = []

    def _user_mean(self, user):
        """Mean of the non-zero ratings of ``user``; 0.0 when the user has none."""
        row = np.asarray(self.matrix[user], dtype=float)
        rated = row[row != 0]
        return float(rated.mean()) if rated.size else 0.0

    def PredRating(self,user,item,similarity,n_neighbors=99):
        """Predict the rating of ``item`` by ``user`` (both 0-based positions).

        The ``n_neighbors`` most similar other users are considered; those who
        rated ``item`` contribute
        ``avg_u + sum(sim_v * (avg_v - avg_u)) / sum(sim_v)``, rounded to a
        whole number.  The user's own mean is returned when no neighbour rated
        the item or when the similarity weights sum to zero.

        NOTE(review): the deviation term uses the neighbours' mean ratings, not
        their rating of ``item`` - kept from the original; confirm the formula.
        """
        try:
            sims = np.nan_to_num(np.asarray(similarity[user], dtype=float))
        except IndexError:
            # No similarity row for this user: fall back to the user's own mean.
            return self._user_mean(user)
        # Most similar first (the original sorted ascending and therefore kept
        # the LEAST similar users), and drop the user itself explicitly rather
        # than whatever happened to sort first.
        ranked = np.argsort(-sims, kind="stable")
        ranked = ranked[ranked != user][:n_neighbors]
        neighbours = [v for v in ranked if self.matrix[v, item] != 0]
        avg_u = self._user_mean(user)
        if not neighbours:
            return avg_u
        weights = sims[neighbours]
        neighbour_means = np.array([self._user_mean(v) for v in neighbours])
        den = weights.sum()
        if den == 0:
            # NumPy division would give NaN/inf here instead of raising.
            return avg_u
        num = (weights * (neighbour_means - avg_u)).sum()
        return float(round(avg_u + num / den, 0))

    def _predict_all(self, similarity):
        """Predict every (user id, item id) row of ``self.test`` with ``similarity``."""
        return np.array([
            self.PredRating(int(row[0]) - 1, int(row[1]) - 1, similarity)
            for row in self.test
        ])

    def co(self):
        """Store cosine-based predictions in ``cosine_pred``."""
        self.cosine_pred = self._predict_all(self.cosine)
    def ja(self):
        """Store Jaccard-based predictions in ``jaccard_pred``."""
        self.jaccard_pred = self._predict_all(self.jaccard)
    def pe(self):
        """Store Pearson-based predictions in ``pearson_pred``."""
        self.pearson_pred = self._predict_all(self.pearson)
    def p(self):
        """Store PIP-based predictions in ``pip_pred``."""
        self.pip_pred = self._predict_all(self.pip)
    def mp(self):
        """Store MPIP-based predictions in ``mpip_pred``."""
        self.mpip_pred = self._predict_all(self.mpip)
        

In [None]:
# Held-out triples as rows: the two feature columns followed by the target.
test_dataset = np.concatenate((tX,np.array([tY]).T),axis=1)
# Compute the PIP similarity matrix and keep a CSV copy next to the notebook.
pip_obj = (pip(dataset))
pip_obj.simi()
pd.DataFrame(pip_obj.sim).to_csv("pip.csv",sep=",",index=False)
# Predict the held-out ratings from the PIP similarities.
pred_obj = rating_pred(dataset,test_dataset,pip = pip_obj.sim)
pred_obj.p()
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("../../results/ml-100k", exist_ok=True)
pd.DataFrame(pred_obj.pip_pred).to_csv("../../results/ml-100k/pip.csv")
# Drop the references so the large matrices can be garbage-collected.
pip_obj = None
pred_obj = None

In [None]:
# Compute the MPIP similarity matrix and keep a CSV copy next to the notebook.
mpip_obj = (mpip(dataset))
mpip_obj.simi()
pd.DataFrame(mpip_obj.sim).to_csv("mpip.csv",sep=",",index=False)
# Predict the held-out ratings from the MPIP similarities.
pred_obj = rating_pred(dataset,test_dataset,mpip = mpip_obj.sim)
pred_obj.mp()
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("../../results/ml-100k", exist_ok=True)
pd.DataFrame(pred_obj.mpip_pred).to_csv("../../results/ml-100k/mpip.csv")
# Drop the references so the large matrices can be garbage-collected.
mpip_obj = None
pred_obj = None

In [None]:
# Compute the cosine similarity matrix and keep a CSV copy next to the notebook.
cosine_obj = (Cosine(dataset))
pd.DataFrame(cosine_obj.sim).to_csv("cosine.csv",sep=",",index=False)
pred_obj = rating_pred(dataset,test_dataset,cosine = cosine_obj.sim)
pred_obj.co()
# co() stores its output in `cosine_pred`; the original saved `mpip_pred`,
# which is still the empty list at this point.
os.makedirs("../../results/ml-100k", exist_ok=True)
pd.DataFrame(pred_obj.cosine_pred).to_csv("../../results/ml-100k/cosine.csv")
cosine_obj = None
pred_obj = None


In [None]:
# Compute the Jaccard matrix and keep a CSV copy next to the notebook.
jaccard_obj = (Jaccard(dataset))
pd.DataFrame(jaccard_obj.sim).to_csv("jaccard.csv",sep=",",index=False)
pred_obj = rating_pred(dataset,test_dataset,jaccard = jaccard_obj.sim)
pred_obj.ja()
# ja() stores its output in `jaccard_pred`; the original saved `mpip_pred`,
# which is still the empty list at this point.
os.makedirs("../../results/ml-100k", exist_ok=True)
pd.DataFrame(pred_obj.jaccard_pred).to_csv("../../results/ml-100k/jaccard.csv")
jaccard_obj = None
pred_obj = None


In [None]:
# Compute the Pearson matrix and keep a CSV copy next to the notebook.
pearson_obj = (pearson(dataset))
pd.DataFrame(pearson_obj.sim).to_csv("pearson.csv",sep=",",index=False)
# Use the instance's matrix: the class object itself has no `sim` attribute.
pred_obj = rating_pred(dataset,test_dataset,pearson = pearson_obj.sim)
pred_obj.pe()
# pe() stores its output in `pearson_pred`; the original saved `mpip_pred`,
# which is still the empty list at this point.
os.makedirs("../../results/ml-100k", exist_ok=True)
pd.DataFrame(pred_obj.pearson_pred).to_csv("../../results/ml-100k/pearson.csv")
# Release the instance, not the class (the original rebound `pearson` to None,
# which made this cell impossible to re-run).
pearson_obj = None
pred_obj = None

In [None]:
# Min-max scale every column of the MPIP similarity matrix.
# The matrix is re-read from the "mpip.csv" file written by the MPIP cell:
# `mpip_obj` has been reset to None by then, and a raw ndarray has no
# `.columns` to iterate over.
Mpip_Scaled = pd.read_csv("mpip.csv")

# apply normalization techniques (column-wise, vectorised)
col_min = Mpip_Scaled.min()
col_range = Mpip_Scaled.max() - col_min
# A constant column gives 0/0 = NaN, as the per-column formula did.
Mpip_Scaled = (Mpip_Scaled - col_min) / col_range

In [None]:
# Wrap the scaled similarities in a DataFrame and write them to disk without
# the index column.
Mpip_Scaled = pd.DataFrame(Mpip_Scaled)
Mpip_Scaled.to_csv('mpipscaled.csv',sep=',',index=False)

In [None]:
# Reload the scaled similarity matrix written by the previous cell.
mpip_sim = pd.read_csv('mpipscaled.csv')

In [None]:
# Dimensions of the reloaded matrix.
mpip_sim.shape

In [None]:
# Preview the first five rows.
mpip_sim.head(5)

In [None]:
# Column at position 3 of the reloaded matrix.
mpip_sim.iloc[:,3]

In [None]:
# Empty frame (no rows yet) with one string-labelled column per position 0..942.
column_names = list(map(str, range(943)))
df = pd.DataFrame(columns=column_names)

In [None]:
# For every column of the similarity matrix, the row positions of its 200
# largest entries, most similar first.
# The original wrote a 200-element array into a frame that had no rows and
# sorted ascending, i.e. it selected the 200 smallest values.
TOP_K = 200
sim_values = mpip_sim.to_numpy(dtype=float)
# Negating the values turns numpy's ascending argsort into a descending one.
top_positions = np.argsort(-sim_values, axis=0, kind="stable")[:TOP_K]
df = pd.DataFrame(top_positions,
                  columns=[str(x) for x in range(sim_values.shape[1])])

In [None]:
# Dimensions of the neighbour table.
df.shape

In [None]:
# Persist the neighbour table (the index is written as the first column).
df.to_csv('Top_200_sim.csv')

In [None]:
# Sparsity reduction using similar users (MPIP) and imputation.
# Read the tab-separated ratings file; it has no header row, so column names
# are supplied explicitly.
r_cols = ['user_id','movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('u.data', sep='\t', names=r_cols)
# The timestamp is not used below; drop it in place.
ratings.drop("unix_timestamp", inplace = True, axis = 1)
ratings.head()

In [None]:
# Pivot to a movie x user table: one row per movie_id, one column per user_id.
# Pairs without a rating become NaN.
utility_matrix = ratings.pivot(index='movie_id',columns='user_id',values='rating')
utility_matrix.head()

In [None]:
# Use 0 as the "not rated" marker.
utility_matrix = utility_matrix.fillna(0)

In [None]:
# astype returns a new frame, so the un-assigned call discarded the cast.
# Keep the result and preview only the first rows instead of the whole table.
utility_matrix = utility_matrix.astype(np.int64)
utility_matrix.head()

In [None]:
# Fill each missing (movie, user) cell with the rating that the most similar
# user who rated that movie gave to it.
# utility_matrix is movies x users; each column of `df` lists, in rank order,
# the positions of that user's most similar users.
# The original looked up the neighbour (`us`) but never used it, indexed the
# user axis with the rank counter `k`, and used `i` for both a movie row and a
# user column.
# NOTE(review): fills are taken from a snapshot of the observed ratings, so an
# imputed value is never propagated to another user - confirm this is wanted.
N_NEIGHBOURS = 100
observed = utility_matrix.to_numpy(dtype=float)
imputed = observed.copy()
neighbour_ranks = df.to_numpy()[:N_NEIGHBOURS].astype(int)
for user in range(observed.shape[1]):
    missing = observed[:, user] == 0
    for neighbour in neighbour_ranks[:, user]:
        if not missing.any():
            break
        if neighbour == user:
            # A user cannot fill in their own gaps.
            continue
        fillable = missing & (observed[:, neighbour] != 0)
        imputed[fillable, user] = observed[fillable, neighbour]
        missing &= ~fillable
utility_matrix = pd.DataFrame(imputed,
                              index=utility_matrix.index,
                              columns=utility_matrix.columns)

In [None]:
# Display the matrix after imputation.
utility_matrix

In [None]:
# Turn the remaining 0 markers back into NaN so they count as missing below.
utility_matrix.replace(0, np.nan, inplace=True)

In [None]:
# Number of empty (NaN) cells in utility_matrix: per-column counts, then the total.
empty_cells = utility_matrix.isna().sum().sum()
empty_cells

In [None]:
# Sparsity of utility_matrix: the share of cells that are still empty.
sparsity = empty_cells/utility_matrix.size
print("The sparsity of the matrix is: ", sparsity)

In [None]:
# Final dataset after sparsity reduction, as one row per rating.
# stack() unpivots utility_matrix into long form; the index is movie_id and
# the columns are user_id, hence the column order assigned below.
final_df = utility_matrix.stack().reset_index()
final_df.columns=['movie_id','user_id','rating']
final_df

In [None]:
# Put the columns in (user, movie, rating) order.
# reindex() selects by label, so asking for the non-existent "ratings" label
# produced an all-NaN column and lost every rating.  Rename the real "rating"
# column first; rename ignores a missing label, so the cell can be re-run.
columns_titles = ["user_id","movie_id","ratings"]
final_df = final_df.rename(columns={"rating": "ratings"}).reindex(columns=columns_titles)

In [None]:
# Display the reordered frame.
final_df

In [None]:
# Ignore warnings :
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings from the libraries imported below.
import warnings
warnings.filterwarnings('ignore')

# Handle table-like data and matrices :
import numpy as np
import pandas as pd
import math 
import itertools

# Modelling Helpers :
#from sklearn.preprocessing import Imputer , Normalizer , scale
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV , KFold , cross_val_score

# Evaluation metrics :
# Regression
from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error, mean_squared_error

# Classification
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score


# Deep Learning Libraries
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization
from tensorflow.keras.optimizers import Adam,SGD,Adagrad,Adadelta,RMSprop
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ReduceLROnPlateau, LearningRateScheduler
from tensorflow.keras.utils import to_categorical


# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
import missingno as msno


# Configure visualisations
%matplotlib inline
# Three style calls are applied in sequence; settings from a later call
# override overlapping settings from an earlier one.
mpl.style.use( 'ggplot' )
plt.style.use('fivethirtyeight')
sns.set(context="notebook", palette="dark", style = 'whitegrid' , color_codes=True)

In [None]:
from IPython.display import HTML, display

# Centre PNG outputs in the notebook.  The stylesheet has to be displayed to
# take effect: the original built the HTML object mid-cell and discarded it,
# so the CSS was never rendered.
display(HTML("""
<style>
.output_png {
    display: table-cell;
    text-align: center;
    vertical-align: middle;
}
</style>
"""))


# Make Visualizations better
params = { 
    'axes.labelsize': "large",
    'xtick.labelsize': 'x-large',
    'legend.fontsize': 20,
    'figure.dpi': 150,
    'figure.figsize': [25, 7]
}
plt.rcParams.update(params)

In [None]:
# Column names for the two header-less input files.
r_cols = ['userID', 'movieID', 'ratings','timestamp']
r_cols1 = [ 'movieID', 'movie title', 'release date', 'IMDB', 'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western' ]
# NOTE(review): no visible cell writes 'final_dataset.csv' - confirm where it
# comes from and that it really is tab-separated with four columns.
ratings = pd.read_csv('final_dataset.csv', sep='\t', names=r_cols,
                      encoding='latin-1')
# NOTE(review): index_col=3 makes the column at position 3 the index - confirm
# that the supplied names line up with the columns actually present in u.item.
movies = pd.read_csv('u.item', sep='|', names=r_cols1,
                      encoding='latin-1', index_col=3)
# Independent copies of the two frames.
df_r = ratings.copy()
df_m = movies.copy()

In [None]:
# Preview the ratings frame.
ratings.head()

In [None]:
# Preview the movies frame.
movies.head()

In [None]:
# The movieID column of the movies frame.
movies['movieID']

In [None]:
# Drop the timestamp column in place.  Re-running this cell on its own raises
# KeyError because the column is already gone.
ratings.drop(['timestamp'], axis=1, inplace=True)
ratings.head()

In [None]:
# Shape followed by the per-column dtype / non-null summary of the movies frame.
print('Shape: ', movies.shape, '\n')
movies.info()

In [None]:
from keras.layers import Embedding, Input, dot, concatenate
from keras.models import Model
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import math 

In [None]:
#X = ratings.iloc[:,:2]
#Y = ratings.iloc[:,2]

#x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 66)

In [None]:
#y_train

In [None]:
# Number of ratings per user; keep the 15 users with the most ratings.
g = ratings.groupby('userID')['ratings'].count()
top_users = g.sort_values(ascending=False)[:15]
# Same for movies: the 15 movies with the most ratings.
g = ratings.groupby('movieID')['ratings'].count()
top_movies = g.sort_values(ascending=False)[:15]
# The inner joins keep only ratings given by a top user to a top movie; the
# joined count columns receive the '_r' suffix.
top_r = ratings.join(top_users, rsuffix='_r', how='inner', on='userID')
top_r = top_r.join(top_movies, rsuffix='_r', how='inner', on='movieID')
# User x movie table of summed ratings for that subset.
pd.crosstab(top_r.userID, top_r.movieID, top_r.ratings, aggfunc=np.sum)

In [None]:
# Encode the raw user ids as integer labels and count the distinct users.
user_enc = LabelEncoder()
ratings['user'] = user_enc.fit_transform(ratings['userID'].values)
n_users = ratings['user'].nunique()
# Same for movies.
item_enc = LabelEncoder()
ratings['movie'] = item_enc.fit_transform(ratings['movieID'].values)
n_movies = ratings['movie'].nunique()
# Ratings as float32; their range is passed to the model builders further down.
ratings['ratings'] = ratings['ratings'].values.astype(np.float32)
min_rating = min(ratings['ratings'])
max_rating = max(ratings['ratings'])
n_users, n_movies, min_rating, max_rating

In [None]:
# Features are the encoded (user, movie) pairs; the target is the rating.
X = ratings[['user', 'movie']].values
y = ratings['ratings'].values
# 60/40 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=22)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Embedding size passed to the model builders.
n_factors = 100
# The models take two inputs, so the user and movie columns are passed as
# separate arrays.
X_train_array = [X_train[:, 0], X_train[:, 1]]
X_test_array = [X_test[:, 0], X_test[:, 1]]

In [None]:
from keras.models import Model
from keras.layers import Input, Reshape, Dot
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.regularizers import l2

In [None]:
def RecommenderV1(n_users, n_movies, n_factors, learning_rate=0.001):
    """Build and compile a matrix-factorisation model.

    The prediction is the dot product of a user embedding and a movie
    embedding, each of size ``n_factors``.

    Parameters
    ----------
    n_users, n_movies : sizes of the two embedding vocabularies.
    n_factors : embedding dimension.
    learning_rate : step size for the Adam optimiser (default 0.001, the value
        that was previously hard-coded).

    Returns
    -------
    A compiled Keras ``Model`` taking ``[user, movie]`` inputs, trained with
    mean squared error.
    """
    user = Input(shape=(1,))
    u = Embedding(n_users, n_factors, embeddings_initializer='he_normal',
                  embeddings_regularizer=l2(1e-6))(user)
    # Embedding output is (batch, 1, n_factors); flatten to (batch, n_factors).
    u = Reshape((n_factors,))(u)

    movie = Input(shape=(1,))
    m = Embedding(n_movies, n_factors, embeddings_initializer='he_normal',
                  embeddings_regularizer=l2(1e-6))(movie)
    m = Reshape((n_factors,))(m)

    x = Dot(axes=1)([u, m])
    model = Model(inputs=[user, movie], outputs=x)
    # `lr` is the deprecated spelling and is rejected by recent Keras releases;
    # `learning_rate` is the supported keyword.
    opt = Adam(learning_rate=learning_rate)
    model.compile(loss='mean_squared_error', optimizer=opt)
    return model

In [None]:
# Build the dot-product model and print its layer summary.
model = RecommenderV1(n_users, n_movies, n_factors)
model.summary()

In [None]:
# Train for 100 epochs; the held-out split is used as validation data.
history = model.fit(x=X_train_array, y=y_train, batch_size=128, epochs=100,
                    verbose=1, validation_data=(X_test_array, y_test))

In [None]:
# Training vs. validation loss per epoch; the 'test' curve is the validation loss.
print(history.history.keys())
# summarize history for performance
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Performance')
plt.ylabel('Mean Squared Error')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
from keras.layers import Add, Activation, Lambda
class EmbeddingLayer:
    """Callable that embeds an id tensor and reshapes the result to ``(n_factors,)``."""

    def __init__(self, n_items, n_factors):
        self.n_items, self.n_factors = n_items, n_factors

    def __call__(self, x):
        # A fresh, L2-regularised embedding table is created on every call.
        embedding = Embedding(
            self.n_items,
            self.n_factors,
            embeddings_initializer='he_normal',
            embeddings_regularizer=l2(1e-6),
        )
        embedded = embedding(x)
        # Reshape the embedding output to a flat vector of n_factors values.
        return Reshape((self.n_factors,))(embedded)
def RecommenderV2(n_users, n_movies, n_factors, min_rating, max_rating,
                  learning_rate=0.001):
    """Build and compile a factorisation model with biases and a bounded output.

    The dot product of the user and movie embeddings is added to a per-user
    and a per-movie bias (embeddings of size 1), passed through a sigmoid and
    rescaled to ``[min_rating, max_rating]``.

    Parameters
    ----------
    n_users, n_movies : sizes of the embedding vocabularies.
    n_factors : embedding dimension.
    min_rating, max_rating : bounds of the rescaled output.
    learning_rate : step size for the Adam optimiser (default 0.001, the value
        that was previously hard-coded).

    Returns
    -------
    A compiled Keras ``Model`` taking ``[user, movie]`` inputs, trained with
    mean squared error.
    """
    user = Input(shape=(1,))
    u = EmbeddingLayer(n_users, n_factors)(user)
    ub = EmbeddingLayer(n_users, 1)(user)      # per-user bias

    movie = Input(shape=(1,))
    m = EmbeddingLayer(n_movies, n_factors)(movie)
    mb = EmbeddingLayer(n_movies, 1)(movie)    # per-movie bias
    x = Dot(axes=1)([u, m])
    x = Add()([x, ub, mb])
    x = Activation('sigmoid')(x)
    # Map the (0, 1) sigmoid output onto the rating range.
    x = Lambda(lambda x: x * (max_rating - min_rating) + min_rating)(x)
    model = Model(inputs=[user, movie], outputs=x)
    # `lr` is the deprecated spelling and is rejected by recent Keras releases;
    # `learning_rate` is the supported keyword.
    opt = Adam(learning_rate=learning_rate)
    model.compile(loss='mean_squared_error', optimizer=opt)
    return model

In [None]:
# Build the biased, range-bounded model (this rebinds `model`) and print its summary.
model = RecommenderV2(n_users, n_movies, n_factors, min_rating, max_rating)
model.summary()

In [None]:
# Train for 100 epochs; the held-out split is used as validation data.
history = model.fit(x=X_train_array, y=y_train, batch_size=128, epochs=100,
                    verbose=1, validation_data=(X_test_array, y_test))

In [None]:
# Training vs. validation loss per epoch; the 'test' curve is the validation loss.
print(history.history.keys())
# summarize history for performance
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Performance')
plt.ylabel('Mean Squared Error')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
from keras.layers import Concatenate, Dense, Dropout
def RecommenderNet(n_users, n_movies, n_factors, min_rating, max_rating,
                   learning_rate=0.001):
    """Build and compile a neural recommender on concatenated embeddings.

    User and movie embeddings are concatenated, passed through one hidden
    dense layer of 100 ReLU units (with dropout before and after), reduced to
    a single sigmoid unit and rescaled to ``[min_rating, max_rating]``.

    Parameters
    ----------
    n_users, n_movies : sizes of the embedding vocabularies.
    n_factors : embedding dimension.
    min_rating, max_rating : bounds of the rescaled output.
    learning_rate : step size for the Adam optimiser (default 0.001, the value
        that was previously hard-coded).

    Returns
    -------
    A compiled Keras ``Model`` taking ``[user, movie]`` inputs, trained with
    mean squared error.
    """
    user = Input(shape=(1,))
    u = EmbeddingLayer(n_users, n_factors)(user)

    movie = Input(shape=(1,))
    m = EmbeddingLayer(n_movies, n_factors)(movie)

    x = Concatenate()([u, m])
    x = Dropout(0.05)(x)

    x = Dense(100, kernel_initializer='he_normal')(x)
    x = Activation('relu')(x)
    x = Dropout(0.5)(x)

    x = Dense(1, kernel_initializer='he_normal')(x)
    x = Activation('sigmoid')(x)
    # Map the (0, 1) sigmoid output onto the rating range.
    x = Lambda(lambda x: x * (max_rating - min_rating) + min_rating)(x)
    model = Model(inputs=[user, movie], outputs=x)
    # `lr` is the deprecated spelling and is rejected by recent Keras releases;
    # `learning_rate` is the supported keyword.
    opt = Adam(learning_rate=learning_rate)
    model.compile(loss='mean_squared_error', optimizer=opt)
    return model

In [None]:
# Build the dense-layer model (this rebinds `model`) and print its summary.
model = RecommenderNet(n_users, n_movies, n_factors, min_rating, max_rating)
model.summary()

In [None]:
# Train for 100 epochs; the held-out split is used as validation data.
history = model.fit(x=X_train_array, y=y_train, batch_size=128, epochs=100,
                    verbose=1, validation_data=(X_test_array, y_test))

In [None]:
# Training vs. validation loss per epoch; the 'test' curve is the validation loss.
print(history.history.keys())
# summarize history for performance
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Performance')
plt.ylabel('Mean Squared Error')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()