In [1]:
# from google.colab import drive
# drive.mount('/content/gdrive')
# %cd '/content/gdrive/MyDrive/CFDataset/ml-100k'
# %ls
# import warnings
# warnings.filterwarnings('ignore')
# !pip install pandas
# !pip install -U scikit-learn

**Importing Libraries**

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity

**Loading dataset**

In [3]:
def readData(name):
  """Load one tab-separated ratings file into a DataFrame.

  The file is read without a header row and its four columns are labelled
  userId, movieId, rating and timestamp, in that order.
  """
  columnNames = ['userId', 'movieId', 'rating', 'timestamp']
  ratings = pd.read_csv(name, sep="\t", header=None)
  # Assigning (rather than passing names=) keeps the length check: a file
  # that does not have exactly four columns raises here.
  ratings.columns = columnNames
  return ratings

**Creating User-Item Matrix**

In [4]:
def userItemMatrix(data, numUsers=943, numItems=1682):
  """Build a dense user x item rating matrix from a ratings table.

  Parameters
  ----------
  data : DataFrame with 'userId', 'movieId' and 'rating' columns; ids are
    treated as 1-based and mapped to 0-based row/column positions.
  numUsers : number of rows in the matrix (defaults to 943).
  numItems : number of columns in the matrix (defaults to 1682).

  Returns
  -------
  A list of numUsers lists of length numItems. Cells with no rating in
  `data` stay 0; if a (user, movie) pair occurs more than once the last
  occurrence wins.

  Raises
  ------
  IndexError if an id is larger than the corresponding dimension.
  """
  matrix = [[0] * numItems for _ in range(numUsers)]

  # Walk the three columns positionally: this does not depend on the
  # frame's index labels and avoids a label lookup for every single cell.
  for userid, movieid, rating in zip(data['userId'], data['movieId'], data['rating']):
    matrix[userid - 1][movieid - 1] = rating

  return matrix

**Creating Similarity Matrix**

In [5]:
def similarityMatrix(matrix):
  """Return the pairwise cosine similarity between the rows of `matrix`.

  Thin wrapper around sklearn's cosine_similarity; entry [a][b] of the
  result is the similarity between row a and row b.
  """
  return cosine_similarity(matrix)

**Function for calculating a user's average rating (unrated items, stored as 0, are ignored)**

In [6]:
def calculateAvg(row):
  """Return the mean of the non-zero entries of `row`.

  Zero entries are skipped entirely (they count neither towards the total
  nor towards the number of entries). If every entry is zero, or the row
  is empty, 0 is returned.
  """
  total = 0
  rated = 0
  for value in row:
    if value != 0:
      total += value
      rated += 1

  # Guard against dividing by zero when nothing was rated.
  if rated == 0:
    return 0
  return total / rated

**Function for calculating MAE Score**

In [7]:
def MAEScore(actualR,predR):
  """Return the mean absolute error between actual and predicted ratings.

  Delegates to sklearn's mean_absolute_error with `actualR` as the ground
  truth and `predR` as the predictions.
  """
  score = mean_absolute_error(y_true=actualR, y_pred=predR)
  return score

**Function for predicting the missing rating**



In [8]:
def prediction(matrix, simMatrix, userID, movieID,K):
  """Predict the rating of `userID` for `movieID` from similar users.

  The estimate is the target user's average rating plus the
  similarity-weighted average of how far each neighbour's rating of the
  movie deviates from that neighbour's own average.

  Parameters
  ----------
  matrix : user x item ratings indexed as matrix[user][item]; 0 means
    "not rated".
  simMatrix : user x user similarities indexed as simMatrix[user][other].
  userID, movieID : 0-based positions of the target user and movie.
  K : maximum number of neighbours used. Only users who rated the movie
    count towards K. If the running count never equals K (for example
    K <= 0), every user who rated the movie is used.

  Returns
  -------
  The predicted rating. When no neighbour rated the movie, or the
  similarity weights sum to zero, the target user's average is returned.
  """
  userAvg = calculateAvg(matrix[userID])
  simRow = simMatrix[userID]

  # Users who did not rate the movie can never contribute, so drop them
  # before sorting. The sort is stable, so the surviving neighbours keep
  # the same relative order they would have had if everyone were sorted.
  raters = [
    (simRow[other], other)
    for other in range(len(simRow))
    if other != userID and matrix[other][movieID] != 0
  ]
  raters.sort(key=lambda pair: pair[0], reverse=True)

  weightedDeviation = 0
  weightTotal = 0
  used = 0

  for neighbourSim, neighbourID in raters:
    deviation = matrix[neighbourID][movieID] - calculateAvg(matrix[neighbourID])
    weightedDeviation += deviation * neighbourSim
    # Normalise by the magnitude of the weights: with signed similarities
    # a plain sum could cancel out or flip the sign of the correction.
    weightTotal += abs(neighbourSim)

    used += 1
    if used == K:
      break

  if weightTotal == 0:
    return userAvg
  return userAvg + weightedDeviation / weightTotal

**Calculating the MAE score for each fold and each value of K (number of neighbours)**



In [9]:
# Neighbourhood sizes to evaluate.
KNeighbour = [10,20,30,40,50]
# finalAccuracy[fold-1][j] holds the MAE of fold `fold` with KNeighbour[j].
finalAccuracy = [[0 for _ in range(5)] for _ in range(5)]

for fold in range(1,6):
  # Each fold is read from a pair of files named u<fold>.base / u<fold>.test.
  foldPrefix = 'u' + str(fold)
  trainData = readData(foldPrefix + '.base')
  testData = readData(foldPrefix + '.test')

  # The rating matrix and similarities depend only on the training split,
  # so they are built once per fold and shared by every K.
  matrix = userItemMatrix(trainData)
  simMatrix = similarityMatrix(matrix)

  for column, k in enumerate(KNeighbour):
    predList = []
    actualList = []
    testRows = zip(testData['userId'], testData['movieId'], testData['rating'])
    for rawUser, rawMovie, actualRating in testRows:
      # Ids in the files are 1-based; the matrices are 0-based.
      predList.append(prediction(matrix, simMatrix, rawUser - 1, rawMovie - 1, k))
      actualList.append(actualRating)

    finalAccuracy[fold-1][column] = MAEScore(actualList,predList)
    print(f"K {k} completed for fold {fold}")
  print(f"fold {fold} completed")

K 10 completed for fold 1
K 20 completed for fold 1
K 30 completed for fold 1
K 40 completed for fold 1
K 50 completed for fold 1
fold 1 completed
K 10 completed for fold 2
K 20 completed for fold 2
K 30 completed for fold 2
K 40 completed for fold 2
K 50 completed for fold 2
fold 2 completed
K 10 completed for fold 3
K 20 completed for fold 3
K 30 completed for fold 3
K 40 completed for fold 3
K 50 completed for fold 3
fold 3 completed
K 10 completed for fold 4
K 20 completed for fold 4
K 30 completed for fold 4
K 40 completed for fold 4
K 50 completed for fold 4
fold 4 completed
K 10 completed for fold 5
K 20 completed for fold 5
K 30 completed for fold 5
K 40 completed for fold 5


In [None]:
# Rows are folds, columns are the K values. np.array is used instead of
# np.matrix, which NumPy no longer recommends; the printed text is the same.
print(np.array(finalAccuracy))

[[0.76208424 0.7526601  0.75273121 0.7529733  0.75363184]
 [0.74922404 0.7396596  0.73895325 0.74002126 0.74092716]
 [0.74941709 0.73897463 0.73746467 0.73752042 0.73810018]
 [0.74366147 0.73522938 0.73488766 0.73503262 0.73593911]
 [0.75217146 0.74091496 0.73943641 0.74037654 0.74098665]]
