In [1]:
import pandas as pd 
import numpy as np
import scipy.stats as stats
import numba
import scipy
from sklearn.metrics import jaccard_score

In [2]:
u_data = pd.read_csv('u.data',names=['userId','itemId','rating','timestamp'],delimiter='\t')
u_genre = pd.read_csv('u.genre',names=['genre','genreId'],delimiter='|')
u_occupation = pd.read_csv('u.occupation',names=['occupation'])
u_user = pd.read_csv('u.user',names=['userId','age','gender','occupation','zipcode'],delimiter='|')
u_item = pd.read_csv('u.item',names=['movieId','movieTitle','releaseDate','videoReleaseDate','IMDbId','unknown','action','adventure','animation','children','comedy','crime','documentary','drama','fantasy','filmNoir','horror','musical','mystery','romance','sciFi','thriller','war','western'],delimiter='|')

In [3]:
def split_data_ml100k(data, num_users, num_items, split_mode='random',
                      test_ratio=0.1):
    """Split the dataset in random mode or seq-aware mode."""
    if split_mode == 'seq-aware':
        train_items, test_items, train_list = {}, {}, []
        for line in data.itertuples():
            u, i, rating, time = line[1], line[2], line[3], line[4]
            train_items.setdefault(u, []).append((u, i, rating, time))
            if u not in test_items or test_items[u][-1] < time:
                test_items[u] = (i, rating, time)
        for u in range(1, num_users + 1):
            train_list.extend(sorted(train_items[u], key=lambda k: k[3]))
        test_data = [(key, *value) for key, value in test_items.items()]
        train_data = [item for item in train_list if item not in test_data]
        train_data = pd.DataFrame(train_data)
        test_data = pd.DataFrame(test_data)
    else:
        mask = [
            True if x == 1 else False
            for x in np.random.uniform(0, 1, (len(data))) < 1 - test_ratio]
        neg_mask = [not x for x in mask]
        train_data, test_data = data[mask], data[neg_mask]
    return train_data, test_data
train_data , test_data = split_data_ml100k(u_data,943,1682,test_ratio=0.2)
train_data

Unnamed: 0,userId,itemId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [4]:
master_record = pd.merge(u_data,u_user,on='userId')
master_record_1 = pd.merge(master_record,u_item,left_on='itemId',right_on='movieId')
master_record_1.sort_values(by=['userId','itemId'])

Unnamed: 0,userId,itemId,rating,timestamp,age,gender,occupation,zipcode,movieId,movieTitle,...,fantasy,filmNoir,horror,musical,mystery,romance,sciFi,thriller,war,western
50341,1,1,5,874965758,24,M,technician,85711,1,Toy Story (1995),...,0,0,0,0,0,0,0,0,0,0
30753,1,2,3,876893171,24,M,technician,85711,2,GoldenEye (1995),...,0,0,0,0,0,0,0,1,0,0
47930,1,3,4,878542960,24,M,technician,85711,3,Four Rooms (1995),...,0,0,0,0,0,0,0,1,0,0
32499,1,4,3,876893119,24,M,technician,85711,4,Get Shorty (1995),...,0,0,0,0,0,0,0,0,0,0
89376,1,5,3,889751712,24,M,technician,85711,5,Copycat (1995),...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64364,943,1067,2,875501756,22,M,student,77841,1067,Bottle Rocket (1996),...,0,0,0,0,0,0,0,0,0,0
40854,943,1074,4,888640250,22,M,student,77841,1074,Reality Bites (1994),...,0,0,0,0,0,0,0,0,0,0
50089,943,1188,3,888640250,22,M,student,77841,1188,Young Guns II (1990),...,0,0,0,0,0,0,0,0,0,1
83377,943,1228,3,888640275,22,M,student,77841,1228,Under Siege 2: Dark Territory (1995),...,0,0,0,0,0,0,0,0,0,0


In [5]:
matrix = np.zeros((1682,943))
#u_data = pd.DataFrame(train_data).sort_values(by=[0,1]).to_numpy()

for i in train_data.to_numpy():
    matrix[i[1]-1][i[0]-1]=float(i[2])
    #print(i[2])
rated_unrated_mat = np.zeros((1682,943))
row_index = 0
col_index = 0
for i in matrix:
    for j in i:
        if j!=0:
            rated_unrated_mat[row_index][col_index]=1
        col_index+=1
    row_index+=1
    col_index = 0
matrix = matrix.transpose()
matrix.shape

(943, 1682)

In [14]:

def coef(matrix):
    pearson = np.ones((matrix.shape[0],matrix.shape[0]))
    cosine = np.ones((matrix.shape[0],matrix.shape[0]))
    jaccard = np.ones((matrix.shape[0],matrix.shape[0]))
    row = 0
    for i in matrix:
        col = 0
        for j in matrix:
            #realation = np.corrcoef(i,j)
            r,p = stats.pearsonr(i,j)
            pearson[row,col] = r
            cosine[row,col] = 1 - scipy.spatial.distance.cosine(i,j)
            jaccard[row,col] = jaccard_score(i,j,average='macro')
            col+=1
        row+=1
    return pearson,cosine,jaccard

In [15]:
pearson,cosine,jaccard = coef(matrix)

In [40]:
with open('pearson.csv','w') as f:
    for i in pearson:
        for j in i:
            f.write(str(j))
            f.write(',')
with open('cosine.csv','w') as f:
    for i in cosine:
        for j in i:
            f.write(str(j))
            f.write(',')
with open('jaccard.csv','w') as f:
    for i in pearson:
        for j in i:
            f.write(str(j))
            f.write(',')

In [28]:
print('enter the user with whom most similar users are to be shown')
user = input()
users_pearson = pearson[int(user)-1]
users_cosine = cosine[int(user)-1]
users_jaccard = jaccard[int(user)-1]

enter the user with whom most similar users are to be shown


In [29]:
bestFitUsers_pearson = (-users_pearson).argsort()[:10] +1
bestFitUsers_cosine = (-users_cosine).argsort()[:10] +1
bestFitUsers_jaccard = (-users_jaccard).argsort()[:10] +1

In [30]:
print(" pearson = ",bestFitUsers_pearson,'\n',
    'cosine similarity',bestFitUsers_cosine,'\n'
    ' jaccard',bestFitUsers_jaccard)

 pearson =  [  5 648 545 307  22 922  44 660 738  70] 
 cosine similarity [  5 648 545 660  44 307  22 922 407 738] 
 jaccard [  5  70 545  28 746 307  22 847 933 727]


5
648
545
660
44
307
22
922
407
738
