# Data Analysis & Calculating Pearson Similarity

In [71]:
import pandas as pd    
import numpy as np    
import seaborn as sns # For visualisation, built on matplotlib

from scipy.stats import pearsonr

%matplotlib inline

In [147]:
# Load the movies table from the CSV file
movies_df = pd.read_csv(filepath_or_buffer='Data/movies.csv')

# Replace the column labels read from the file with fixed names
movies_df.columns = ['MovieID', 'Title', 'Genres']

# Preview the first five rows
movies_df.head(n=5)

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [183]:
# Load the ratings table from the CSV file
ratings_df = pd.read_csv(filepath_or_buffer='Data/ratings.csv')

# Replace the column labels read from the file with fixed names
ratings_df.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']

# Preview the first five rows
ratings_df.head(n=5)

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [184]:
# Reshape the long ratings table into a wide matrix:
# one row per UserID, one column per MovieID, cell = Rating.
# (user, movie) pairs without a rating come out of the pivot as NaN
# and are replaced with 0.
movie_user_df = (
    ratings_df
    .pivot(index='UserID', columns='MovieID', values='Rating')
    .fillna(value=0)
)
movie_user_df.head(n=5)

MovieID,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Calculating Pearson Similarity

Now we will calculate the Pearson similarity between the tastes of each pair of users. For every pair of rows in the user–movie matrix (for example, UserID 2 and UserID 3), we compute the Pearson correlation coefficient of the two rating vectors. This is equivalent to the cosine of the angle between the two vectors after each one has been mean-centred.

In [93]:
# Function to compute the pearson correlation between two users
def user_similarity(user_id, data, start=1, verbose=True):
    """Correlate one row of ``data`` against a run of its rows.

    Rows are addressed by *position* (``data.iloc``), not by index label.

    Parameters
    ----------
    user_id : int
        Position of the reference row in ``data``.
    data : pandas.DataFrame
        Rating matrix; each row is compared as one vector.
    start : int, default 1
        First row position to compare against. The default of 1 keeps the
        historical behaviour, which skips the row at position 0; pass
        ``start=0`` to compare against every row.
    verbose : bool, default True
        When true, print one line per compared pair (the historical
        behaviour). Pass ``False`` to silence the output.

    Returns
    -------
    list
        The Pearson correlation coefficient for each position in
        ``range(start, len(data))``, in that order.
    """
    positions = range(start, len(data))
    if not positions:
        # Nothing to compare against; avoid touching data.iloc[user_id].
        return []

    # The reference row does not change inside the loop, so fetch it once.
    reference_row = data.iloc[user_id]

    correlations = []
    for i in positions:
        # pearsonr returns (coefficient, p-value); only the coefficient is kept
        r = pearsonr(reference_row, data.iloc[i])
        if verbose:
            print("Correlation b/w userID {0} and {1} is {2}".format(user_id ,i, r[0])) # optional
        correlations.append(r[0])
    return correlations

# Create a matrix to store the similarity between every single userID
def similarity_matrix(data):
    """Collect the ``user_similarity`` result for each row position of ``data``.

    Row positions 1 .. len(data) - 1 are processed in order (position 0 is
    not visited). One progress line is printed after each row is finished.

    Returns a list of lists: one inner list per processed row position.
    """
    row_count = len(data)
    all_rows = []
    for position in range(1, row_count):
        all_rows.append(user_similarity(position, data))
        # Progress indicator; the original author reports a run time of
        # around 7 minutes on a 2013 Mac.
        print("Processing user: ", position) # optional
    return all_rows

# Build the list-of-lists similarity structure for the user/movie matrix
s = similarity_matrix(data=movie_user_df)

# Printing the whole matrix led to a memory error, so only report completion
print("Process Complete") # optional

Processing user:  1
Processing user:  2
Processing user:  3
Processing user:  4
Processing user:  5
Processing user:  6
Processing user:  7
Processing user:  8
Processing user:  9
Processing user:  10
Processing user:  11
Processing user:  12
Processing user:  13
Processing user:  14
Processing user:  15
Processing user:  16
Processing user:  17
Processing user:  18
Processing user:  19
Processing user:  20
Processing user:  21
Processing user:  22
Processing user:  23
Processing user:  24
Processing user:  25
Processing user:  26
Processing user:  27
Processing user:  28
Processing user:  29
Processing user:  30
Processing user:  31
Processing user:  32
Processing user:  33
Processing user:  34
Processing user:  35
Processing user:  36
Processing user:  37
Processing user:  38
Processing user:  39
Processing user:  40
Processing user:  41
Processing user:  42
Processing user:  43
Processing user:  44
Processing user:  45
Processing user:  46
Processing user:  47
Processing user:  48
P

Processing user:  379
Processing user:  380
Processing user:  381
Processing user:  382
Processing user:  383
Processing user:  384
Processing user:  385
Processing user:  386
Processing user:  387
Processing user:  388
Processing user:  389
Processing user:  390
Processing user:  391
Processing user:  392
Processing user:  393
Processing user:  394
Processing user:  395
Processing user:  396
Processing user:  397
Processing user:  398
Processing user:  399
Processing user:  400
Processing user:  401
Processing user:  402
Processing user:  403
Processing user:  404
Processing user:  405
Processing user:  406
Processing user:  407
Processing user:  408
Processing user:  409
Processing user:  410
Processing user:  411
Processing user:  412
Processing user:  413
Processing user:  414
Processing user:  415
Processing user:  416
Processing user:  417
Processing user:  418
Processing user:  419
Processing user:  420
Processing user:  421
Processing user:  422
Processing user:  423
Processing

In [154]:
# Convert the list of lists into a pandas DataFrame
similarity_matrix_df = pd.DataFrame(s)

# Relabel both axes from the frame's actual shape rather than hard-coded
# sizes, so this cell keeps working when the number of users changes.
# Columns are labelled 0, 1, 2, ... and rows 1, 2, 3, ...
similarity_matrix_df.columns = list(range(similarity_matrix_df.shape[1]))
similarity_matrix_df.index = range(1, similarity_matrix_df.shape[0] + 1)

similarity_matrix_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,660,661,662,663,664,665,666,667,668,669
1,1.0,0.57865,0.578052,0.570023,0.568298,0.561226,0.554234,0.547574,0.544872,0.535907,...,-0.006736,-0.006895,-0.006958,-0.007073,-0.007192,-0.007711,-0.00809,-0.008281,-0.008581,-0.015856
2,0.118567,1.0,0.07184,0.14507,0.056172,0.148799,0.243796,0.130091,0.110151,0.087762,...,0.156404,0.060541,0.163837,0.146575,0.173297,0.119107,0.122091,0.076807,0.133076,0.163512
3,0.107401,0.07184,1.0,0.117307,0.071007,0.310544,0.177773,0.020629,0.12874,0.009824,...,0.1044,0.03992,0.106031,0.230808,0.114162,0.077471,0.062695,0.096494,0.046918,0.198755
4,0.095318,0.14507,0.117307,1.0,0.05745,0.086858,0.155993,0.080104,0.025376,0.014211,...,0.184525,0.015722,0.125818,0.208144,0.134188,0.049924,0.038544,0.032182,0.057245,0.216081
5,-0.005817,0.056172,0.071007,0.05745,1.0,-0.006247,0.122288,0.017348,0.040938,-0.004149,...,0.007951,0.005586,0.112479,0.070618,-0.004109,-0.005536,0.016713,0.020646,0.015888,0.081165


In [200]:
# Show the two rows at positions 126 and 127 (all columns)
similarity_matrix_df.iloc[126:128, :]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,660,661,662,663,664,665,666,667,668,669
127,0.141209,0.129239,0.102686,0.202786,0.028148,0.082948,0.157237,0.101025,0.035223,-0.003459,...,0.202377,0.047066,0.127676,0.340137,0.135122,0.079831,0.011659,0.050563,0.094081,0.157572
128,0.062024,0.054672,-0.007689,0.033967,0.012094,0.000369,0.057621,0.0541,0.042642,0.028047,...,0.078696,-0.002763,0.057733,0.12218,0.060159,0.018837,-0.002312,0.030681,0.072206,0.016621


In [239]:
# Function predicts the rating of the movie that is unrated by the user
def rating_prediction(user_id, data, similarity=None, top_n=5):
    """Predict ratings for the movies that ``user_id`` has not rated yet.

    Each prediction is a similarity-weighted average of the ratings given by
    the ``top_n`` users most similar to ``user_id``::

        pred(movie) = sum(sim * rating) / sum(|sim|)

    Both sums run over the selected neighbours that actually rated the movie.
    A stored rating of 0 is treated as "not rated".

    Parameters
    ----------
    user_id : int
        1-based user number. The user is looked up at position
        ``user_id - 1`` in both ``data`` and ``similarity``.
    data : pandas.DataFrame
        One row per user and one column per movie, with 0 where the user has
        not rated the movie.
    similarity : pandas.DataFrame, optional
        User-by-user similarity matrix. Defaults to the notebook-level
        ``similarity_matrix_df``. Row/column position ``p`` is assumed to
        describe the same user as ``data.iloc[p]``.
        NOTE(review): the matrix built earlier in this notebook starts both
        of its loops at position 1, so it has one fewer row and column than
        ``data``; confirm the alignment before relying on the output.
    top_n : int, default 5
        Number of most similar users to use as neighbours.

    Returns
    -------
    pandas.Series
        Predicted rating indexed by movie (the columns of ``data``), limited
        to movies the user has not rated and that at least one selected
        neighbour with a non-zero similarity has rated.
    """
    if similarity is None:
        # Fall back to the matrix created earlier in the notebook.
        similarity = similarity_matrix_df

    user_pos = user_id - 1

    # Work with positions instead of labels so the neighbour lookup does not
    # depend on how the similarity frame happens to be labelled.
    user_similarities = similarity.iloc[user_pos].reset_index(drop=True)

    # Rank the other users: drop the user's own entry and undefined values.
    ranked = (
        user_similarities
        .drop(user_pos, errors='ignore')
        .dropna()
        .sort_values(ascending=False)
    )
    neighbour_weights = ranked.iloc[:top_n]
    neighbour_positions = list(neighbour_weights.index)

    neighbour_ratings = data.iloc[neighbour_positions]
    # Labels (from data's index) of the selected neighbours
    print(neighbour_ratings.index)

    ratings = neighbour_ratings.values                   # (neighbours, movies)
    weights = neighbour_weights.values.reshape(-1, 1)    # (neighbours, 1)

    # Numerator: similarity-weighted ratings (unrated entries contribute 0).
    weighted_sum = (ratings * weights).sum(axis=0)
    # Denominator: only neighbours that rated the movie count towards it.
    weight_total = (abs(weights) * (ratings != 0)).sum(axis=0)

    # Dividing as Series turns 0/0 (no neighbour rated the movie) into NaN.
    predictions = (
        pd.Series(weighted_sum, index=data.columns)
        / pd.Series(weight_total, index=data.columns)
    )

    # Keep only movies the target user has not rated and that got a value.
    not_yet_rated = (data.iloc[user_pos] == 0).values
    predictions = predictions[not_yet_rated].dropna()

    print("Done")
    return predictions

# Run the prediction function for user 4 on the user/movie rating matrix
rating_prediction(user_id=4, data=movie_user_df)

NameError: name 'movie_user_df' is not defined