# User-Based Collaborative Filtering

Start by importing the MovieLens 100K data set into a pandas DataFrame:

In [1]:
import pandas as pd

# Column names assigned to the leading columns read from each MovieLens file.
r_cols = ['user_id', 'movie_id', 'rating']
m_cols = ['movie_id', 'title']

# u.data is read as tab-separated; only its first three columns are kept.
ratings = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=r_cols,
    usecols=range(3),
    encoding="ISO-8859-1",
)

# u.item is read as pipe-separated; only its first two columns are kept.
movies = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    names=m_cols,
    usecols=range(2),
    encoding="ISO-8859-1",
)

# Attach a title to every rating. movie_id is the only column the two frames
# share, so it is the join key.
ratings = pd.merge(movies, ratings, on='movie_id')

ratings.head()

Unnamed: 0,movie_id,title,user_id,rating
0,1,Toy Story (1995),308,4
1,1,Toy Story (1995),287,5
2,1,Toy Story (1995),148,4
3,1,Toy Story (1995),280,4
4,1,Toy Story (1995),66,3


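Construct a matrix with one row per movie title and one column per user, where each cell holds that user's rating of the movie (NaN if the user did not rate it).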

In [2]:
# Reshape the long ratings table into a title x user_id matrix of ratings.
# Movies a user has not rated show up as NaN.
userRatings = ratings.pivot_table(values='rating', index='title', columns='user_id')
userRatings.head()

user_id,0,1,2,3,4,5,6,7,8,9,...,934,935,936,937,938,939,940,941,942,943
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,
1-900 (1994),,,,,,,,,,,...,,,,,,,,,,
101 Dalmatians (1996),,2.0,,,,2.0,,,,,...,2.0,,,2.0,4.0,,,,,
12 Angry Men (1957),,5.0,,,,,4.0,4.0,,,...,,,,,,,,,,
187 (1997),,,,2.0,,,,,,,...,,,,,,,,,,


Pandas has a built-in corr() method that will compute a correlation score for every column pair in the matrix. Because each column is a user, this gives us a similarity score for every pair of users.

In [3]:
# Pairwise Pearson correlation (pandas' default method) between the columns
# of userRatings, i.e. between users.
corrMatrix = userRatings.corr(method='pearson')
corrMatrix.head()

user_id,0,1,2,3,4,5,6,7,8,9,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,1.0,,,,,,-0.5,,,...,,,,,,,,,,
1,1.0,1.0,0.160841,0.11278,0.5,0.420809,0.29541,0.258137,0.692086,-0.102062,...,0.061695,-0.260242,0.386346,0.029,0.326744,0.5343904,0.263289,0.205616,-0.180784,0.067549
2,,0.160841,1.0,0.06742,0.148522,0.327327,0.446966,0.643675,0.585491,0.242536,...,0.029341,-0.271163,0.214017,0.566724,0.331587,1.380822e-16,-0.011682,-0.062017,0.08596,0.479702
3,,0.11278,0.06742,1.0,-0.2626,,-0.109109,0.064803,0.291937,,...,0.0,,-0.045162,0.0,-0.137523,,-0.104678,1.0,-0.011792,
4,,0.5,0.148522,-0.2626,1.0,1.0,-0.581318,-0.266632,0.642938,,...,0.866025,,-0.203653,,0.375,,0.850992,1.0,0.412568,


However, we want to avoid spurious results that come from pairs of users who happened to rate only a handful of the same movies. To restrict our results to user pairs with enough overlap to make the correlation meaningful, we'll use the min_periods argument to throw out correlations where two users have fewer than 10 rated movies in common:

In [14]:
# Minimum number of movies two users must both have rated before their
# correlation is kept; pairs with less overlap come out as NaN.
MIN_COMMON_RATINGS = 10

# Recompute the user-to-user Pearson correlations with the overlap threshold.
corrMatrix = userRatings.corr(method='pearson', min_periods=MIN_COMMON_RATINGS)
corrMatrix.head()

user_id,0,1,2,3,4,5,6,7,8,9,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,1.0,0.160841,,,0.420809,0.29541,0.258137,0.692086,,...,0.061695,-0.260242,0.386346,0.029,0.326744,0.5343904,0.263289,0.205616,-0.180784,0.067549
2,,0.160841,1.0,,,,0.446966,0.643675,,,...,0.029341,-0.271163,0.214017,0.566724,0.331587,1.380822e-16,-0.011682,,0.08596,
3,,,,1.0,-0.2626,,-0.109109,0.064803,,,...,,,-0.045162,,-0.137523,,-0.104678,,,
4,,,,-0.2626,1.0,,,-0.266632,,,...,,,,,,,0.850992,,,


Now let's produce some movie recommendations for user ID 4.

In [49]:
# Movies rated by user 4, highest rating first; unrated movies (NaN) are dropped.
myRatings = userRatings[4].dropna().sort_values(ascending=False)
print(len(myRatings))
myRatings

24


title
Air Force One (1997)                         5.0
In & Out (1997)                              5.0
Wedding Singer, The (1998)                   5.0
Ulee's Gold (1997)                           5.0
Star Wars (1977)                             5.0
Lost Highway (1997)                          5.0
Assignment, The (1997)                       5.0
Incognito (1997)                             5.0
Liar Liar (1997)                             5.0
Desperate Measures (1998)                    5.0
Cop Land (1997)                              5.0
Contact (1997)                               5.0
Blues Brothers 2000 (1998)                   5.0
Wonderland (1997)                            5.0
Event Horizon (1997)                         4.0
One Flew Over the Cuckoo's Nest (1975)       4.0
Scream (1996)                                4.0
Seven (Se7en) (1995)                         4.0
Starship Troopers (1997)                     4.0
Indiana Jones and the Last Crusade (1989)    3.0
Mimic (1997)  

In [64]:
# Correlation of each user with user 4, strongest match first. Users with no
# usable correlation (NaN) are removed; user 4 itself is still in the result.
similar_users = corrMatrix.loc[:, 4].dropna().sort_values(ascending=False)
similar_users

user_id
4      1.000000
940    0.850992
683    0.658311
332    0.575854
773    0.569267
         ...   
551   -0.374351
466   -0.407804
782   -0.443637
889   -0.449467
608   -0.681677
Name: 4, Length: 104, dtype: float64

In [65]:
# Keep only the strongly matching users (correlation above 0.5) and remove the
# target user (ID 4) from the list.
# errors='ignore' keeps this cell safe to re-run: it reassigns similar_users
# from itself, so on a second run user 4 is already gone and a plain drop()
# would raise KeyError.
similar_users = similar_users[similar_users > 0.5].drop(index=4, errors='ignore')
similar_users

user_id
940    0.850992
683    0.658311
332    0.575854
773    0.569267
896    0.565819
592    0.559405
276    0.534052
758    0.533852
464    0.527339
Name: 4, dtype: float64

In [110]:
# Collect the titles rated by each similar user, one user after another.
# A title appears once for every similar user who rated it.
similar_movies = [
    title
    for user_id in similar_users.index
    for title in userRatings[user_id].dropna().index
]

sim_movies = pd.Series(similar_movies)
print(sim_movies.head(10))

0                                    Abyss, The (1989)
1    Adventures of Priscilla, Queen of the Desert, ...
2                                 Air Force One (1997)
3                                       Aladdin (1992)
4                           Alice in Wonderland (1951)
5                                         Alien (1979)
6                           Alien: Resurrection (1997)
7                                        Aliens (1986)
8                                       Amadeus (1984)
9                       American President, The (1995)
dtype: object


In [113]:
# Tally how often each title occurs, i.e. how many of the similar users rated
# it; value_counts() returns the most frequent titles first.
sim_movies_count = sim_movies.value_counts()
# Spot check: number of similar users who rated Alice in Wonderland.
print(sim_movies_count.loc['Alice in Wonderland (1951)'])
print(sim_movies_count)

4
Contact (1997)                        9
Star Wars (1977)                      8
Godfather, The (1972)                 8
L.A. Confidential (1997)              8
Scream (1996)                         8
                                     ..
Celluloid Closet, The (1995)          1
Bronx Tale, A (1993)                  1
Brassed Off (1996)                    1
Bottle Rocket (1996)                  1
French Twist (Gazon maudit) (1995)    1
Length: 851, dtype: int64


In [124]:
# Recommend movies rated by more than 6 of the similar users, leaving out
# anything the target user has already rated.
# errors='ignore' is needed because some of the user's own movies are not in
# sim_movies_count; without it drop() would raise KeyError for those titles.
recommended_movies = (
    sim_movies_count
    .loc[sim_movies_count > 6]
    .drop(labels=myRatings.index, errors='ignore')
)

print(recommended_movies)

Godfather, The (1972)                8
L.A. Confidential (1997)             8
Usual Suspects, The (1995)           8
Return of the Jedi (1983)            8
Silence of the Lambs, The (1991)     7
Pulp Fiction (1994)                  7
Terminator 2: Judgment Day (1991)    7
Speed (1994)                         7
Titanic (1997)                       7
Twelve Monkeys (1995)                7
Devil's Advocate, The (1997)         7
Empire Strikes Back, The (1980)      7
Raiders of the Lost Ark (1981)       7
Back to the Future (1985)            7
Aliens (1986)                        7
dtype: int64


End!!