In [14]:
import pandas as pd
ratings_cols = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv(
    '../../datasets/movie_ratings.data',
    sep='\t', 
    names=ratings_cols, 
    usecols=range(3), 
    encoding="ISO-8859-1"
)
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,0,50,5
1,0,172,5
2,0,133,1
3,196,242,3
4,186,302,3


In [19]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100003 entries, 0 to 100002
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype
---  ------    --------------   -----
 0   user_id   100003 non-null  int64
 1   movie_id  100003 non-null  int64
 2   rating    100003 non-null  int64
dtypes: int64(3)
memory usage: 2.3 MB


In [24]:
ratings.groupby('user_id').size()

user_id
0        3
1      272
2       62
3       54
4       24
      ... 
939     49
940    107
941     22
942     79
943    168
Length: 944, dtype: int64

In [11]:
movie_cols = ['movie_id', 'title']
movies = pd.read_csv(
    '../../datasets/movie_items.item', 
    sep='|', 
    usecols=range(2), 
    encoding="ISO-8859-1",
    names=movie_cols
)

movies.head()

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [20]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  1682 non-null   int64 
 1   title     1682 non-null   object
dtypes: int64(1), object(1)
memory usage: 26.4+ KB


In [16]:
# Merge movies and ratings

movies_ratings = pd.merge(movies, ratings)
movies_ratings.head()

Unnamed: 0,movie_id,title,user_id,rating
0,1,Toy Story (1995),308,4
1,1,Toy Story (1995),287,5
2,1,Toy Story (1995),148,4
3,1,Toy Story (1995),280,4
4,1,Toy Story (1995),66,3


In [25]:
# Build a matrix of users vs movies
movie_matrix = movies_ratings.pivot_table(index=['user_id'], columns=['title'], values='rating')
movie_matrix.head(10)

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,2.0,,,,,4.0,,,...,,,,4.0,,,,,4.0,
6,,,,4.0,,,,5.0,,,...,,,,4.0,,,,,,
7,,,,4.0,,,5.0,5.0,,4.0,...,,,,5.0,3.0,,3.0,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,4.0,...,,,,,,,,,,


In [28]:
# Let's pick one movie and find similar movie recommendations

TS_rating = movie_matrix['Toy Story (1995)']
TS_rating.head()

user_id
0    NaN
1    5.0
2    4.0
3    NaN
4    NaN
Name: Toy Story (1995), dtype: float64

In [30]:
# correlate Toy Story with other movies
similar_movies = movie_matrix.corrwith(TS_rating)
similar_movies = similar_movies.dropna()
similar_movies.head()

title
'Til There Was You (1997)      0.534522
101 Dalmatians (1996)          0.232118
12 Angry Men (1957)            0.334943
187 (1997)                     0.651857
2 Days in the Valley (1996)    0.162728
dtype: float64

In [34]:
similar_movies.sort_values(ascending=False)

title
Ladybird Ladybird (1994)                                                             1.0
Albino Alligator (1996)                                                              1.0
Old Lady Who Walked in the Sea, The (Vieille qui marchait dans la mer, La) (1991)    1.0
Critical Care (1997)                                                                 1.0
Late Bloomers (1996)                                                                 1.0
                                                                                    ... 
Feast of July (1995)                                                                -1.0
Love and Death on Long Island (1997)                                                -1.0
Slingshot, The (1993)                                                               -1.0
Stalker (1979)                                                                      -1.0
Heavy (1995)                                                                        -1.0
Length: 1370, d

In [33]:
similar_df = pd.DataFrame(similar_movies)
similar_df.head(10)

Unnamed: 0_level_0,0
title,Unnamed: 1_level_1
'Til There Was You (1997),0.534522
101 Dalmatians (1996),0.232118
12 Angry Men (1957),0.334943
187 (1997),0.651857
2 Days in the Valley (1996),0.162728
"20,000 Leagues Under the Sea (1954)",0.328472
2001: A Space Odyssey (1968),-0.06906
"39 Steps, The (1935)",0.150055
8 1/2 (1963),-0.117259
8 Heads in a Duffel Bag (1997),0.5
