In [7]:
import pandas as pd


# Grabbing data

In [8]:
# Read the movie catalogue from disk and preview its first five rows.
movies_df = pd.read_csv("movies.csv")
movies_df.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
# Read the per-user ratings table from disk and preview its first ten rows.
user_ratings_df = pd.read_csv("user_ratings.csv")
user_ratings_df.head(n=10)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
5,18,1,3.5,1455209816,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
6,19,1,4.0,965705637,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
7,21,1,3.5,1407618878,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
8,27,1,3.0,962685262,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
9,31,1,5.0,850466616,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


## Finding the Average Rating from Users 

In [18]:
# Mean rating per title: group the ratings by title and average only the
# "rating" column. The result is a one-column frame indexed by title.
average_rating_df = user_ratings_df.groupby("title")[["rating"]].mean()
average_rating_df

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
'71 (2014),4.000000
'Hellboy': The Seeds of Creation (2004),4.000000
'Round Midnight (1986),3.500000
'Salem's Lot (2004),5.000000
'Til There Was You (1997),4.000000
...,...
eXistenZ (1999),3.863636
xXx (2002),2.770833
xXx: State of the Union (2005),2.000000
¡Three Amigos! (1986),3.134615


## Sorting the Values by Rating

In [20]:
# Order the per-title averages from highest to lowest rating.
sorted_average_rating_df = average_rating_df.sort_values("rating", ascending=False)
sorted_average_rating_df

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
Gena the Crocodile (1969),5.0
True Stories (1986),5.0
Cosmic Scrat-tastrophe (2015),5.0
Love and Pigeons (1985),5.0
Red Sorghum (Hong gao liang) (1987),5.0
...,...
Don't Look Now (1973),0.5
Journey 2: The Mysterious Island (2012),0.5
Joe Dirt 2: Beautiful Loser (2015),0.5
Jesus Christ Vampire Hunter (2001),0.5


We need to make sure the average ratings shown refer to movies that have been rated at least a minimum number of times; otherwise a movie with a single 5.0 rating can top the list.

To do so, we build a list of the movies that appear more than 50 times in the ratings data.

In [22]:
# Number of rating rows per title, most frequently rated titles first.
movie_popularity = user_ratings_df.loc[:, "title"].value_counts()
movie_popularity

title
Forrest Gump (1994)                 329
Shawshank Redemption, The (1994)    317
Pulp Fiction (1994)                 307
Silence of the Lambs, The (1991)    279
Matrix, The (1999)                  278
                                   ... 
Sex, Drugs & Taxation (2013)          1
Extraordinary Tales (2015)            1
Tomorrow (2015)                       1
Embrace of the Serpent (2016)         1
31 (2016)                             1
Name: count, Length: 9719, dtype: int64

Let's now filter on the number of times each movie was rated.

In [25]:
# A title must have strictly more than this many ratings to count as "popular".
# Kept as a named constant so the threshold can be tuned in one place.
MIN_RATING_COUNT = 50

# Filter the per-title counts by the threshold; the index of the filtered
# Series holds the qualifying titles.
popular_movies = movie_popularity[movie_popularity > MIN_RATING_COUNT].index
popular_movies

Index(['Forrest Gump (1994)', 'Shawshank Redemption, The (1994)',
       'Pulp Fiction (1994)', 'Silence of the Lambs, The (1991)',
       'Matrix, The (1999)', 'Star Wars: Episode IV - A New Hope (1977)',
       'Jurassic Park (1993)', 'Braveheart (1995)',
       'Terminator 2: Judgment Day (1991)', 'Schindler's List (1993)',
       ...
       'Chasing Amy (1997)', 'Mystic River (2003)',
       'Grand Budapest Hotel, The (2014)', 'Blow (2001)',
       'Army of Darkness (1993)', 'Training Day (2001)', 'Bad Boys (1995)',
       'Splash (1984)', 'Mulholland Drive (2001)',
       'The Devil's Advocate (1997)'],
      dtype='object', name='title', length=437)

Now we are able to use this list of movies to filter the original dataset. 

In [26]:
# Keep only the rating rows whose title is in the popular-movies list.
popular_movies_rankings = user_ratings_df.loc[
    user_ratings_df["title"].isin(popular_movies)
]
popular_movies_rankings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


Then we compute the average rating per movie as before, this time over the popular movies only:

In [28]:
# Mean rating per title, computed over the popular movies only.
popular_movies_average_rankings = (
    popular_movies_rankings
    .groupby("title")[["rating"]]
    .mean()
)
# Display best-rated titles first; the stored frame itself is left unsorted.
popular_movies_average_rankings.sort_values("rating", ascending=False)

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
"Shawshank Redemption, The (1994)",4.429022
"Godfather, The (1972)",4.289062
Fight Club (1999),4.272936
Cool Hand Luke (1967),4.271930
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964),4.268041
...,...
Johnny Mnemonic (1995),2.679245
Judge Dredd (1995),2.669355
City Slickers II: The Legend of Curly's Gold (1994),2.645455
Coneheads (1993),2.420635


## Pair of Movies
Next, we need to find pairs of movies that were rated by the same user.
It is important to include both orderings of each pair (movie B given movie A, and movie A given movie B).

In [38]:
from AC_recommend import find_movie_pairs

We can now apply `find_movie_pairs` to the user ratings data:

In [39]:
# Build the movie pairs user by user: titles are grouped by userId and
# find_movie_pairs (imported from AC_recommend) is applied to each group.
# The group keys are then discarded in favour of a fresh 0..n-1 index.
movie_combinations = (
    user_ratings_df
    .groupby("userId")["title"]
    .apply(find_movie_pairs)
    .reset_index(drop=True)
)
movie_combinations

Unnamed: 0,0,1
0,Toy Story (1995),Grumpier Old Men (1995)
1,Toy Story (1995),Heat (1995)
2,Toy Story (1995),Seven (a.k.a. Se7en) (1995)
3,Toy Story (1995),"Usual Suspects, The (1995)"
4,Toy Story (1995),From Dusk Till Dawn (1996)
...,...,...
60793295,31 (2016),Gen-X Cops (1999)
60793296,31 (2016),Bloodmoon (1997)
60793297,31 (2016),Sympathy for the Underdog (1971)
60793298,31 (2016),Hazard (2005)


In [45]:
# Give the two pair columns (labelled 0 and 1 in the frame displayed above)
# descriptive names.
new_column_names = {0: 'movie_a', 1: 'movie_b'}
# inplace=True modifies the existing movie_combinations frame instead of
# returning a renamed copy, so the frame displayed by the previous cell is
# the same object that now carries the new column names.
movie_combinations.rename(columns= new_column_names, inplace=True)
movie_combinations

Unnamed: 0,movie_a,movie_b
0,Toy Story (1995),Grumpier Old Men (1995)
1,Toy Story (1995),Heat (1995)
2,Toy Story (1995),Seven (a.k.a. Se7en) (1995)
3,Toy Story (1995),"Usual Suspects, The (1995)"
4,Toy Story (1995),From Dusk Till Dawn (1996)
...,...,...
60793295,31 (2016),Gen-X Cops (1999)
60793296,31 (2016),Bloodmoon (1997)
60793297,31 (2016),Sympathy for the Underdog (1971)
60793298,31 (2016),Hazard (2005)


Now we can count how often each pair of movies occurs. These counts will help us make recommendations.

In [48]:
# Count how many times each ordered (movie_a, movie_b) pair occurs.
movie_combinations_counts = (
    movie_combinations
    .groupby(by=["movie_a", "movie_b"])
    .size()
)
# Display the most frequent pairs first; the stored Series is left unsorted.
movie_combinations_counts.sort_values(ascending=False)

movie_a                                    movie_b                              
Shawshank Redemption, The (1994)           Forrest Gump (1994)                      231
Forrest Gump (1994)                        Shawshank Redemption, The (1994)         231
                                           Pulp Fiction (1994)                      230
Pulp Fiction (1994)                        Forrest Gump (1994)                      230
Shawshank Redemption, The (1994)           Pulp Fiction (1994)                      222
                                                                                   ... 
History Boys, The (2006)                   8 Seconds (1994)                           1
                                           7th Voyage of Sinbad, The (1958)           1
                                           6th Day, The (2000)                        1
                                           54 (1998)                                  1
À nous la liberté (Freedom for Us) (193

In [51]:
# Flatten the pair counts into a regular table: the (movie_a, movie_b) index
# levels become columns and the counts go into a column named "size".
combination_counts_df = movie_combinations_counts.reset_index(name="size")


TypeError: DataFrame.sort_values() missing 1 required positional argument: 'by'

In [54]:
# Show the pair counts with the most frequent pairs first.
combination_counts_df.sort_values("size", ascending=False)

Unnamed: 0,movie_a,movie_b,size
20672049,"Shawshank Redemption, The (1994)",Forrest Gump (1994),231
8538313,Forrest Gump (1994),"Shawshank Redemption, The (1994)",231
8537603,Forrest Gump (1994),Pulp Fiction (1994),230
18567035,Pulp Fiction (1994),Forrest Gump (1994),230
20675275,"Shawshank Redemption, The (1994)",Pulp Fiction (1994),222
...,...,...,...
10737102,"History Boys, The (2006)",8 Seconds (1994),1
10737100,"History Boys, The (2006)","7th Voyage of Sinbad, The (1958)",1
10737099,"History Boys, The (2006)","6th Day, The (2000)",1
10737098,"History Boys, The (2006)",54 (1998),1
