In [57]:
import numpy as np
import pandas as pd


In [58]:
ratings_data = pd.read_table('ml-1m/ratings.dat', sep = '::',
                            header = None, names = ['userId','movieId','rating','timestamp'])
movies_data = pd.read_table('ml-1m/movies.dat', sep = '::',header = None, names = ['movieId','title','genres'])

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [59]:
pd.set_option('max_rows', 20)

In [60]:
ratings_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
userId       1000209 non-null int64
movieId      1000209 non-null int64
rating       1000209 non-null int64
timestamp    1000209 non-null int64
dtypes: int64(4)
memory usage: 30.5 MB


In [61]:
movies_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
movieId    3883 non-null int64
title      3883 non-null object
genres     3883 non-null object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB


In [62]:
missing_values = ['na','--','?','-','None','none','non']

In [63]:
#extract the year from the title
movies_data['year'] = movies_data.title.str.extract('(\(\d\d\d\d\))', expand = False)

In [64]:
movies_data.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Animation|Children's|Comedy,(1995)
1,2,Jumanji (1995),Adventure|Children's|Fantasy,(1995)
2,3,Grumpier Old Men (1995),Comedy|Romance,(1995)
3,4,Waiting to Exhale (1995),Comedy|Drama,(1995)
4,5,Father of the Bride Part II (1995),Comedy,(1995)


In [65]:
#removing the year from the title column
movies_data['title'] = movies_data.title.str.replace('(\(\d\d\d\d\))', '')

In [66]:
movies_data.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Animation|Children's|Comedy,(1995)
1,2,Jumanji,Adventure|Children's|Fantasy,(1995)
2,3,Grumpier Old Men,Comedy|Romance,(1995)
3,4,Waiting to Exhale,Comedy|Drama,(1995)
4,5,Father of the Bride Part II,Comedy,(1995)


In [67]:
movies_data['title'] = movies_data['title'].apply(lambda x: x.strip())

In [68]:
movies_data['genres'] = movies_data.genres.str.split('|')
movies_data.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Animation, Children's, Comedy]",(1995)
1,2,Jumanji,"[Adventure, Children's, Fantasy]",(1995)
2,3,Grumpier Old Men,"[Comedy, Romance]",(1995)
3,4,Waiting to Exhale,"[Comedy, Drama]",(1995)
4,5,Father of the Bride Part II,[Comedy],(1995)


In [69]:
movies_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 4 columns):
movieId    3883 non-null int64
title      3883 non-null object
genres     3883 non-null object
year       3883 non-null object
dtypes: int64(1), object(3)
memory usage: 121.4+ KB


In [70]:
movies_with_genres= movies_data.copy(deep = True)

In [71]:
#this is to get the onehotencoder version of the genres column
x = []
for index, row in movies_data.iterrows():
#     print(row)
    x.append(index)
    for genre in row['genres']:
#         print(genre)
        movies_with_genres.at[index,genre] = 1
print(len(x) == len(movies_data))

True


In [72]:
movies_with_genres.head(3)

Unnamed: 0,movieId,title,genres,year,Animation,Children's,Comedy,Adventure,Fantasy,Romance,...,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1,Toy Story,"[Animation, Children's, Comedy]",(1995),1.0,1.0,1.0,,,,...,,,,,,,,,,
1,2,Jumanji,"[Adventure, Children's, Fantasy]",(1995),,1.0,,1.0,1.0,,...,,,,,,,,,,
2,3,Grumpier Old Men,"[Comedy, Romance]",(1995),,,1.0,,,1.0,...,,,,,,,,,,


In [73]:
movies_with_genres.fillna(0, inplace = True)

In [74]:
movies_with_genres.head()

Unnamed: 0,movieId,title,genres,year,Animation,Children's,Comedy,Adventure,Fantasy,Romance,...,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1,Toy Story,"[Animation, Children's, Comedy]",(1995),1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children's, Fantasy]",(1995),0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",(1995),0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama]",(1995),0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],(1995),0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [75]:
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [76]:
ratings_data.drop('timestamp', axis = 1, inplace = True)

In [96]:
Lawrence_movie_ratings = [
    {'title': 'Predator',
    'rating':4.9,
    },
    {'title':'Final Destination', 'rating': 4.9},
    {'title':'Mission Impossible', 'rating': 4},
    {'title': 'Beverly Hills Cop','rating' :3},
    {'title': 'Exorcist, The','rating':4.8},
    {'title': 'Waiting to Exhale','rating':3.9},
    {'title': 'Avengers, The', 'rating':4.5},
    {'title': 'Omen, The','rating':5.0}
]


In [97]:
Lawrence_movie_ratings = pd.DataFrame(Lawrence_movie_ratings)

In [98]:
Lawrence_movie_ratings

Unnamed: 0,rating,title
0,4.9,Predator
1,4.9,Final Destination
2,4.0,Mission Impossible
3,3.0,Beverly Hills Cop
4,4.8,"Exorcist, The"
5,3.9,Waiting to Exhale
6,4.5,"Avengers, The"
7,5.0,"Omen, The"


In [99]:
#find the movies that are in lawrence movie list
Lawrence_movie_id = movies_data[movies_data['title'].isin(Lawrence_movie_ratings['title'])]

In [100]:
print(Lawrence_movie_id)

      movieId              title                      genres    year
3           4  Waiting to Exhale             [Comedy, Drama]  (1995)
1329     1350          Omen, The                    [Horror]  (1976)
1928     1997      Exorcist, The                    [Horror]  (1973)
2084     2153      Avengers, The         [Action, Adventure]  (1998)
3340     3409  Final Destination           [Drama, Thriller]  (2000)
3458     3527           Predator  [Action, Sci-Fi, Thriller]  (1987)


In [101]:
Lawrence_movie_ratings = pd.merge(Lawrence_movie_id, Lawrence_movie_ratings)

In [102]:
Lawrence_movie_ratings

Unnamed: 0,movieId,title,genres,year,rating
0,4,Waiting to Exhale,"[Comedy, Drama]",(1995),3.9
1,1350,"Omen, The",[Horror],(1976),5.0
2,1997,"Exorcist, The",[Horror],(1973),4.8
3,2153,"Avengers, The","[Action, Adventure]",(1998),4.5
4,3409,Final Destination,"[Drama, Thriller]",(2000),4.9
5,3527,Predator,"[Action, Sci-Fi, Thriller]",(1987),4.9


In [103]:
Lawrence_genres = movies_with_genres[movies_with_genres.movieId.isin(Lawrence_movie_id.movieId)]

In [104]:
Lawrence_genres

Unnamed: 0,movieId,title,genres,year,Animation,Children's,Comedy,Adventure,Fantasy,Romance,...,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
3,4,Waiting to Exhale,"[Comedy, Drama]",(1995),0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1329,1350,"Omen, The",[Horror],(1976),0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1928,1997,"Exorcist, The",[Horror],(1973),0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2084,2153,"Avengers, The","[Action, Adventure]",(1998),0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3340,3409,Final Destination,"[Drama, Thriller]",(2000),0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3458,3527,Predator,"[Action, Sci-Fi, Thriller]",(1987),0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [105]:
Lawrence_genres.reset_index(drop = True, inplace = True)

In [106]:
Lawrence_genres.drop(['movieId','title','genres','year'], axis = 1, inplace = True)

In [107]:
Lawrence_genres

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [108]:
print('Shape of Lawrence_movie_ratings is:', Lawrence_movie_ratings.shape)

Shape of Lawrence_movie_ratings is: (6, 5)


In [109]:
print('Shape of Lawrence_genres_df is:', Lawrence_genres.shape)

Shape of Lawrence_genres_df is: (6, 18)


In [110]:
#getting the weights of the users choice of movies
Lawrence_profile = Lawrence_genres.T.dot(Lawrence_movie_ratings.rating)

In [111]:
Lawrence_profile

Animation      0.0
Children's     0.0
Comedy         3.9
Adventure      4.5
Fantasy        0.0
Romance        0.0
Drama          8.8
Action         9.4
Crime          0.0
Thriller       9.8
Horror         9.8
Sci-Fi         4.9
Documentary    0.0
War            0.0
Musical        0.0
Mystery        0.0
Film-Noir      0.0
Western        0.0
dtype: float64

In [112]:
movies_with_genres = movies_with_genres.set_index(movies_with_genres.movieId)

In [113]:
movies_with_genres.head()

Unnamed: 0_level_0,movieId,title,genres,year,Animation,Children's,Comedy,Adventure,Fantasy,Romance,...,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,Toy Story,"[Animation, Children's, Comedy]",(1995),1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,Jumanji,"[Adventure, Children's, Fantasy]",(1995),0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Grumpier Old Men,"[Comedy, Romance]",(1995),0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,Waiting to Exhale,"[Comedy, Drama]",(1995),0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5,Father of the Bride Part II,[Comedy],(1995),0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [114]:
movies_with_genres.drop(['movieId','title','genres','year'],axis = 1, inplace = True)

In [115]:
movies_with_genres.head()

Unnamed: 0_level_0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [120]:
print(movies_with_genres.shape)

(3883, 18)


In [121]:
#this is to get the weights associated with each genre
recommendation_table = (movies_with_genres.dot(Lawrence_profile))/Lawrence_profile.sum()

In [122]:
recommendation_table

movieId
1       0.076321
2       0.088063
3       0.076321
4       0.248532
5       0.076321
6       0.375734
7       0.076321
8       0.088063
9       0.183953
10      0.463796
          ...   
3943    0.076321
3944    0.248532
3945    0.088063
3946    0.547945
3947    0.191781
3948    0.076321
3949    0.172211
3950    0.172211
3951    0.172211
3952    0.363992
Length: 3883, dtype: float64

In [123]:
recommendation_table.sort_values(ascending = False, inplace = True)

In [125]:
recommendation_table.head(20)

movieId
1214    0.663405
1320    0.663405
2288    0.663405
2617    0.655577
70      0.643836
1876    0.643836
1215    0.636008
2344    0.636008
2826    0.567515
1544    0.559687
1127    0.559687
1129    0.559687
1591    0.559687
2322    0.559687
610     0.559687
2916    0.559687
849     0.559687
1917    0.559687
2488    0.555773
1626    0.547945
dtype: float64

In [126]:
copy = movies_data.copy(deep = True)

In [127]:
copy = copy.set_index('movieId', drop = True)


In [129]:
top_20_index = recommendation_table.index[:20].tolist()

In [130]:
recommended_movies = copy.loc[top_20_index, :]

In [131]:
recommended_movies

Unnamed: 0_level_0,title,genres,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1214,Alien,"[Action, Horror, Sci-Fi, Thriller]",(1979)
1320,Alien³,"[Action, Horror, Sci-Fi, Thriller]",(1992)
2288,"Thing, The","[Action, Horror, Sci-Fi, Thriller]",(1982)
2617,"Mummy, The","[Action, Adventure, Horror, Thriller]",(1999)
70,From Dusk Till Dawn,"[Action, Comedy, Crime, Horror, Thriller]",(1996)
1876,Deep Impact,"[Action, Drama, Sci-Fi, Thriller]",(1998)
1215,Army of Darkness,"[Action, Adventure, Comedy, Horror, Sci-Fi]",(1993)
2344,Runaway Train,"[Action, Adventure, Drama, Thriller]",(1985)
2826,"13th Warrior, The","[Action, Horror, Thriller]",(1999)
1544,"Lost World: Jurassic Park, The","[Action, Adventure, Sci-Fi, Thriller]",(1997)
