<a href="https://colab.research.google.com/github/disha2sinha/Movie-Recommendation-System/blob/master/Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**IMPORTING LIBRARIES:**

In [1]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
!pip install rake_nltk
from rake_nltk import Rake 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

  import pandas.util.testing as tm




In [None]:
# Authenticate the Colab user, then build a PyDrive client that uses the
# application-default Google credentials. Must run before any Drive download.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)  # Drive client reused by the download cells
# Share link of the movies CSV (its file id is used in the next cell):
#https://drive.google.com/file/d/1HcaON_5Qz-bfqlRyDgVWfdS4bD0hQ0Z3/view?usp=sharing

In [None]:
# Download the movies CSV from Google Drive and load it into a DataFrame.
# The Drive file id lives in a descriptive name so that this cell does not
# shadow the builtin `id()`.
movies_file_id = '1HcaON_5Qz-bfqlRyDgVWfdS4bD0hQ0Z3'
downloaded = drive.CreateFile({'id': movies_file_id})
downloaded.GetContentFile('MoviesData.csv')  # saves the Drive file locally under this name
# The first CSV column is used as the DataFrame index.
movies_data = pd.read_csv('MoviesData.csv', engine='python', index_col=0)
movies_data.head()

# **POPULARITY BASED RECOMMENDATION SYSTEM :**

In [None]:
# Summary statistics of the vote, rating and popularity columns.
rating_stat_columns = ['vote_count', 'vote_average', 'rating_count', 'mean_rating', 'popularity']
movies_data[rating_stat_columns].describe()

**WEIGHTED AVERAGE OF EACH MOVIE'S AVERAGE RATING:**

Weighted rating:

$$W = \frac{Rv + Cm}{v + m}$$

R = average rating for the movie, as a number from 0 to 10

v = number of votes for the movie

m = minimum number of votes required to be listed in the top chart (here 2000)

C = the mean vote across the whole dataset


In [None]:
# Inputs of the weighted-rating formula, taken from the vote_* columns.
vote_average = movies_data['vote_average']
R = vote_average                # per-movie average vote
v = movies_data['vote_count']   # per-movie number of votes
C = vote_average.mean()         # mean of vote_average over all movies
m = 2000                        # fixed vote-count threshold used in the formula

In [None]:
# Weighted vote score per movie: (R*v + C*m) / (v + m).
weighted_votes_numerator = R * v + C * m
weighted_votes_denominator = v + m
movies_data['weighted_votes'] = weighted_votes_numerator / weighted_votes_denominator

# Show the 20 movies with the highest weighted vote score.
(
    movies_data[['title', 'weighted_votes']]
    .sort_values('weighted_votes', ascending=False)
    .head(20)
)

In [None]:
# Inputs of the weighted-rating formula, taken from the rating columns.
mean_rating = movies_data['mean_rating']
R1 = mean_rating                # per-movie mean rating
v1 = movies_data['rating_count']  # per-movie number of ratings
C1 = mean_rating.mean()         # mean of mean_rating over all movies
m1 = v1.quantile(0.95)          # threshold: 95th percentile of rating_count
m1

In [None]:
# Weighted rating score per movie: (R1*v1 + C1*m1) / (v1 + m1).
weighted_ratings_numerator = R1 * v1 + C1 * m1
weighted_ratings_denominator = v1 + m1
movies_data['weighted_ratings'] = weighted_ratings_numerator / weighted_ratings_denominator

# Show the 20 movies with the highest weighted rating score.
(
    movies_data[['title', 'weighted_ratings']]
    .sort_values('weighted_ratings', ascending=False)
    .head(20)
)

**ASSIGNING SCORES TO THE MOVIES BY GIVING 45% IMPORTANCE TO WEIGHTED AVERAGE OF VOTES + 45% IMPORTANCE TO POPULARITY + 10% TO WEIGHTED AVERAGE OF RATINGS**

In [None]:
# Min-max scale the three score components; the scaled values overwrite the
# original columns of movies_data.
score_components = ['weighted_votes', 'weighted_ratings', 'popularity']
scaler = MinMaxScaler()
movies_data[score_components] = scaler.fit_transform(movies_data[score_components])

# Combined score: 45% weighted votes + 10% weighted ratings + 45% popularity.
movies_data['score'] = (
    movies_data['weighted_votes'] * 0.45
    + movies_data['weighted_ratings'] * 0.1
    + movies_data['popularity'] * 0.45
)

# Show the 10 highest-scoring movies together with their components.
(
    movies_data[['title', 'weighted_votes', 'weighted_ratings', 'popularity', 'score']]
    .sort_values('score', ascending=False)
    .head(10)
)

**ONLY MOVIES HAVING SCORE ABOVE A THRESHOLD VALUE CONSIDERED POPULAR :**

In [None]:
# Summary statistics of the combined score column.
movies_data.score.describe()

In [None]:
# Number of movies (non-null movieId) whose score is at least 0.0992.
meets_score_cutoff = movies_data['score'] >= 0.0992
movies_data.loc[meets_score_cutoff, 'movieId'].count()

In [None]:
# Keep only the movies whose score is at least 0.0992, restricted to the
# columns used downstream, ordered from highest to lowest score.
popular_columns = [
    'movieId', 'title', 'budget', 'profit', 'score', 'release_year',
    'release_day', 'genres', 'production_countries', 'production_companies',
    'original_language', 'runtime', 'content', 'status', 'cast', 'director',
]
above_cutoff = movies_data['score'] >= 0.0992
popular_movies = movies_data[above_cutoff]
popular_movies = (
    popular_movies[popular_columns]
    .sort_values('score', ascending=False)
    .reset_index(drop=True)
)

# Horizontal bar chart of the 20 highest-scoring movies, saved to disk.
top_twenty = popular_movies.head(20)
plt.figure(figsize=(30, 10))
axis = sns.barplot(x=top_twenty['score'], y=top_twenty['title'])
plt.title('Top 20 Most popular movies: ', weight='bold')
plt.xlabel('score', weight='bold')
plt.ylabel('Movies', weight='bold')
plt.savefig('PopularMovies.jpg')

In [None]:
# (rows, columns) of the popular-movies table.
popular_movies.shape

In [None]:
# Preview the first rows of the popular-movies table.
popular_movies.head()

**IMPORTING USER_RATINGS DATASET:**

In [None]:
# Download the user-ratings CSV from Google Drive and load it into a DataFrame.
# Share link: https://drive.google.com/file/d/1nVaAmYBm8xnDR8ynF9eX0qY06_c5ZzqG/view?usp=sharing
# The Drive file id lives in a descriptive name so that this cell does not
# shadow the builtin `id()`.
ratings_file_id = '1nVaAmYBm8xnDR8ynF9eX0qY06_c5ZzqG'
downloaded = drive.CreateFile({'id': ratings_file_id})
downloaded.GetContentFile('RevisedRatings.csv')  # saves the Drive file locally under this name
# The first CSV column is used as the DataFrame index.
user_ratings = pd.read_csv('RevisedRatings.csv', engine='python', index_col=0)
user_ratings.head()

**FILTERING USERS BY THE NUMBER OF RATINGS THEY GAVE:** Users who gave 50 or fewer ratings are discarded.

In [None]:
# Count the ratings submitted by each user (one row per userId).
ratings_per_user = user_ratings.groupby('userId')['rating'].count()
user_numberOfRatings = ratings_per_user.to_frame('count_rating')

# Keep only users with more than 50 ratings and report how many remain.
has_enough_ratings = user_numberOfRatings['count_rating'] > 50
user_numberOfRatings = user_numberOfRatings[has_enough_ratings]
len(user_numberOfRatings)

In [None]:
# (rows, columns) of the full ratings table, before any filtering.
user_ratings.shape

In [None]:
# Attach titles to the ratings; the inner join keeps only ratings of movies
# that are present in popular_movies.
popular_titles = popular_movies[['movieId', 'title']]
users_movies = user_ratings.merge(popular_titles, how='inner', on='movieId')

# Restrict to the users retained in user_numberOfRatings (its index holds the userIds).
selectedusers = user_numberOfRatings.index.tolist()
final_ratings = users_movies[users_movies['userId'].isin(selectedusers)]
final_ratings.shape

In [None]:
# Preview the filtered ratings joined with movie titles.
final_ratings.head()

# **CONTENT - BASED RECOMMENDATION SYSTEM ON POPULAR MOVIES**

In [None]:
# Profit of the row labelled 0, i.e. the highest-scoring movie after the
# descending sort and index reset applied when popular_movies was built.
popular_movies.loc[0,'profit']

In [None]:
def extract_important_words(text):
    """Return the words RAKE keeps for ``text`` as a list.

    The words are the keys of the word-degree mapping that ``Rake`` builds
    for the text. A fresh ``Rake`` instance is created on every call so that
    no state is shared between movies.
    """
    rake = Rake()
    rake.extract_keywords_from_text(text)
    return list(rake.get_word_degrees().keys())


# Columns whose string forms are concatenated (space-separated, in this
# order) into one free-text "keywords" field per movie.
keyword_columns = [
    'cast', 'production_companies', 'production_countries', 'content',
    'genres', 'director', 'release_day', 'release_year', 'budget', 'profit',
    'score', 'runtime', 'status', 'original_language',
]
keyword_parts = [popular_movies[column].apply(str) for column in keyword_columns]
keyword_text = keyword_parts[0]
for part in keyword_parts[1:]:
    keyword_text = keyword_text + " " + part
popular_movies['keywords'] = keyword_text

# Build the list-valued column in one pass with .apply instead of writing
# lists cell by cell (iterrows + .at) into a column first initialised with
# empty strings; the latter depends on that column's dtype accepting
# arbitrary Python objects.
popular_movies['important_words'] = popular_movies['keywords'].apply(extract_important_words)


In [None]:
# Collapse each movie's word list into one comma-separated string.
# Entries that are already strings are left untouched, so re-running this
# cell is harmless; without the guard a second run would join the individual
# characters of the existing string with commas.
popular_movies['important_words'] = [
    words if isinstance(words, str) else ','.join(map(str, words))
    for words in popular_movies['important_words']
]
popular_movies.head()

In [None]:
# Turn each movie's important-words string into a token-count vector.
cv = CountVectorizer()
count_matrix = cv.fit_transform(popular_movies['important_words'])

# Pairwise cosine similarity of every movie against every other movie;
# with a single argument the matrix is compared with itself.
cosine_sim = cosine_similarity(count_matrix)
cosine_sim