In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the TMDB credits CSV and preview its first five rows.
credits_csv = '../input/tmdb-movie-metadata/tmdb_5000_credits.csv'
credits = pd.read_csv(credits_csv)
credits.head()

In [None]:
# Read the TMDB movies CSV and preview its first five rows.
movies_csv = '../input/tmdb-movie-metadata/tmdb_5000_movies.csv'
movies = pd.read_csv(movies_csv)
movies.head()

In [None]:
# Dimensions of the credits table as (rows, columns).
credits.shape

In [None]:
# Dimensions of the movies table as (rows, columns).
movies.shape

In [None]:
# Number of missing values in each column of the movies table.
movies.isnull().sum()

## How to get the weighted averages

In [None]:
# Weighted rating: blend each movie's own average with the dataset-wide average.
#   V - number of votes the movie received
#   R - the movie's average vote
#   C - mean of the average vote over all movies
#   m - 70th percentile of the vote counts, used as the reference vote volume
V = movies['vote_count']
R = movies['vote_average']
C = R.mean()
m = V.quantile(0.7)

# The two shares sum to 1: the more votes a movie has relative to m, the more its
# own average counts; with few votes the score is pulled toward C.
own_share = V / (V + m)
global_share = m / (m + V)
movies['weight_avg'] = (own_share * R) + (global_share * C)

# Building a Basic Recommendation System


# Recommendation by Weighted Average Ratings

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Rank every movie by weighted rating, best first, and keep the top ten for plotting.
weight_avg = movies.sort_values(by='weight_avg', ascending=False)
top_weighted = weight_avg.head(10)

# Horizontal bar chart: one bar per title, bar length = weighted rating.
fig, ax = plt.subplots(figsize=(16, 8))
sns.barplot(data=top_weighted, x='weight_avg', y='original_title',
            palette='deep', ax=ax)

ax.set_title("Top Voted Movies")
ax.set_xlabel('Weight Average Score')
ax.set_ylabel('Movie Title')
ax.set_xlim(5, 9)  # x-axis restricted to the 5-9 range
plt.show()

# Recommendation by Popularity

In [None]:
# Rank every movie by the dataset's popularity column, highest first.
popular = movies.sort_values(by='popularity', ascending=False)
top_popular = popular.head(10)

# Horizontal bar chart: one bar per title, bar length = popularity.
fig, ax = plt.subplots(figsize=(16, 8))
sns.barplot(data=top_popular, x='popularity', y='original_title',
            palette='deep', ax=ax)

ax.set_title("Top Popular Movies")
ax.set_xlabel('Popular Score')
ax.set_ylabel('Movie Title')

plt.show()

# Recommendation by Weight and Popularity

In [None]:
from sklearn import preprocessing

# Min-max scale both ranking signals so they are on a common scale before blending.
score_columns = ['weight_avg', 'popularity']
min_max_scaler = preprocessing.MinMaxScaler()
movies_scaler = min_max_scaler.fit_transform(movies[score_columns])

# fit_transform returns a bare NumPy array with no row labels. Reattach the index
# of `movies` so that assigning these columns back to the frame (which aligns on
# index labels) matches rows correctly even if `movies` is not indexed 0..n-1.
movies_norm = pd.DataFrame(movies_scaler, columns=score_columns, index=movies.index)

movies_norm.head(5)

In [None]:
# Relative importance of each normalised signal in the blended score.
WEIGHT_AVG_SHARE = 0.5
POPULARITY_SHARE = 0.5

# Attach the normalised columns to the main table.
movies[['norm_weight_avg', 'norm_popularity']] = movies_norm

# Blended score: weighted sum of the two normalised signals.
movies['score'] = (movies['norm_weight_avg'] * WEIGHT_AVG_SHARE
                   + movies['norm_popularity'] * POPULARITY_SHARE)

# Show the 20 best-scoring movies together with the inputs to their score.
movies_scored = movies.sort_values(by='score', ascending=False)
preview_columns = ['original_title', 'norm_weight_avg', 'norm_popularity', 'score']
movies_scored[preview_columns].head(20)

In [None]:
# Rank every movie by the blended score, best first, and keep the top ten.
score = movies.sort_values(by='score', ascending=False)
top_scored = score.head(10)

# Horizontal bar chart: one bar per title, bar length = blended score.
fig, ax = plt.subplots(figsize=(16, 6))
sns.barplot(data=top_scored, x='score', y='original_title',
            palette='deep', ax=ax)

ax.set_title("Top Rating&Popular Movies")
ax.set_xlabel('Score')
ax.set_ylabel('Movie Title')

plt.show()

# **Content-Based Filtering**

## Recommendation by sharing similar plot summaries

#### Word Vectorization and TF-IDF to convert the text in the overview to word vectors

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

#### Abhishek Thakur's arguments for TF-IDF

In [None]:
# TF-IDF settings: word-level tokens of one or more word characters, accents
# stripped, English stop words removed, unigrams through trigrams, and terms that
# appear in fewer than three documents dropped.
# use_idf / smooth_idf / sublinear_tf are passed as real booleans: scikit-learn
# validates parameter types at fit time and newer releases reject integer
# stand-ins such as 1.
tfv = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)

In [None]:
# Missing overviews become empty strings so the vectorizer receives only text.
overview_text = movies['overview'].fillna('')
movies['overview'] = overview_text

# Learn the vocabulary and build the document-term matrix in one pass,
# then report its shape.
tfv_matrix = tfv.fit_transform(overview_text)
tfv_matrix.shape

## Calculating Similarity Scores


In [None]:
from sklearn.metrics.pairwise import sigmoid_kernel

# Sigmoid-kernel similarity between every pair of rows of the TF-IDF matrix.
sigmoid = sigmoid_kernel(tfv_matrix, tfv_matrix)

# Title -> row position in `movies` (and therefore in the kernel matrix).
# Positions rather than index labels are stored because both the matrix lookup
# and the final `.iloc` are positional. A title that occurs more than once keeps
# only its first row: Series.drop_duplicates() compares the *values*, so it would
# leave repeated titles in place and a lookup would return several rows at once.
indices = pd.Series(np.arange(len(movies)), index=movies['original_title'])
indices = indices[~indices.index.duplicated(keep='first')]


def recommend(title, sig=sigmoid, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Value of ``original_title`` for the movie to start from.
    sig : array-like, optional
        Square similarity matrix whose rows and columns follow the row order of
        ``movies``. Defaults to the sigmoid kernel computed above.
    top_n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        ``original_title`` of up to ``top_n`` movies, most similar first. The
        queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``movies['original_title']``.
    """
    if title not in indices.index:
        raise KeyError(f"No movie titled {title!r} in the dataset")
    index = indices[title]

    # Stable sort on the negated scores: highest similarity first, ties kept in
    # their original row order.
    ranked = np.argsort(-np.asarray(sig[index]), kind='stable')

    # Remove the queried movie by position instead of assuming it is ranked first
    # (it may not be when several movies tie for the top similarity).
    ranked = ranked[ranked != index][:top_n]

    return movies['original_title'].iloc[ranked]

In [None]:
# Example query: titles the recommender ranks closest to 'In Too Deep'.
recommend('In Too Deep')

In [None]:
# Example query: titles the recommender ranks closest to 'Minions'.
recommend('Minions')