In [1]:
import os
import secrets

import pandas as pd

In [2]:
# Load the movie dataset from 'imdb_movies.csv' (path is relative to the
# working directory) and preview the first rows.
df = pd.read_csv('imdb_movies.csv')
df.head()

Unnamed: 0,names,date_x,score,genre,overview,crew,orig_title,status,orig_lang,budget_x,revenue,country
0,Creed III,03/02/2023,73.0,"Drama, Action","After dominating the boxing world, Adonis Cree...","Michael B. Jordan, Adonis Creed, Tessa Thompso...",Creed III,Released,English,75000000.0,271616700.0,AU
1,Avatar: The Way of Water,12/15/2022,78.0,"Science Fiction, Adventure, Action",Set more than a decade after the events of the...,"Sam Worthington, Jake Sully, Zoe Saldaña, Neyt...",Avatar: The Way of Water,Released,English,460000000.0,2316795000.0,AU
2,The Super Mario Bros. Movie,04/05/2023,76.0,"Animation, Adventure, Family, Fantasy, Comedy","While working underground to fix a water main,...","Chris Pratt, Mario (voice), Anya Taylor-Joy, P...",The Super Mario Bros. Movie,Released,English,100000000.0,724459000.0,AU
3,Mummies,01/05/2023,70.0,"Animation, Comedy, Family, Adventure, Fantasy","Through a series of unfortunate events, three ...","Óscar Barberán, Thut (voice), Ana Esther Albor...",Momias,Released,"Spanish, Castilian",12300000.0,34200000.0,AU
4,Supercell,03/17/2023,61.0,Action,Good-hearted teenager William always lived in ...,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin...",Supercell,Released,English,77000000.0,340942000.0,US


In [3]:
# Count missing values per column.
df.isnull().sum()


names          0
date_x         0
score          0
genre         85
overview       0
crew          56
orig_title     0
status         0
orig_lang      0
budget_x       0
revenue        0
country        0
dtype: int64

In [4]:
# (rows, columns) of the loaded frame.
df.shape


(10178, 12)

In [5]:
# Combined text per movie: the title, a single space, then the overview.
# This column is what the TF-IDF vectorizer is later fitted on.
df['text'] = df['names'].str.cat(df['overview'], sep=' ')
df['text']

0        Creed III After dominating the boxing world, A...
1        Avatar: The Way of Water Set more than a decad...
2        The Super Mario Bros. Movie While working unde...
3        Mummies Through a series of unfortunate events...
4        Supercell Good-hearted teenager William always...
                               ...                        
10173    20th Century Women In 1979 Santa Barbara, Cali...
10174    Delta Force 2: The Colombian Connection When D...
10175    The Russia House Barley Scott Blair, a Lisbon-...
10176    Darkman II: The Return of Durant Darkman and D...
10177    The Swan Princess: A Royal Wedding Princess Od...
Name: text, Length: 10178, dtype: object

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [7]:
# TF-IDF vectorizer configured with scikit-learn's built-in English stop-word list
tfidf = TfidfVectorizer(stop_words='english')
# fit on the combined title + overview text; one row per movie
tfidf_matrix = tfidf.fit_transform(df['text'])
# Pairwise dot products between every pair of movie rows. This equals cosine
# similarity only if each TF-IDF row has unit length (presumably the case under
# TfidfVectorizer's default normalisation; confirm).
# NOTE(review): the result is expected to be a dense movies x movies array,
# i.e. about 10178 x 10178 floats for the frame loaded above; watch memory.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [8]:
# Map each movie name to the row label of its first occurrence in df.
# De-duplication has to happen on the index (the names): Series.drop_duplicates()
# compares the values (the row labels), which are already unique, so it would
# keep every repeated name and indices[name] would return a Series instead of
# a single label.
indices = pd.Series(df.index, index=df['names'])
indices = indices[~indices.index.duplicated(keep='first')]

In [9]:
def get_recommendations(title, cosine_sim=cosine_sim, count=10):
    """Return the titles most similar to the first movie whose name contains ``title``.

    Parameters
    ----------
    title : str
        Text to look for in ``df['names']``. It is matched literally (not as a
        regular expression) and case-insensitively.
    cosine_sim : array-like, optional
        Square similarity matrix in which row/column ``i`` corresponds to row
        ``i`` of ``df``. Defaults to the module-level ``cosine_sim`` as it
        exists at the moment this function is defined.
    count : int, optional
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``count`` entries of ``df['names']``, most similar first. The
        matched movie itself is never part of the result.

    Raises
    ------
    IndexError
        If no movie name contains ``title``.
    """
    # regex=False: user-supplied text such as "M*A*S*H" or "(500) Days" must
    # not be interpreted as a regular expression.
    matches = df['names'].str.contains(title, case=False, na=False, regex=False)
    if not matches.any():
        raise IndexError(f'No movie title contains {title!r}')

    # Positional row of the first match. Using the position directly, rather
    # than a name -> index lookup, stays unambiguous when several movies share
    # the same name.
    idx = int(matches.to_numpy().argmax())

    # Exclude the query movie by position instead of assuming it sorts first:
    # other rows can tie with it at the top score.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [i for i, _ in sim_scores[:max(count, 0)]]
    return df['names'].iloc[movie_indices]


In [10]:
# All movie titles in df, listed for reference.
movie_names = df['names']
# NOTE(review): this renders every title in the frame (10178 rows per df.shape
# above); consider showing only a slice to keep the notebook readable.
movie_names.to_list()


['Creed III',
 'Avatar: The Way of Water',
 'The Super Mario Bros. Movie',
 'Mummies',
 'Supercell',
 'Cocaine Bear',
 'John Wick: Chapter 4',
 'Puss in Boots: The Last Wish',
 'Attack on Titan',
 'The Park',
 'Winnie the Pooh: Blood and Honey',
 'The Exorcist',
 'Murder Mystery 2',
 'Black Panther: Wakanda Forever',
 "The Pope's Exorcist",
 'Prizefighter: The Life of Jem Belcher',
 'Knock at the Cabin',
 'The Devil Conspiracy',
 'Cazadora',
 'Gold Run',
 "The Magician's Elephant",
 'Plane',
 'The Passion of the Christ',
 'Batman: The Doom That Came to Gotham',
 'Shazam! Fury of the Gods',
 'Consecration',
 'Shark Side of the Moon',
 'Black Adam',
 'Money Shot: The Pornhub Story',
 'M3GAN',
 'Ant-Man and the Wasp: Quantumania',
 'Sayen',
 'Die Hart',
 '13 Exorcisms',
 "H.P. Lovecraft's Witch House",
 'John Wick: Chapter 2',
 'Sick',
 'Black Warrant',
 'Shotgun Wedding',
 'John Wick: Chapter 3 - Parabellum',
 'Legion of Super-Heroes',
 'Fall',
 'Lord of the Streets',
 'Little Dixie',
 '

In [11]:
# Example query: the ten titles most similar to the first movie whose name contains "Croods".
get_recommendations('Croods', count=10)

997                      The Croods
4923    Don't Be Afraid of the Dark
2924                      Neighbors
6407        The Woman in the Window
9252                 Paradise Hills
2741              Coming to America
3488                The Black Demon
1541                       Woodlawn
2582                    Beetlejuice
9646                Out of the Dark
Name: names, dtype: object

In [12]:
from flask import Flask, request, flash, redirect, url_for, render_template

app = Flask(__name__)
# Session/flash signing key. Read it from the environment so no real key is
# stored in the notebook; without FLASK_SECRET_KEY a random key is generated
# for this process only (sessions will not survive a restart).
app.secret_key = os.environ.get('FLASK_SECRET_KEY') or secrets.token_hex(32)

@app.route('/results', methods=['GET','POST'])
def result():
    """Handle the recommendation form and render the recommended movies.

    Non-POST requests are redirected to the ``forminput`` endpoint
    (NOTE(review): that endpoint is not defined in this notebook; confirm it
    is registered elsewhere). On POST the form fields ``name``, ``by`` and
    ``count`` are read, recommendations are looked up and each one is enriched
    through ``movie_data_from_tmdb`` before ``results.html`` is rendered.

    Any validation or lookup failure flashes a ``danger`` message and
    redirects back to ``forminput``.
    """
    if request.method != 'POST':
        return redirect(url_for('forminput'))

    name = request.form['name']
    by = request.form['by']
    raw_count = request.form['count']
    app.logger.debug('name: %s, by: %s, count: %s', name, by, raw_count)

    # A non-numeric count must produce a user-facing message, not a ValueError.
    try:
        count = int(raw_count)
    except ValueError:
        flash('Count must be a whole number', 'danger')
        return redirect(url_for('forminput'))

    # Reject unknown modes explicitly so ``movies`` can never be left unbound.
    if by not in ('name', 'word'):
        flash('Unknown search mode', 'danger')
        return redirect(url_for('forminput'))

    # get_recommendations(title, cosine_sim, count) takes no ``by`` argument,
    # so both modes go through its case-insensitive substring title match.
    try:
        movies = get_recommendations(name, count=count)
    except IndexError:
        # Raised by get_recommendations when no title matches.
        flash('Movie not found', 'danger')
        return redirect(url_for('forminput'))

    # Keep honouring the string sentinel, but only compare when the result
    # really is a string: comparing a pandas Series inside ``if`` raises.
    if isinstance(movies, str) and movies == 'Movie not found':
        flash('Movie not found', 'danger')
        return redirect(url_for('forminput'))

    # Explicit emptiness test; ``not movies`` is ambiguous for a pandas Series.
    if movies is None or len(movies) == 0:
        flash('No movies available', 'danger')
        return redirect(url_for('forminput'))

    results = [movie_data_from_tmdb(movie) for movie in movies]
    return render_template('results.html', name=name, by=by, count=count, results=results)

def movie_data_from_tmdb(movie):
    """Placeholder lookup: echo ``movie`` as the title with a fixed overview.

    Returns a dict with the keys ``'title'`` and ``'overview'``. No network
    access is performed.
    """
    return dict(title=movie, overview='Some overview')

In [13]:
# Same example query as in the earlier cell, re-run after the Flask cell was defined.
get_recommendations('Croods', count=10)

997                      The Croods
4923    Don't Be Afraid of the Dark
2924                      Neighbors
6407        The Woman in the Window
9252                 Paradise Hills
2741              Coming to America
3488                The Black Demon
1541                       Woodlawn
2582                    Beetlejuice
9646                Out of the Dark
Name: names, dtype: object

In [14]:
# NOTE(review): this passes the whole Series of recommendations as a single
# "movie", so the returned 'title' holds the entire Series (see the output
# below). The route maps movie_data_from_tmdb over each title instead.
movie_data_from_tmdb(get_recommendations('Croods', count=10))

{'title': 997                      The Croods
 4923    Don't Be Afraid of the Dark
 2924                      Neighbors
 6407        The Woman in the Window
 9252                 Paradise Hills
 2741              Coming to America
 3488                The Black Demon
 1541                       Woodlawn
 2582                    Beetlejuice
 9646                Out of the Dark
 Name: names, dtype: object,
 'overview': 'Some overview'}

In [None]:
import requests
def movie_data_from_tmdb(movie_name):
    """Look up a movie on TMDB by name and return display metadata for it.

    The first hit of TMDB's movie search for ``movie_name`` is used, and its
    details are fetched with a second request.

    Parameters
    ----------
    movie_name : str
        Title to search for.

    Returns
    -------
    dict
        Keys ``'movie'`` (the input name), ``'poster'`` (image URL or ``None``),
        ``'genres'`` (list of genre names), ``'link'`` (homepage),
        ``'imdb_id'`` and ``'overview'``. When the search has no hits, every
        field except ``'movie'`` is ``None`` (``'genres'`` is an empty list).

    Raises
    ------
    RuntimeError
        If the ``TMDB_API_TOKEN`` environment variable is not set.
    requests.HTTPError
        If TMDB answers with an error status.
    """
    # Credentials come from the environment so they never end up in the notebook.
    token = os.environ.get('TMDB_API_TOKEN')
    if not token:
        raise RuntimeError(
            'Set the TMDB_API_TOKEN environment variable to a TMDB API read access token'
        )
    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {token}",
    }

    # Pass the query through ``params`` so spaces, '&', '#', etc. in the title
    # are URL-encoded instead of corrupting the query string.
    search = requests.get(
        "https://api.themoviedb.org/3/search/movie",
        params={
            "query": movie_name,
            "include_adult": "false",
            "language": "en-US",
            "page": 1,
        },
        headers=headers,
        timeout=10,
    )
    search.raise_for_status()

    hits = search.json().get('results') or []
    if not hits:
        # Unknown title: return the usual shape with empty fields rather than
        # failing with an IndexError on ``results[0]``.
        return {
            'movie': movie_name,
            'poster': None,
            'genres': [],
            'link': None,
            'imdb_id': None,
            'overview': None
        }

    # take the first result
    movie_id = hits[0]['id']
    details = requests.get(
        f"https://api.themoviedb.org/3/movie/{movie_id}",
        params={"language": "en-US"},
        headers=headers,
        timeout=10,
    )
    details.raise_for_status()
    movie_data = details.json()

    # A missing poster path must not turn into a URL ending in "None".
    poster_path = movie_data.get('poster_path')
    poster = f"https://image.tmdb.org/t/p/w500{poster_path}" if poster_path else None
    genres = [genre['name'] for genre in movie_data.get('genres') or []]
    return {
        'movie': movie_name,
        'poster': poster,
        'genres': genres,
        'link': movie_data.get('homepage'),
        'imdb_id': movie_data.get('imdb_id'),
        'overview': movie_data.get('overview')
    }

# Manual smoke test: performs live HTTP requests against the TMDB API.
movie_data_from_tmdb('Avengers')


{'movie': 'Avengers',
 'poster': 'https://image.tmdb.org/t/p/w500/k3LGf9afqmUZiAJ9nffBinpOhOI.jpg',
 'genres': ['Animation', 'Comedy', 'Science Fiction'],
 'link': 'https://www.disneyplus.com/movies/lego-marvel-avengers-mission-demolition/2SFUclISa5af',
 'imdb_id': 'tt33653255',
 'overview': 'A young, aspiring hero and superhero fan inadvertently unleashes a powerful new villain looking to rid the world of the Avengers.'}

In [15]:
# simple example of tf idf

from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: four short documents.
corpus = [
    'The quick brown fox jumps over the lazy dog.',
    'The dog',
    'The fox',
    'none is in the list'
]

# Fit a vectorizer with default settings and transform the corpus in one step.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Learned vocabulary (the column order of the matrix).
print(vectorizer.get_feature_names_out(), '\n')

# Densify once, then show the weight vector of each document.
dense_rows = X.toarray()
print('1:', dense_rows[0], '\n')
print('2:', dense_rows[1], '\n')
print('3:', dense_rows[2])
print('4:', dense_rows[3])



['brown' 'dog' 'fox' 'in' 'is' 'jumps' 'lazy' 'list' 'none' 'over' 'quick'
 'the'] 

1: [0.36929648 0.29115758 0.29115758 0.         0.         0.36929648
 0.36929648 0.         0.         0.36929648 0.36929648 0.38542844] 

2: [0.         0.83388421 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.55193942] 

3: [0.         0.         0.83388421 0.         0.         0.
 0.         0.         0.         0.         0.         0.55193942]
4: [0.         0.         0.         0.48380259 0.48380259 0.
 0.         0.48380259 0.48380259 0.         0.         0.25246826]


In [16]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend(corpus, query):
    """Print the cosine similarity between ``query`` and every document in ``corpus``.

    A fresh ``TfidfVectorizer`` is fitted on ``corpus``; ``query`` is then
    transformed with that fitted vectorizer and compared against each document
    row. The resulting similarity array is printed; nothing is returned.
    """
    model = TfidfVectorizer()
    doc_matrix = model.fit_transform(corpus)
    query_matrix = model.transform([query])
    print(cosine_similarity(doc_matrix, query_matrix))


In [17]:
# Score every document of the toy corpus against the single-word query "dog".
recommend(corpus, 'dog')

[[0.29115758]
 [0.83388421]
 [0.        ]
 [0.        ]]
