In [None]:
# Absolute location of the course assignments folder; it lives under the
# Google Drive mount point that drive.mount() creates below.
CONST_path_prefix = '/content/drive/MyDrive/ENTR 3901/assignments'

# Output folder name. NOTE(review): this is a relative path and is not used by
# any cell visible here -- confirm what it is expected to be relative to.
CONST_path_output = 'ENTR 3901/'

# Colab-only: expose the user's Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install tmdbv3api python-dotenv nltk scikit-learn textblob streamlit
import nltk
nltk.download('vader_lexicon')




[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [None]:
import os

import numpy as np
import pandas as pd
import requests
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from textblob import TextBlob


In [None]:
# TMDB API settings used by the request helpers below.
# The key is taken from the TMDB_API_KEY environment variable when it is set
# (and non-empty); otherwise the original literal is used so existing runs
# keep working.
# NOTE(review): the literal fallback is a credential stored in the notebook --
# rotate it and drop the fallback once the environment variable is configured.
API_KEY = os.environ.get("TMDB_API_KEY") or "765f721b002191fdc6a324061701eed7"
BASE_URL = "https://api.themoviedb.org/3"


In [None]:
def get_movie_details(movie_id):
    """Fetch one movie (with its credits) from the TMDB API and flatten it.

    Parameters
    ----------
    movie_id : int or str
        Identifier placed in the ``/movie/{movie_id}`` path.

    Returns
    -------
    dict
        Keys: ``id``, ``title``, ``year`` (first four characters of the
        release date, ``""`` when unknown), ``genres`` (list of names),
        ``overview``, ``rating``, ``votes``, ``runtime``, ``cast`` (up to five
        names) and ``director`` (first crew member whose job is "Director",
        or ``None``).

    Notes
    -----
    The HTTP status is deliberately not checked: an error payload simply
    yields a record made of ``None``/default values, as before.
    ``requests`` may raise a timeout error if the server does not answer
    within 10 seconds.
    """
    response = requests.get(
        f"{BASE_URL}/movie/{movie_id}",
        # Let requests build and escape the query string.
        params={"api_key": API_KEY, "append_to_response": "credits"},
        # Without a timeout a stalled connection would hang the notebook.
        timeout=10,
    )
    res = response.json()

    # `dict.get(key, default)` only covers *missing* keys; the API can also
    # send explicit nulls, so fall back with `or` to keep the types stable.
    credits = res.get("credits") or {}
    cast = credits.get("cast") or []
    crew = credits.get("crew") or []

    return {
        "id": res.get("id"),
        "title": res.get("title"),
        "year": (res.get("release_date") or "")[:4],
        "genres": [g["name"] for g in (res.get("genres") or [])],
        "overview": res.get("overview") or "",
        "rating": res.get("vote_average") or 0,
        "votes": res.get("vote_count") or 0,
        "runtime": res.get("runtime") or 0,
        "cast": [c["name"] for c in cast[:5]],
        "director": next(
            # .get() so a crew entry without a "job" field is skipped, not fatal.
            (c["name"] for c in crew if c.get("job") == "Director"),
            None
        )
    }


In [None]:
def get_popular_movies(n=200):
    """Collect detailed records for the first ``n`` movies of TMDB's popular list.

    Pages of ``/movie/popular`` are requested in order, and every listed movie
    is expanded with ``get_movie_details``.

    Parameters
    ----------
    n : int, default 200
        Maximum number of movies to return. Values <= 0 give an empty list.

    Returns
    -------
    list of dict
        At most ``n`` records; fewer if the listing runs out of results.

    Raises
    ------
    requests.HTTPError
        If a listing request returns an HTTP error status (previously this
        surfaced as a ``KeyError`` on the missing ``"results"`` field).
    KeyError
        If a successful response has no ``"results"`` field.
    """
    movies = []
    page = 1

    while len(movies) < n:
        response = requests.get(
            f"{BASE_URL}/movie/popular",
            params={"api_key": API_KEY, "page": page},
            timeout=10,  # never block the notebook on a stalled connection
        )
        response.raise_for_status()
        data = response.json()["results"]

        # An empty page means there is nothing left to fetch; without this
        # guard the loop would keep requesting pages forever.
        if not data:
            break

        # Only take as many entries as are still needed to reach n.
        for m in data[: n - len(movies)]:
            movies.append(get_movie_details(m["id"]))
        page += 1

    return movies

# Build the working catalogue: fetch the popular titles (adjust n as needed)
# and load the per-movie dicts into a DataFrame used by the cells below.
movies = get_popular_movies(n=150)
df = pd.DataFrame(data=movies)
df.head(5)


Unnamed: 0,id,title,year,genres,overview,rating,votes,runtime,cast,director
0,1062722,Frankenstein,2025,"[Drama, Horror, Fantasy]","Dr. Victor Frankenstein, a brilliant but egoti...",7.9,1105,150,"[Oscar Isaac, Jacob Elordi, Christoph Waltz, M...",Guillermo del Toro
1,1231813,Sister Swapping,2023,"[Romance, Drama]",Sisters Ji-soo and Ji-yeong decided to sell th...,6.6,14,61,"[Ga Won, White Sugar, Si Woo, Im Tae-mok]",Choi Jong-gyoo
2,1248226,Playdate,2025,"[Action, Comedy]",When out-of-work accountant Brian joins stay-a...,6.8,61,93,"[Kevin James, Alan Ritchson, Sarah Chalke, Isl...",Luke Greenfield
3,1242898,Predator: Badlands,2025,"[Action, Science Fiction, Adventure]","Cast out from his clan, a young Predator finds...",7.247,269,107,"[Elle Fanning, Dimitrius Schuster-Koloamatangi...",Dan Trachtenberg
4,1197137,Black Phone 2,2025,"[Horror, Thriller]","Four years after escaping The Grabber, Finney ...",7.2,442,114,"[Ethan Hawke, Mason Thames, Madeleine McGraw, ...",Scott Derrickson


In [None]:
# Turn each movie overview into a TF-IDF vector, ignoring scikit-learn's
# built-in English stop words. Missing overviews are replaced by "" first.
# tfidf_matrix has one row per row of df, in the same order;
# recommend_content() indexes into it by row position.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(df["overview"].fillna(""))


In [None]:
sia = SentimentIntensityAnalyzer()


def _overview_compound(text):
    """Return the VADER "compound" polarity score of one overview."""
    # str() so non-string cells (e.g. NaN) are scored instead of raising.
    scores = sia.polarity_scores(str(text))
    return scores["compound"]


# Adds a "sentiment" column to df; recommend_sentiment() compares these scores.
df["sentiment"] = df["overview"].apply(_overview_compound)


In [None]:
def recommend_content(movie_title, top_k=10):
    """Recommend movies whose overview text is most similar to a given movie.

    Similarity is the cosine similarity between rows of the module-level
    ``tfidf_matrix``.

    Parameters
    ----------
    movie_title : str
        Exact value to look up in ``df["title"]``; the first match is used.
    top_k : int, default 10
        Maximum number of recommendations. Values <= 0 give an empty frame.

    Returns
    -------
    pandas.DataFrame or list
        Rows of ``df`` (columns ``title``, ``rating``, ``genres``, ``year``)
        ordered from most to least similar, never containing the query movie
        itself. An empty list is returned when the title is not in ``df``
        (kept for backward compatibility).
    """
    # Work with row *positions*: tfidf_matrix is addressed by position, which
    # only coincides with the index labels for a default RangeIndex.
    hits = (df["title"] == movie_title).to_numpy().nonzero()[0]
    if len(hits) == 0:
        return []
    idx = int(hits[0])

    sims = cosine_similarity(tfidf_matrix[idx], tfidf_matrix)[0]

    # Rank every movie by similarity, then drop the query row explicitly.
    # Slicing off "the last element" is wrong whenever the query is not the
    # unique maximum (e.g. an empty overview scores 0 against everything, or
    # another movie has an identical overview).
    ranked = sims.argsort()[::-1]
    sim_idx = ranked[ranked != idx][:max(top_k, 0)]

    return df.iloc[sim_idx][["title", "rating", "genres", "year"]]


In [None]:
def recommend_sentiment(movie_title, top_k=10):
    """Recommend movies whose overview sentiment is closest to a given movie.

    Side effect: (re)writes the ``sent_diff`` column of the module-level
    ``df`` with the absolute sentiment difference to the query movie.
    ``recommend_hybrid`` reads that column, so the mutation is intentional.

    Parameters
    ----------
    movie_title : str
        Exact value to look up in ``df["title"]``; the first match is used.
    top_k : int, default 10
        Maximum number of recommendations. Values <= 0 give an empty frame.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``title`` and ``sentiment``, closest sentiment first, never
        containing the query movie itself. ``None`` when the title is not in
        ``df``.
    """
    target = df[df["title"] == movie_title]
    if target.empty:
        return None

    sentiment = target.iloc[0]["sentiment"]
    df["sent_diff"] = (df["sentiment"] - sentiment).abs()

    # Remove the query row by label instead of assuming it sorts first: other
    # movies can tie at sent_diff == 0 and the default sort is not stable, so
    # skipping "row 0" could drop a real recommendation and keep the query.
    candidates = df.drop(index=target.index[0])
    closest = candidates.sort_values("sent_diff", kind="stable").head(max(top_k, 0))

    return closest[["title", "sentiment"]]


In [None]:
def recommend_hybrid(movie_title, top_k=10):
    """Recommend movies that are close in both overview text and sentiment.

    Candidates are the movies returned by both ``recommend_content`` and
    ``recommend_sentiment`` (each asked for ``top_k * 2`` results). They are
    ranked by ``hybrid_score = sent_diff * 0.3`` (smaller is better), so text
    similarity acts as a filter while sentiment distance decides the order.

    Note: a later cell in this notebook defines ``recommend_hybrid`` again;
    when the cells run in order, that later definition is the one in effect.

    Parameters
    ----------
    movie_title : str
        Exact value to look up in ``df["title"]``.
    top_k : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` (keeping df's index labels) plus ``hybrid_score``,
        best first. Empty when the title is unknown or nothing overlaps.
    """
    content_recs = recommend_content(movie_title, top_k * 2)
    senti_recs = recommend_sentiment(movie_title, top_k * 2)

    # The helpers signal "title not found" with [] and None respectively.
    if senti_recs is None or len(content_recs) == 0:
        return df.head(0).assign(hybrid_score=pd.Series(dtype="float64"))

    # Both results are row subsets of df and keep its index labels, so the
    # overlap can be taken on the index. A key-less merge would join on every
    # shared column, including the list-valued "genres", which is unhashable.
    shared = content_recs.index.intersection(senti_recs.index)
    merged = df.loc[shared].copy()
    merged["hybrid_score"] = merged["sent_diff"] * 0.3  # smaller = better

    return merged.sort_values("hybrid_score").head(max(top_k, 0))


In [None]:
def apply_filters(df, year_range=None, min_rating=None, genre=None):
    """Return the rows of ``df`` that satisfy the optional filters.

    The input frame is not modified; filters that are not supplied are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``year`` column when ``year_range`` is given, ``rating`` when
        ``min_rating`` is given and ``genres`` when ``genre`` is given.
    year_range : sequence of two numbers, optional
        Inclusive ``(first_year, last_year)`` bounds. Rows whose year cannot
        be read as a number (e.g. ``""``) are excluded.
    min_rating : number, optional
        Keep rows with ``rating >= min_rating``. ``0`` is a valid threshold.
    genre : str, optional
        Keep rows whose ``genres`` value contains this genre.

    Returns
    -------
    pandas.DataFrame
        Filtered copy with the same columns as ``df``.
    """
    filtered = df.copy()

    if year_range:
        # "year" holds strings and is "" when the release date is unknown;
        # astype(int) would raise on those, so coerce them to NaN instead.
        # NaN fails both comparisons, which drops the row.
        years = pd.to_numeric(filtered["year"], errors="coerce")
        filtered = filtered[years.between(year_range[0], year_range[1])]

    if min_rating is not None:
        filtered = filtered[filtered["rating"] >= min_rating]

    if genre:
        def _has_genre(genres):
            # Cells that are not containers (None, NaN) simply do not match.
            try:
                return bool(genre in genres)
            except TypeError:
                return False

        # Explicit boolean dtype keeps this a row mask even when `filtered`
        # is already empty.
        mask = pd.Series(
            [_has_genre(g) for g in filtered["genres"]],
            index=filtered.index,
            dtype=bool,
        )
        filtered = filtered[mask]

    return filtered


In [None]:
def recommend_hybrid(movie_title, top_k=5):
    """Recommend movies that are close in both overview text and sentiment.

    This redefines the ``recommend_hybrid`` from the earlier cell; when the
    cells run in order, this is the definition in effect.

    Candidates are the movies returned by both ``recommend_content`` and
    ``recommend_sentiment`` (each asked for ``top_k * 2`` results, i.e. the
    helpers' default of 10 when ``top_k`` is 5). They are ranked by
    ``hybrid_score = sent_diff * 0.3`` (smaller is better).

    Parameters
    ----------
    movie_title : str
        Exact value to look up in ``df["title"]``.
    top_k : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` (keeping df's index labels) plus ``hybrid_score``,
        best first. Empty when the title is unknown or nothing overlaps.
    """
    # Scale the candidate pools with top_k so a larger request is not capped
    # by the helpers' default size.
    content_recs = recommend_content(movie_title, top_k * 2)
    senti_recs = recommend_sentiment(movie_title, top_k * 2)

    # "Title not found" comes back as [] from the content helper and None
    # from the sentiment helper.
    if senti_recs is None or len(content_recs) == 0:
        return df.head(0).assign(hybrid_score=pd.Series(dtype="float64"))

    # The helpers return DataFrames that are row subsets of df (not lists of
    # titles), so intersect their index labels to find the common movies.
    shared = content_recs.index.intersection(senti_recs.index)
    merged = df.loc[shared].copy()

    # Simple hybrid score: sent_diff was written to df by recommend_sentiment.
    merged["hybrid_score"] = merged["sent_diff"] * 0.3

    # Sort and return the top movies.
    return merged.sort_values("hybrid_score").head(max(top_k, 0))
