## Imports & Configurations

In [109]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

In [110]:
MOVIES_PATH = r"C:\Users\lunaf\Desktop\Projects\jellyfin-movie-recommender\data\raw\movies.csv"
RATINGS_PATH = r"C:\Users\lunaf\Desktop\Projects\jellyfin-movie-recommender\data\raw\ratings.csv"
LINKS_PATH = r"C:\Users\lunaf\Desktop\Projects\jellyfin-movie-recommender\data\raw\links.csv"
AVG_RATING = 3
RATING_COUNT = 2000

options = Options()
#options.add_argument("--headless")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
wait = WebDriverWait(driver, 10)

## Data Analysis and Feature Engineering

In [111]:
movies_df = pd.read_csv(MOVIES_PATH)
ratings_df = pd.read_csv(RATINGS_PATH)
links_df = pd.read_csv(LINKS_PATH)

In [112]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [113]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119


In [114]:
links_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [115]:
movie_data = pd.merge(ratings_df, movies_df, on = 'movieId')
movie_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,1225734739,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,1,5.0,835815971,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.0,974518024,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,10,1,3.0,1430666394,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,12,1,5.0,862500738,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [116]:
movie_ratings = (
    movie_data.groupby("title")["rating"]
    .agg(avg_rating=lambda x: round(x.mean(), 1), rating_counts="count")
    .reset_index()
)

In [117]:
movie_ratings.head()

Unnamed: 0,title,avg_rating,rating_counts
0,(2019),2.7,37
1,"""BLOW THE NIGHT!"" Let's Spend the Night Togeth...",2.0,2
2,"""Great Performances"" Cats (1998)",2.9,275
3,"""Sr."" (2022)",3.2,11
4,#1 Cheerleader Camp (2010),2.2,15


In [118]:
top_list = movie_ratings[(movie_ratings['avg_rating'] > AVG_RATING) & (movie_ratings['rating_counts'] > RATING_COUNT)]
top_list = top_list.sort_values(by='avg_rating', ascending = False)
top_list.head()

Unnamed: 0,title,avg_rating,rating_counts
51240,Planet Earth II (2016),4.5,2041
7305,Band of Brothers (2001),4.4,2835
51239,Planet Earth (2006),4.4,3015
57966,"Shawshank Redemption, The (1994)",4.4,122296
27011,"Godfather, The (1972)",4.3,75004


In [119]:
top10 = top_list.head(10)
print(top10)

                                             title  avg_rating  rating_counts
51240                       Planet Earth II (2016)         4.5           2041
7305                       Band of Brothers (2001)         4.4           2835
51239                          Planet Earth (2006)         4.4           3015
57966             Shawshank Redemption, The (1994)         4.4         122296
27011                        Godfather, The (1972)         4.3          75004
77946                   Usual Suspects, The (1995)         4.3          72893
27012               Godfather: Part II, The (1974)         4.3          47271
57521  Seven Samurai (Shichinin no samurai) (1954)         4.3          17120
49890                              Parasite (2019)         4.3          12399
256                            12 Angry Men (1957)         4.3          22730


In [120]:
merged_df = pd.merge(top_list, movies_df, on= "title")

In [121]:
merged_df = pd.merge(merged_df, links_df, on="movieId")

In [90]:
merged_df.head()

Unnamed: 0,title,avg_rating,rating_counts,movieId,genres,imdbId,tmdbId
0,Planet Earth II (2016),4.5,2041,171011,Documentary,5491994,420714.0
1,Band of Brothers (2001),4.4,2835,170705,Action|Drama|War,185906,331214.0
2,Planet Earth (2006),4.4,3015,159817,Documentary,795176,192040.0
3,"Shawshank Redemption, The (1994)",4.4,122296,318,Crime|Drama,111161,278.0
4,"Godfather, The (1972)",4.3,75004,858,Crime|Drama,68646,238.0


In [122]:
merged_df.shape

(2436, 7)

In [123]:
merged_df[merged_df["movieId"] == 171011]

Unnamed: 0,title,avg_rating,rating_counts,movieId,genres,imdbId,tmdbId
0,Planet Earth II (2016),4.5,2041,171011,Documentary,5491994,420714.0


In [124]:
from typing import List, Dict, Any

class UserTest:
    def __init__(self, userId: int, ratings: list = []):
        self.userId = userId
        self.ratings = []
        self.watchedList = []

    def _get_user_info(self) -> List[Dict[str, Any]]:
        print(f"User's id is: {self.userId} ")
        print(f"User's ratings are: {self.ratings}")
        print(f"User's watched list is: {self.watchedList}")

    def _add_rating(self, movieId, rating):
        rating = {movieId, rating}
        self.ratings.append(rating)

    def _add_movies_to_list(self, movieId):
        self.watchedList.append(movieId)

In [125]:
furkan = UserTest(userId = 1)

In [126]:
furkan._add_rating(171011, 4)

In [127]:
furkan._add_movies_to_list(171011)

In [128]:
furkan._get_user_info()

User's id is: 1 
User's ratings are: [{171011, 4}]
User's watched list is: [171011]


## Trending Based Recommendation

I want to handle popularity-based recommendations in two ways: the first representing overall yearly popularity across the movie sector, and the second focusing on recent trends (maybe from the last two years)

In [129]:
merged_df.head()

Unnamed: 0,title,avg_rating,rating_counts,movieId,genres,imdbId,tmdbId
0,Planet Earth II (2016),4.5,2041,171011,Documentary,5491994,420714.0
1,Band of Brothers (2001),4.4,2835,170705,Action|Drama|War,185906,331214.0
2,Planet Earth (2006),4.4,3015,159817,Documentary,795176,192040.0
3,"Shawshank Redemption, The (1994)",4.4,122296,318,Crime|Drama,111161,278.0
4,"Godfather, The (1972)",4.3,75004,858,Crime|Drama,68646,238.0


As you can see, we have the imdbId, ratings, and movieId, but no information about the movie’s release date. Now, we need to get the release year from IMDb using the imdbId.

In [None]:
release_dates = []

for imdb_id in merged_df['imdbId']:
    imdb_id_str = str(imdb_id).zfill(7)  # 7 haneli hale getir
    url = f"https://www.imdb.com/title/tt{imdb_id_str}/"
    print(f"Şu an {url} sayfasına gidiliyor...")

    try:
        driver.get(url)
        release_elem = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "a[href*='releaseinfo']")))
        release_date = release_elem.text
        print(f"Bulunan release date: {release_date}")
    except Exception as e:
        print(f"Hata oluştu: {e}")
        release_date = None

    release_dates.append(release_date)

driver.quit()

merged_df['release_date'] = release_dates
print(merged_df)

Şu an https://www.imdb.com/title/tt5491994/ sayfasına gidiliyor...
Bulunan release date: 2016
Şu an https://www.imdb.com/title/tt0185906/ sayfasına gidiliyor...
Bulunan release date: 2001
Şu an https://www.imdb.com/title/tt0795176/ sayfasına gidiliyor...
Bulunan release date: 2006
Şu an https://www.imdb.com/title/tt0111161/ sayfasına gidiliyor...
Bulunan release date: 1994
Şu an https://www.imdb.com/title/tt0068646/ sayfasına gidiliyor...
Bulunan release date: 1972
Şu an https://www.imdb.com/title/tt0114814/ sayfasına gidiliyor...
Bulunan release date: 1995
Şu an https://www.imdb.com/title/tt0071562/ sayfasına gidiliyor...
Bulunan release date: 1974
Şu an https://www.imdb.com/title/tt0047478/ sayfasına gidiliyor...
Bulunan release date: 1954
Şu an https://www.imdb.com/title/tt6751668/ sayfasına gidiliyor...
