## Imports & Configurations

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

In [2]:
MOVIES_PATH = r"C:\Users\lunaf\Desktop\Projects\jellyfin-movie-recommender\data\raw\movies.csv"
RATINGS_PATH = r"C:\Users\lunaf\Desktop\Projects\jellyfin-movie-recommender\data\raw\ratings.csv"
LINKS_PATH = r"C:\Users\lunaf\Desktop\Projects\jellyfin-movie-recommender\data\raw\links.csv"
AVG_RATING = 3
RATING_COUNT = 50000

options = Options()
#options.add_argument("--headless")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
wait = WebDriverWait(driver, 10)

## Data Analysis and Feature Engineering

In [3]:
movies_df = pd.read_csv(MOVIES_PATH)
ratings_df = pd.read_csv(RATINGS_PATH)
links_df = pd.read_csv(LINKS_PATH)

In [4]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119


In [6]:
links_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [7]:
movie_data = pd.merge(ratings_df, movies_df, on = 'movieId')
movie_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,1225734739,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,1,5.0,835815971,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.0,974518024,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,10,1,3.0,1430666394,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,12,1,5.0,862500738,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [8]:
movie_ratings = (
    movie_data.groupby("title")["rating"]
    .agg(avg_rating=lambda x: round(x.mean(), 1), rating_counts="count")
    .reset_index()
)

In [9]:
movie_ratings.head()

Unnamed: 0,title,avg_rating,rating_counts
0,(2019),2.7,37
1,"""BLOW THE NIGHT!"" Let's Spend the Night Togeth...",2.0,2
2,"""Great Performances"" Cats (1998)",2.9,275
3,"""Sr."" (2022)",3.2,11
4,#1 Cheerleader Camp (2010),2.2,15


In [10]:
top_list = movie_ratings[(movie_ratings['avg_rating'] > AVG_RATING) & (movie_ratings['rating_counts'] > RATING_COUNT)]
top_list = top_list.sort_values(by='avg_rating', ascending = False)
top_list.head()

Unnamed: 0,title,avg_rating,rating_counts
57966,"Shawshank Redemption, The (1994)",4.4,122296
77946,"Usual Suspects, The (1995)",4.3,72893
27011,"Godfather, The (1972)",4.3,75004
23600,Fight Club (1999),4.2,86207
58573,"Silence of the Lambs, The (1991)",4.2,101802


In [12]:
top10 = top_list.head(10)
print(top10)

                                  title  avg_rating  rating_counts
57966  Shawshank Redemption, The (1994)         4.4         122296
77946        Usual Suspects, The (1995)         4.3          72893
27011             Godfather, The (1972)         4.3          75004
23600                 Fight Club (1999)         4.2          86207
58573  Silence of the Lambs, The (1991)         4.2         101802
56643           Schindler's List (1993)         4.2          84232
52643               Pulp Fiction (1994)         4.2         108756
32908                  Inception (2010)         4.2          65056
42352                Matrix, The (1999)         4.2         107056
17042           Dark Knight, The (2008)         4.2          65349


In [13]:
merged_df = pd.merge(top_list, movies_df, on= "title")

In [14]:
merged_df = pd.merge(merged_df, links_df, on="movieId")

In [15]:
merged_df.head()

Unnamed: 0,title,avg_rating,rating_counts,movieId,genres,imdbId,tmdbId
0,"Shawshank Redemption, The (1994)",4.4,122296,318,Crime|Drama,111161,278.0
1,"Usual Suspects, The (1995)",4.3,72893,50,Crime|Mystery|Thriller,114814,629.0
2,"Godfather, The (1972)",4.3,75004,858,Crime|Drama,68646,238.0
3,Fight Club (1999),4.2,86207,2959,Action|Crime|Drama|Thriller,137523,550.0
4,"Silence of the Lambs, The (1991)",4.2,101802,593,Crime|Horror|Thriller,102926,274.0


In [16]:
merged_df.shape

(45, 7)

In [17]:
merged_df[merged_df["movieId"] == 171011]

Unnamed: 0,title,avg_rating,rating_counts,movieId,genres,imdbId,tmdbId


In [18]:
from typing import List, Dict, Any

class UserTest:
    def __init__(self, userId: int, ratings: list = []):
        self.userId = userId
        self.ratings = []
        self.watchedList = []

    def _get_user_info(self) -> List[Dict[str, Any]]:
        print(f"User's id is: {self.userId} ")
        print(f"User's ratings are: {self.ratings}")
        print(f"User's watched list is: {self.watchedList}")

    def _add_rating(self, movieId, rating):
        rating = {movieId, rating}
        self.ratings.append(rating)

    def _add_movies_to_list(self, movieId):
        self.watchedList.append(movieId)

In [19]:
furkan = UserTest(userId = 1)

In [20]:
furkan._add_rating(171011, 4)

In [21]:
furkan._add_movies_to_list(171011)

In [22]:
furkan._get_user_info()

User's id is: 1 
User's ratings are: [{171011, 4}]
User's watched list is: [171011]


## Trending Based Recommendation

I want to handle popularity-based recommendations in two ways: the first representing overall yearly popularity across the movie sector, and the second focusing on recent trends (maybe from the last two years)

In [23]:
merged_df.head()

Unnamed: 0,title,avg_rating,rating_counts,movieId,genres,imdbId,tmdbId
0,"Shawshank Redemption, The (1994)",4.4,122296,318,Crime|Drama,111161,278.0
1,"Usual Suspects, The (1995)",4.3,72893,50,Crime|Mystery|Thriller,114814,629.0
2,"Godfather, The (1972)",4.3,75004,858,Crime|Drama,68646,238.0
3,Fight Club (1999),4.2,86207,2959,Action|Crime|Drama|Thriller,137523,550.0
4,"Silence of the Lambs, The (1991)",4.2,101802,593,Crime|Horror|Thriller,102926,274.0


As you can see, we have the imdbId, ratings, and movieId, but no information about the movie’s release date. Now, we need to get the release year from IMDb using the imdbId.

In [24]:
"""
release_dates = []

for imdb_id in merged_df['imdbId']:
    imdb_id_str = str(imdb_id).zfill(7)  # 7 haneli hale getir
    url = f"https://www.imdb.com/title/tt{imdb_id_str}/"
    print(f"Şu an {url} sayfasına gidiliyor...")

    try:
        driver.get(url)
        release_elem = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "a[href*='releaseinfo']")))
        release_date = release_elem.text
        print(f"Bulunan release date: {release_date}")
    except Exception as e:
        print(f"Hata oluştu: {e}")
        release_date = None

    release_dates.append(release_date)

driver.quit()

merged_df['release_date'] = release_dates
print(merged_df)
"""

'\nrelease_dates = []\n\nfor imdb_id in merged_df[\'imdbId\']:\n    imdb_id_str = str(imdb_id).zfill(7)  # 7 haneli hale getir\n    url = f"https://www.imdb.com/title/tt{imdb_id_str}/"\n    print(f"Şu an {url} sayfasına gidiliyor...")\n\n    try:\n        driver.get(url)\n        release_elem = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "a[href*=\'releaseinfo\']")))\n        release_date = release_elem.text\n        print(f"Bulunan release date: {release_date}")\n    except Exception as e:\n        print(f"Hata oluştu: {e}")\n        release_date = None\n\n    release_dates.append(release_date)\n\ndriver.quit()\n\nmerged_df[\'release_date\'] = release_dates\nprint(merged_df)\n'

In [25]:
DENEME = r"C:\Users\lunaf\Desktop\Projects\jellyfin-movie-recommender\data\processed\merged_df_full.csv"
deneme_df = pd.read_csv(DENEME)

In [26]:
deneme_df.head()

Unnamed: 0,title,avg_rating,rating_counts,movieId,genres,imdbId,tmdbId,release_date
0,Planet Earth II (2016),4.5,2041,171011,Documentary,5491994,420714.0,2016
1,Band of Brothers (2001),4.4,2835,170705,Action|Drama|War,185906,331214.0,2001
2,Planet Earth (2006),4.4,3015,159817,Documentary,795176,192040.0,2006
3,"Shawshank Redemption, The (1994)",4.4,122296,318,Crime|Drama,111161,278.0,1994
4,"Godfather, The (1972)",4.3,75004,858,Crime|Drama,68646,238.0,1972


In [27]:
deneme_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2436 entries, 0 to 2435
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   title          2436 non-null   object 
 1   avg_rating     2436 non-null   float64
 2   rating_counts  2436 non-null   int64  
 3   movieId        2436 non-null   int64  
 4   genres         2436 non-null   object 
 5   imdbId         2436 non-null   int64  
 6   tmdbId         2436 non-null   float64
 7   release_date   2435 non-null   object 
dtypes: float64(2), int64(3), object(3)
memory usage: 152.4+ KB


In [29]:
# release_date'i datetime yap
deneme_df['release_date'] = pd.to_datetime(deneme_df['release_date'], errors='coerce')

# List of the movies after 2020
movies_after_2020 = deneme_df[deneme_df['release_date'].dt.year > 2020]

movies_after_2020

Unnamed: 0,title,avg_rating,rating_counts,movieId,genres,imdbId,tmdbId,release_date
98,Everything Everywhere All at Once (2022),4.1,3947,270698,Action|Comedy|Sci-Fi,6710474,545611.0,2022-01-01
242,Dune (2021),4.0,5437,254726,Action|Adventure|Drama|Sci-Fi,1160419,438631.0,2021-01-01
560,Top Gun: Maverick (2022),3.9,3262,274053,Action|Drama,1745960,361743.0,2022-01-01
568,Spider-Man: No Way Home (2021),3.8,4195,263007,Action|Adventure|Fantasy|Sci-Fi,10872600,634649.0,2021-01-01
1000,The Batman (2022),3.7,3366,268642,Action|Crime|Drama,1877830,414906.0,2022-01-01
1559,Free Guy (2020),3.5,2042,226202,Action|Adventure|Comedy|Sci-Fi,6264654,550988.0,2021-01-01
1761,Don't Look Up (2021),3.4,2340,263407,Comedy|Drama|Sci-Fi,11286314,646380.0,2021-01-01
