## Imports & Configurations

In [9]:
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
# Directory that holds the raw CSV inputs. Set the MOVIE_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset the
# original location is used, so existing runs behave exactly as before.
DATA_DIR = os.environ.get(
    "MOVIE_DATA_DIR",
    r"C:\Users\lunaf\Desktop\Projects\jellyfin-movie-recommender\data\raw",
)

MOVIES_PATH = os.path.join(DATA_DIR, "movies.csv")
GENOME_TAGS_PATH = os.path.join(DATA_DIR, "genome-tags.csv")
GENOME_SCORES_PATH = os.path.join(DATA_DIR, "genome-scores.csv")
TAGS_PATH = os.path.join(DATA_DIR, "tags.csv")

# Minimum relevance score a (movie, tag) pair needs to be kept.
RELEVANCE_THRESHOLD = 0.2

In [11]:
# Read the four raw CSV files, in the same order as before, into DataFrames.
movies_df, genome_tags_df, genome_scores_df, tags_df = (
    pd.read_csv(csv_path)
    for csv_path in (MOVIES_PATH, GENOME_TAGS_PATH, GENOME_SCORES_PATH, TAGS_PATH)
)

## Data Preprocessing & Feature Engineering

In [12]:
df_list = [movies_df, genome_tags_df, genome_scores_df, tags_df]

# Print a quick overview (first rows, schema, shape) of every loaded table.
for df in df_list:
    print("First 5 row:\n")
    print(df.head())
    print("Information:\n")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() emitted a stray "None" after each report.
    df.info()
    print("Shape:\n")
    print(df.shape)

First 5 row:

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86537 entries, 0 to 86536
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  86537 non-null  int64 
 1   title    86537 non-null  object
 2   genres   86537 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.0+ MB
None
Shape:

(86537, 3)
First 5 ro


**⚠️ Note:**

The relevance scores describe how strongly every tag applies to every movie. That is why I think I should define a threshold for the relevance score; then I can keep only the tags whose relevance scores are at or above the threshold and associate them with the movies.

In [8]:
# Histogram + KDE of the relevance scores.
# The previous run of this cell was interrupted before it produced a figure;
# the KDE over the full column (row indices above 17 million appear in this
# notebook's outputs) is the likely culprit. The histogram and KDE are therefore
# drawn from a fixed-seed random sample, so the y-axis counts refer to that sample.
# The quantile lines below are still computed on the full column.
KDE_SAMPLE_SIZE = 200_000
relevance = genome_scores_df["relevance"]
plot_sample = relevance.sample(n=min(len(relevance), KDE_SAMPLE_SIZE), random_state=42)

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(plot_sample, bins=100, kde=True, color="skyblue", ax=ax)
ax.set_title("Distribution of Tag Relevance Scores", fontsize=14)
ax.set_xlabel("Relevance Score")
ax.set_ylabel("Frequency")

# Quantile threshold lines (80%, 90%, 95%), computed in a single pass.
label_height = ax.get_ylim()[1] * 0.8
for q, thresh in relevance.quantile([0.80, 0.90, 0.95]).items():
    ax.axvline(thresh, color="red", linestyle="--", alpha=0.7)
    ax.text(thresh + 0.005, label_height, f"{q:.0%} → {thresh:.2f}", color="red")

plt.show()

KeyboardInterrupt: 

<Figure size 1000x600 with 0 Axes>

In [None]:
# Empirical cumulative distribution of the relevance scores.
relevance = genome_scores_df["relevance"]

fig, ax = plt.subplots(figsize=(8, 5))
sns.ecdfplot(relevance, color="green", ax=ax)
ax.set_title("Cumulative Distribution of Relevance Scores")
ax.set_xlabel("Relevance")
ax.set_ylabel("Cumulative Probability")

# Mark the 80%, 90% and 95% quantiles; all three are computed in one call
# instead of re-scanning the column on every loop iteration.
for q, thresh in relevance.quantile([0.80, 0.90, 0.95]).items():
    ax.axvline(thresh, color="red", linestyle="--", alpha=0.7)
    ax.text(thresh + 0.005, 0.05, f"{q:.0%} → {thresh:.2f}", color="red")

plt.show()

In [13]:
# Keep only the (movie, tag) rows whose relevance is at or above the threshold.
threshold = RELEVANCE_THRESHOLD
is_relevant = genome_scores_df["relevance"].ge(threshold)
filtered_df = genome_scores_df.loc[is_relevant]
print(f"Selected {len(filtered_df)} tag-movie pairs (above {threshold})")

Selected 2982615 tag-movie pairs (above 0.2)


➡️ I know the film whose movieId is 1. It's Toy Story, and it's a cartoon. Now I'm comparing its tags with what I know about the movie.

In [20]:
# Rows of the filtered scores that belong to movieId 1, strongest tags first.
is_movie1 = filtered_df["movieId"].eq(1)
movie1_tags = filtered_df.loc[is_movie1].sort_values("relevance", ascending=False)
print(movie1_tags)

          movieId  tagId  relevance
17891530   224338    323    0.74950
17891949   224338    742    0.65050
17891659   224338    452    0.55075
17891847   224338    640    0.54025
17891499   224338    292    0.52475
...           ...    ...        ...
17891738   224338    531    0.20250
17891301   224338     94    0.20200
17892279   224338   1072    0.20200
17891954   224338    747    0.20175
17892242   224338   1035    0.20000

[111 rows x 3 columns]


In [21]:
# Take the five highest-relevance tag ids straight from movie1_tags instead of
# hard-coding them: the previous literal list did not match the tags reported
# for movieId 1 and had to be edited by hand whenever the movie or threshold changed.
best_5_tag = movie1_tags.nlargest(5, "relevance")["tagId"].tolist()

In [22]:
# Print the genome_tags_df row (tagId and tag name) for each selected best tag.
for tag_id in best_5_tag:
    tag_row = genome_tags_df.loc[genome_tags_df["tagId"].eq(tag_id)]
    print(tag_row)

     tagId    tag
322    323  drama
     tagId       tag
741    742  original
     tagId              tag
451    452  good soundtrack
     tagId          tag
639    640  melancholic
     tagId    tag
291    292  death


In [24]:
# Take the five lowest-relevance tag ids (still above the threshold, lowest first)
# straight from movie1_tags instead of hard-coding them: the previous literal
# list did not match the tags reported for movieId 1.
worst_5_tag = movie1_tags.nsmallest(5, "relevance")["tagId"].tolist()

In [25]:
# Print the genome_tags_df row (tagId and tag name) for each selected worst tag.
tag_id_column = genome_tags_df["tagId"]
for tag_id in worst_5_tag:
    print(genome_tags_df[tag_id_column.eq(tag_id)])

      tagId       tag
1034   1035  touching
     tagId                   tag
746    747  oscar (best actress)
      tagId        tag
1071   1072  very good
    tagId                 tag
93     94  awesome soundtrack
     tagId       tag
530    531  idealism


## Test Results: *Toy Story* (movieId = 1) 🧸

We applied a **relevance threshold of 0.2** to filter tags.  
Below are the **top 5 (highest relevance)** and **bottom 5 (lowest relevance above threshold)** tags for *Toy Story*.

---

 **Best 5 Tags (Highest Relevance)**

| TagId | Tag                 | Relevance |
|-------|--------------------|-----------|
| 1036  | toys               | 0.99950   |
| 244   | computer animation | 0.99875   |
| 786   | pixar animation    | 0.99325   |
| 64    | animation          | 0.98625   |
| 589   | kids and family    | 0.98525   |

**Analysis:**  
- These tags are **highly representative** of the movie content.  
- They capture the animation style, producer (*Pixar*), and target audience (children and family).  
- They are suitable for **content-based recommendation** systems.

---

 **Worst 5 Tags (Lowest Relevance above threshold)**

| TagId | Tag                                      | Relevance |
|-------|-----------------------------------------|-----------|
| 327   | dreamworks                               | 0.20000   |
| 880   | saturn award (best science fiction film)| 0.20025   |
| 154   | books                                    | 0.20825   |
| 212   | cinematography                           | 0.20900   |
| 642   | memory                                   | 0.20925   |

**Analysis:**  
- These tags are **weakly related** to the movie.  
- Some are **irrelevant or misleading** (e.g., DreamWorks studio, science fiction awards).  
- Using these in recommendations may introduce **noise**; consider **filtering or down-weighting** them.

---

 **Conclusion**

1. **Threshold = 0.2** is a reasonable choice to separate **strongly related tags** from weak ones.  
2. Best tags accurately reflect the movie's content and audience.  
3. Weak tags near the threshold should be treated cautiously.  
4. **The top 5 tags (`toys`, `computer animation`, `pixar animation`, `animation`, `kids and family`) are reliable and can be safely used for recommendation purposes.**  
5. The filtered best tags are ready to be used for **content-based movie recommendations**.


## User-Based Tag and Genre Analysis

In [19]:
class User:
    """A user together with the ratings they gave and the movies they watched."""

    def __init__(self, userId: int):
        """Create a user with no ratings and an empty watched list."""
        self.userId = userId
        # movieId -> rating (1 = like, 0 = dislike)
        self.ratings = {}
        # movieIds in the order they were first recorded, without duplicates
        self.watchedList = []

    def add_rating(self, movieId, rating):
        """Store (or overwrite) the rating for ``movieId`` and mark it as watched."""
        self.ratings[movieId] = rating
        self.add_watched(movieId)

    def add_watched(self, movieId):
        """Append ``movieId`` to the watched list unless it is already there."""
        if movieId in self.watchedList:
            return
        self.watchedList.append(movieId)


##### .5