# Online Recommender System

In [4]:
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017/")
db = client["bdnr_MovieLens_ml_25m"]

movies_col = db["movies"]
users_col = db["users"]

In [5]:
import json

sample_movie = movies_col.find_one({}, {'_id': 0})  # Exclude MongoDB _id
print(json.dumps(sample_movie, indent=2, default=str))

{
  "movieId": 1,
  "title": "Toy Story (1995)",
  "year": 1995,
  "genres": [
    "Adventure",
    "Animation",
    "Children",
    "Comedy",
    "Fantasy"
  ],
  "ratings": [
    {
      "userId": 2,
      "rating": 3.5,
      "timestamp": 1141415820
    },
    {
      "userId": 3,
      "rating": 4.0,
      "timestamp": 1439472215
    },
    {
      "userId": 4,
      "rating": 3.0,
      "timestamp": 1573944252
    },
    {
      "userId": 5,
      "rating": 4.0,
      "timestamp": 858625949
    },
    {
      "userId": 8,
      "rating": 4.0,
      "timestamp": 890492517
    },
    {
      "userId": 10,
      "rating": 3.5,
      "timestamp": 1227571347
    },
    {
      "userId": 12,
      "rating": 4.0,
      "timestamp": 1167582601
    },
    {
      "userId": 13,
      "rating": 4.0,
      "timestamp": 1265223970
    },
    {
      "userId": 18,
      "rating": 3.0,
      "timestamp": 1108273483
    },
    {
      "userId": 26,
      "rating": 3.0,
      "timestamp": 12805158

## Popular Movies Carousel: Bayesian Average

This aggregation pipeline retrieves the **top 20 statistically significant popular movies** using a **Bayesian average formula** that balances average rating and volume of votes.

The Bayesian average is computed as:

```
bayesianAvg = (v / (v + m)) * R + (m / (v + m)) * C
```

Where:
- `R`: the movie’s average rating (`stats.avgRating`)
- `v`: number of ratings for the movie (`stats.ratingCount`)
- `m`: minimum ratings threshold (set here to 100)
- `C`: global average rating across all movies (set here to 3.5)

This avoids bias toward movies with few high ratings and promotes consistently well-rated films with sufficient data.

In [7]:
from pprint import pprint

# Define Bayesian average aggregation pipeline
pipeline = [
    {
        "$match": {
            "stats.ratingCount": { "$gte": 100 }
        }
    },
    {
        "$addFields": {
            "bayesianAvg": {
                "$let": {
                    "vars": {
                        "r": "$stats.avgRating",
                        "v": "$stats.ratingCount",
                        "m": 100,
                        "c": 3.5
                    },
                    "in": {
                        "$add": [
                            { "$multiply": [ { "$divide": [ "$$v", { "$add": [ "$$v", "$$m" ] } ] }, "$$r" ] },
                            { "$multiply": [ { "$divide": [ "$$m", { "$add": [ "$$v", "$$m" ] } ] }, "$$c" ] }
                        ]
                    }
                }
            }
        }
    },
    {
        "$sort": { "bayesianAvg": -1 }
    },
    {
        "$limit": 20
    },
    {
        "$project": {
            "_id": 0,
            "movieId": 1,
            "title": 1,
            "stats.ratingCount": 1,
            "stats.avgRating": 1,
            "bayesianAvg": 1
        }
    }
]

results = movies_col.aggregate(pipeline)
for movie in results:
    pprint(movie)

{'bayesianAvg': 4.417067577457917,
 'movieId': 318,
 'stats': {'avgRating': 4.44, 'ratingCount': 3999},
 'title': 'Shawshank Redemption, The (1994)'}
{'bayesianAvg': 4.31766831494865,
 'movieId': 858,
 'stats': {'avgRating': 4.35, 'ratingCount': 2529},
 'title': 'Godfather, The (1972)'}
{'bayesianAvg': 4.271969166082691,
 'movieId': 50,
 'stats': {'avgRating': 4.3, 'ratingCount': 2754},
 'title': 'Usual Suspects, The (1995)'}
{'bayesianAvg': 4.263475014359563,
 'movieId': 1221,
 'stats': {'avgRating': 4.31, 'ratingCount': 1641},
 'title': 'Godfather: Part II, The (1974)'}
{'bayesianAvg': 4.244745818301082,
 'movieId': 527,
 'stats': {'avgRating': 4.27, 'ratingCount': 2949},
 'title': "Schindler's List (1993)"}
{'bayesianAvg': 4.205939353988135,
 'movieId': 2959,
 'stats': {'avgRating': 4.23, 'ratingCount': 2934},
 'title': 'Fight Club (1999)'}
{'bayesianAvg': 4.190109289617487,
 'movieId': 1193,
 'stats': {'avgRating': 4.23, 'ratingCount': 1730},
 'title': "One Flew Over the Cuckoo's N

## Movie–Movie Carousel: Pairwise Co-Watch

This function implements a recommendation strategy that answers:

> _“Which movies are most frequently co-watched or co-rated by users who watched Movie A?”_

It uses MongoDB's aggregation pipeline to analyze co-occurrence in the `ratingHistory` of users. This is useful for carousels like **"Users who watched this also watched..."**.


In [19]:
def get_co_watched_movies(movie_id, min_count=5, limit=20):
    """
    - `movie_id`: the target movie (e.g., Toy Story → `1`)
    - `min_count`: minimum number of co-occurrences required to include a result
    - `limit`: maximum number of related movies to return
    """
    pipeline = [
        # Find users who rated the target movie
        { "$match": { "ratingHistory.movieId": movie_id } },

        # filters the `users` collection to those who have rated the given `movie_id`
        { "$unwind": "$ratingHistory" },

        # flattens each user's `ratingHistory` array into individual documents, 
        # so we can process one rating per pipeline document
        { "$group": {
            "_id": "$ratingHistory.movieId",
            "count": { "$sum": 1 }
        }},

        # count how many times each movie appears in these users' histories,
        # this gives us **co-watch frequency**
        { "$match": {
            "_id": { "$ne": movie_id },
            "count": { "$gte": min_count }
        }},

        # Removes the target movie itself from the results and filters out 
        # movies with very low co-watch counts (noise)
        { "$sort": { "count": -1 } },
        { "$limit": limit },

        # Join movie metadata
        {
            "$lookup": {
                "from": "movies",
                "localField": "_id",
                "foreignField": "movieId",
                "as": "movie"
            }
        },
        { "$unwind": "$movie" },

        # Joins in the movie document from the `movies` collection to access the title
        {
            "$project": {
                "_id": 0,
                "movieId": "$_id",
                "title": "$movie.title",
                "coWatchCount": "$count"
            }
        }
    ]

    return list(users_col.aggregate(pipeline))

In [20]:
movie_id = 1
movie = movies_col.find_one({ "movieId": movie_id }, { "_id": 0, "title": 1 })

print(movie["title"] if movie else "Not found")

Toy Story (1995)


In [21]:
results = get_co_watched_movies(movie_id=1)

for r in results:
    print(f"{r['title']} (co-watched {r['coWatchCount']} times)")

Forrest Gump (1994) (co-watched 1917 times)
Random Movie 356 (co-watched 1917 times)
Star Wars: Episode IV - A New Hope (1977) (co-watched 1797 times)
Pulp Fiction (1994) (co-watched 1748 times)
Jurassic Park (1993) (co-watched 1690 times)
Shawshank Redemption, The (1994) (co-watched 1682 times)
Silence of the Lambs, The (1991) (co-watched 1647 times)
Matrix, The (1999) (co-watched 1645 times)
Star Wars: Episode VI - Return of the Jedi (1983) (co-watched 1522 times)
Independence Day (a.k.a. ID4) (1996) (co-watched 1504 times)
Star Wars: Episode V - The Empire Strikes Back (1980) (co-watched 1465 times)
Back to the Future (1985) (co-watched 1457 times)
Braveheart (1995) (co-watched 1419 times)
Terminator 2: Judgment Day (1991) (co-watched 1416 times)
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) (co-watched 1412 times)
Aladdin (1992) (co-watched 1319 times)
Lion King, The (1994) (co-watched 1317 times)
Lord of the Rings: The Fellowship of the Ring, The (

## User–User Carousel: Collaborative Filtering

Given a user, recommend **movies they haven’t seen**, but that were **highly rated by similar users**.

This approach implements **user-based collaborative filtering**, 
matching users by overlapping watch history and mining top-rated unseen movies from the most similar ones.

1. Identify movies the **target user** has rated.
2. Find **other users** who have watched **some of the same movies**.
3. Filter to those with a **minimum overlap** (e.g., at least 3 shared movies).
4. Pull out **highly rated movies** (e.g., rating ≥ 4.0) from these similar users.
5. Filter out movies already rated by the target user.
6. Count how often each unseen movie appears and **sort by popularity**.
7. Return a list of **recommended movies**, enriched with title metadata.


In [26]:
def recommend_from_similar_users(target_user_id, rating_threshold=4.0, similarity_overlap=3, limit=20):
    # Get rated movies for the target user
    target_user = users_col.find_one({ "userId": target_user_id }, { "_id": 0, "ratingHistory.movieId": 1 })
    if not target_user:
        return []

    user_movies = { r["movieId"] for r in target_user.get("ratingHistory", []) }

    if not user_movies:
        return []

    pipeline = [
        # Find users who rated overlapping movies
        {
            "$match": {
                "userId": { "$ne": target_user_id },
                "ratingHistory.movieId": { "$in": list(user_movies) }
            }
        },

        # Filter ratingHistory to only those movies
        { "$unwind": "$ratingHistory" },

        # Keep only overlapping movies
        {
            "$match": {
                "ratingHistory.movieId": { "$in": list(user_movies) }
            }
        },

        # Group by user and collect their overlapping movies
        {
            "$group": {
                "_id": "$userId",
                "commonMovies": { "$addToSet": "$ratingHistory.movieId" },
                "count": { "$sum": 1 }
            }
        },

        # Filter by minimum overlap (similarity threshold)
        {
            "$match": { "count": { "$gte": similarity_overlap } }
        },

        # Rejoin to get their full rating histories
        {
            "$lookup": {
                "from": "users",
                "localField": "_id",
                "foreignField": "userId",
                "as": "user"
            }
        },
        { "$unwind": "$user" },
        { "$unwind": "$user.ratingHistory" },

        # Only include highly-rated movies not seen by target user
        {
            "$match": {
                "user.ratingHistory.movieId": { "$nin": list(user_movies) },
                "user.ratingHistory.rating": { "$gte": rating_threshold }
            }
        },

        # Count how many similar users recommended each movie
        {
            "$group": {
                "_id": "$user.ratingHistory.movieId",
                "count": { "$sum": 1 }
            }
        },

        { "$sort": { "count": -1 } },
        { "$limit": limit },

        # Sort, limit, and enrich with movie metadata
        {
            "$lookup": {
                "from": "movies",
                "localField": "_id",
                "foreignField": "movieId",
                "as": "movie"
            }
        },
        { "$unwind": "$movie" },
        {
            "$project": {
                "_id": 0,
                "movieId": "$_id",
                "title": "$movie.title",
                "recommendedBy": "$count"
            }
        }
    ]

    return list(users_col.aggregate(pipeline))

In [27]:
sample_user = users_col.find_one({}, {'_id': 0}) 
print(json.dumps(sample_user, indent=2, default=str))

{
  "userId": 1,
  "stats": {
    "ratingCount": 70,
    "avgRating": 3.81
  },
  "ratingHistory": [
    {
      "movieId": 5952,
      "rating": 4.0,
      "timestamp": 1147868053
    },
    {
      "movieId": 2012,
      "rating": 2.5,
      "timestamp": 1147868068
    },
    {
      "movieId": 2011,
      "rating": 2.5,
      "timestamp": 1147868079
    },
    {
      "movieId": 1653,
      "rating": 4.0,
      "timestamp": 1147868097
    },
    {
      "movieId": 1250,
      "rating": 4.0,
      "timestamp": 1147868414
    },
    {
      "movieId": 6539,
      "rating": 3.5,
      "timestamp": 1147868461
    },
    {
      "movieId": 6377,
      "rating": 4.0,
      "timestamp": 1147868469
    },
    {
      "movieId": 3448,
      "rating": 4.0,
      "timestamp": 1147868480
    },
    {
      "movieId": 1088,
      "rating": 4.0,
      "timestamp": 1147868495
    },
    {
      "movieId": 899,
      "rating": 3.5,
      "timestamp": 1147868510
    },
    {
      "movieId": 4308,
 

In [28]:
user_id = 1
results = recommend_from_similar_users(user_id)

for movie in results:
    print(f"{movie['title']} (recommended by {movie['recommendedBy']} similar users)")

Matrix, The (1999) (recommended by 1973 similar users)
Shawshank Redemption, The (1994) (recommended by 1970 similar users)
Fight Club (1999) (recommended by 1742 similar users)
Forrest Gump (1994) (recommended by 1649 similar users)
Random Movie 356 (recommended by 1649 similar users)
Silence of the Lambs, The (1991) (recommended by 1616 similar users)
Lord of the Rings: The Fellowship of the Ring, The (2001) (recommended by 1616 similar users)
Star Wars: Episode IV - A New Hope (1977) (recommended by 1562 similar users)
Star Wars: Episode V - The Empire Strikes Back (1980) (recommended by 1521 similar users)
Lord of the Rings: The Return of the King, The (2003) (recommended by 1482 similar users)
Godfather, The (1972) (recommended by 1462 similar users)
American Beauty (1999) (recommended by 1443 similar users)
Usual Suspects, The (1995) (recommended by 1428 similar users)
Schindler's List (1993) (recommended by 1413 similar users)
Raiders of the Lost Ark (Indiana Jones and the Raide