In [1]:
# Import necessary modules
from loguru import logger
from datetime import datetime
import pandas as pd
from src.elt.transforms.utils import connect_mongodb

[32m2025-08-19 20:43:52.586[0m | [1mINFO    [0m | [36msrc.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/Edward/Documents/GitHub/book-club[0m


In [2]:
def count_books_read_this_year(year=None):
    """
    Count the books completed by all users in one calendar year.

    Runs an aggregation over the ``user_reads`` collection that matches
    documents whose ``date_completed`` lies in the half-open interval
    ``[Jan 1 of year, Jan 1 of year + 1)``, logs the total, and returns it.
    The MongoDB client is always closed before returning.

    Args:
        year: Calendar year to count. ``None`` (the default) means the
            current year as reported by ``datetime.now()``.

    Returns:
        The number of matching documents as an int (``0`` when nothing
        matched), or ``None`` if the aggregation raised. Errors are logged
        and not re-raised.
    """
    db, client = connect_mongodb()

    # Resolve the year to report on; the default keeps the original behaviour.
    target_year = datetime.now().year if year is None else year
    # Keep the historical log wording for the default call.
    period = "this year" if year is None else f"in {target_year}"

    # Define the aggregation pipeline
    pipeline = [
        # Match documents with a completed date inside the target year.
        # NOTE(review): these are naive datetimes; PyMongo presumably sends
        # them as UTC -- confirm date_completed is stored in UTC as well.
        # NOTE(review): unlike books_read_by_month(), this does not filter on
        # current_rstatus_name == "Read" -- confirm that is intentional.
        {
            "$match": {
                "date_completed": {
                    "$gte": datetime(target_year, 1, 1),
                    "$lt": datetime(target_year + 1, 1, 1)
                }
            }
        },
        # Group all matched documents and count them
        {
            "$group": {
                "_id": None,
                "total_books_read": {"$count": {}}
            }
        }
    ]

    # Stays None only if the aggregation fails.
    total_books_read = None

    try:
        # Execute the pipeline
        results = list(db["user_reads"].aggregate(pipeline))

        # $group emits no document at all when nothing matched, so an empty
        # result list means a count of zero.
        if results:
            total_books_read = results[0]["total_books_read"]
            logger.success(f"A total of {total_books_read} books were read {period}.")
        else:
            total_books_read = 0
            logger.info(f"No books were read {period}.")

    except Exception as e:
        logger.error(f"An error occurred during aggregation: {e}")
    finally:
        client.close()
        logger.info("MongoDB connection closed.")

    return total_books_read

if __name__ == "__main__":
    count_books_read_this_year()



[32m2025-08-17 00:20:43.924[0m | [1mINFO    [0m | [36msrc.elt.transforms.utils[0m:[36mconnect_mongodb[0m:[36m17[0m - [1mSuccessfully connected to MongoDB[0m
[32m2025-08-17 00:20:43.958[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mcount_books_read_this_year[0m:[36m38[0m - [32m[1mA total of 42 books were read this year.[0m
[32m2025-08-17 00:20:43.980[0m | [1mINFO    [0m | [36m__main__[0m:[36mcount_books_read_this_year[0m:[36m46[0m - [1mMongoDB connection closed.[0m


In [2]:
def books_read_by_month():
    """
    Count the books read by all users per calendar month.

    Runs an aggregation over the ``user_reads`` collection restricted to
    documents with a non-null ``date_completed`` and
    ``current_rstatus_name == "Read"``, groups them by year and month of
    ``date_completed``, and sorts the groups chronologically. The result is
    printed as text and returned. The MongoDB client is always closed before
    returning.

    Returns:
        A pandas DataFrame with columns ``Year``, ``Month`` and
        ``Books Read`` (empty, with those columns, when nothing matched), or
        ``None`` if the aggregation or DataFrame construction raised. Errors
        are logged and not re-raised.
    """
    db, client = connect_mongodb()

    # Define the aggregation pipeline to count books read per month
    pipeline = [
        # Match documents that have a completed date and are marked as completed
        {
            "$match": {
                "date_completed": {"$ne": None},
                "current_rstatus_name": "Read"
            }
        },
        # Group documents by year and month of the completion date
        {
            "$group": {
                "_id": {
                    "year": {"$year": "$date_completed"},
                    "month": {"$month": "$date_completed"}
                },
                "books_read": {"$count": {}}
            }
        },
        # Sort the results chronologically by year and then by month
        {
            "$sort": {
                "_id.year": 1,
                "_id.month": 1
            }
        }
    ]

    # Stays None only if something below fails.
    df = None

    try:
        # Execute the aggregation pipeline
        results = list(db["user_reads"].aggregate(pipeline))

        if not results:
            logger.info("No completed books found. The DataFrame will be empty.")

        # Flatten each group document into one row. Passing columns=
        # explicitly gives the same schema for empty and non-empty results.
        df = pd.DataFrame(
            [
                {
                    "Year": r["_id"]["year"],
                    "Month": r["_id"]["month"],
                    "Books Read": r["books_read"]
                } for r in results
            ],
            columns=['Year', 'Month', 'Books Read']
        )

        logger.success("Successfully created the DataFrame:")
        print(df.to_string()) # Use .to_string() for better console formatting

    except Exception as e:
        logger.error(f"An error occurred during aggregation or DataFrame creation: {e}")
    finally:
        client.close()
        logger.info("MongoDB connection closed.")

    return df

if __name__ == "__main__":
    books_read_by_month()


[32m2025-08-19 20:44:27.261[0m | [1mINFO    [0m | [36msrc.elt.transforms.utils[0m:[36mconnect_mongodb[0m:[36m17[0m - [1mSuccessfully connected to MongoDB[0m
[32m2025-08-19 20:44:27.325[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mbooks_read_by_month[0m:[36m56[0m - [32m[1mSuccessfully created the DataFrame:[0m
    Year  Month  Books Read
0   2023      5           1
1   2024      3           2
2   2024      5           2
3   2024      7           1
4   2024      8           1
5   2024      9           2
6   2024     11           4
7   2024     12           6
8   2025      1           8
9   2025      2           7
10  2025      3           8
11  2025      4           3
12  2025      5           3
13  2025      6           8
14  2025      7           5
[32m2025-08-19 20:44:27.415[0m | [1mINFO    [0m | [36m__main__[0m:[36mbooks_read_by_month[0m:[36m63[0m - [1mMongoDB connection closed.[0m


In [None]:
db, client = connect_mongodb()

# Aggregation pipeline: number of books read in each calendar month.
pipeline = [
    # Stage 1: keep only documents with a non-null completion date whose
    # reading status is "Read".
    {"$match": {"date_completed": {"$ne": None}, "current_rstatus_name": "Read"}},
    # Stage 2: bucket by (year, month) of the completion date and count
    # the documents in each bucket.
    {"$group": {
        "_id": {
            "year": {"$year": "$date_completed"},
            "month": {"$month": "$date_completed"},
        },
        "books_read": {"$count": {}},
    }},
    # Stage 3: order the buckets chronologically, year first and then month.
    {"$sort": {"_id.year": 1, "_id.month": 1}},
]