In [8]:
import pandas as pd
from sqlalchemy import create_engine, select, Table, MetaData
from sqlalchemy.orm import sessionmaker
import yaml
import time
from tqdm import tqdm

In [9]:
def conexion_db():
    """
    Establishes a connection to the SQL database.
    
    Returns:
        tuple: (engine, session) where:
            - engine: The SQLAlchemy Engine object connected to the database.
            - session: A SQLAlchemy session for executing queries.
    
    Raises:
        Exception: If the connection to the database fails.
    """
    try:
        ## URL of the database
        database_url = 'postgresql://postgres.pczyoeavtwijgtkzgcaz:D0jVgaoGmDAFuaMS@aws-0-eu-west-3.pooler.supabase.com:6543/postgres'
        engine = create_engine(database_url)
        session = sessionmaker(bind=engine)
        session = session()
        print("Connection to the database successful")
        return engine, session
    except:
        raise Exception("Error in the connection to the database")


In [10]:
engine, session = conexion_db()

Connection to the database successful


In [11]:
def get_table_names(engine):
    """
    Get the names of all tables in the database.
    
    Args:
        engine (sqlalchemy.engine.base.Engine): The SQLAlchemy Engine object connected to the database.
    
    Returns:
        list: A list of strings with the names of all tables in the database.
    """
    metadata = MetaData()
    metadata.reflect(engine)
    return metadata.tables.keys()

def get_table(engine, table_name, n_rows=5):
    """
    Get the first n rows of a table.
    
    Args:
        engine (sqlalchemy.engine.base.Engine): The SQLAlchemy Engine object connected to the database.
        table_name (str): The name of the table to query.
        n_rows (int): The number of rows to retrieve.
    
    Returns:
        pandas.DataFrame: A pandas DataFrame with the first n rows of the table.
    """
    return pd.read_sql(f"SELECT * FROM {table_name} LIMIT {n_rows}", engine)

In [12]:
print(get_table_names(engine))

dict_keys(['book', 'author', 'publisher', 'genre', 'series', 'awards', 'characters', 'book_source', 'user_book_source', 'User', 'media', 'book_author', 'liked_publisher', 'book_publisher', 'liked_genres', 'book_genre', 'liked_series', 'book_series', 'liked_author', 'book_awards', 'book_characters', 'liked_books', 'fav_books', 'fav_medias'])


In [13]:
display(get_table(engine, 'book'))

Unnamed: 0,book_id,book_title,nb_of_pages,book_description,settings,isbn,isbn13,original_title,review_count,one_star_rating,two_star_rating,three_star_rating,four_star_rating,five_star_rating
0,139826,The Dreamer Wakes,384,"""The Story of the Stone"" (c. 1760), also known...",,014044372X,9780140443721.0,"ç´ æ¨""夢 [Hónglóu Mèng]",34,2,14,70,137,262
1,81050,"Brave on the Rocks: If You Don't Go, You Don't...",160,"""In the continuum of life and trying to discov...",,375756639,9780375756634.0,"Brave on the Rocks: If You Don't Go, You Don't...",23,19,46,210,306,515
2,363245,Madeleine's Ghost,356,Brooklyn needs a saint. Ned Conti needs a stip...,,385316364,9780385316361.0,Madeleine's Ghost,74,17,47,152,208,186
3,9536317,O Renascer,352,Com a capital do Império tomada pelas forças d...,,,,"O Renascer (Estrela de Nariën, #2)",10,2,1,7,18,8
4,2082405,Geek High,256,"At this school, everyone's a geek. And Miranda...",,451222253,9780451222251.0,Geek High,175,52,162,589,547,504


In [None]:
def livre_pref_user(user_id):
    """
    Get the books that a user has in his/her library.
    
    Args:
        user_id (int): The ID of the user.
    
    Returns:
        a list of books that the user has in his/her library.
    """
    query = f"""
    SELECT book.title
    FROM book
    JOIN library ON book.id = library.book_id
    WHERE library.user_id = {user_id}
    """
    return pd.read_sql(query, engine)

In [14]:
# La table users n'etant pas encore remplie, nous allons les simuler
# Nous allons creer 1000 utilisateurs avec des pseudo aleatoires 2 ou 3 livre preferes( parmis la table book), un nombre de livre lus aleatoire entre 0 et 10 et un temps de lecture aleatoire entre 0 et 1000
import random
import string

def random_string(length):
    return ''.join(random.choices(string.ascii_uppercase + string.digits, k=length))

def random_books(n_books, df_books):
    return random.sample(list(df_books['book_title']), n_books)

def random_users(n_users, df_books):
    users = []
    for i in range(n_users):
        user = {
            'username': random_string(10),
            'favorite_books': random_books(random.randint(2, 3), df_books),
            'liked_books': random_books(random.randint(0, 5), df_books),
            'books_read': random.randint(0, 10),
            'time_read': random.randint(0, 1000)
        }
        users.append(user)
    return users

df_books = get_table(engine, 'book', n_rows=5000)
users = random_users(5000, df_books)
df_users = pd.DataFrame(users)
df_users.head()

Unnamed: 0,username,favorite_books,liked_books,books_read,time_read
0,7KVHJLOM2P,"[Weapon, Robert Pattinson: The Unauthorized Bi...",[],2,901
1,UL8BKBLC7W,"[Dirty Filthy Rich Men, Devilish, The Peter Pa...",[The Crimson Thread: A Retelling of Rumpelstil...,7,658
2,Q7PQS2X5MK,"[Deep Green: Color Me Jealous, Whernside, An E...","[Gloriana, The Chosen, Message from an Unknown...",1,458
3,N8TXC72PYV,"[The Widow's Broom, Birthday Girl]","[Halo: First Strike, Just Don't Mention It, رو...",6,477
4,138RVKF5UP,"[The Wolfman, The Charmed Sphere]","[علم السياسة: الأسس, The Happiness Advantage: ...",4,62


In [15]:
def recoUserBased(user, userDF, k=5):
    dicoRecos = {}
    dicoUserSim = {}
    # On recupere les livres lus par l'utilisateur
    livresLus = set(user['liked_books'])
    # On recupere les livres preferes de l'utilisateur
    livresPref = set(user['favorite_books'])

    #on ajoute les livres preferes dans les livres lus si ils ne sont pas deja presents
    livresLus = livresLus.union(livresPref)
    # On recupere les livres lus par les autres utilisateurs
    
    for i, u in userDF.iterrows():
        if u['username'] != user['username']:
            livresLusAutre = set(u['liked_books'])
            livresPrefAutre = set(u['favorite_books'])
            # On calcule le nombre de livres en commun entre l'utilisateur et les autres utilisateurs
            nbLivreCommun = len(livresLus.intersection(livresLusAutre))
            nbLivrePrefCommun = len(livresPref.intersection(livresPrefAutre))
            # On calcule le score de similarite entre l'utilisateur et les autres utilisateurs
            score = nbLivreCommun + 1.2 * nbLivrePrefCommun
            dicoUserSim[u['username']] = score
    
    # On trie les utilisateurs en fonction de leur score de similarite
    dicoUserSim = dict(sorted(dicoUserSim.items(), key=lambda item: item[1], reverse=True))
    
    # quel sont les livre non lu par l'utilisateur qui sont lu par les utilisateurs les plus similaires (le 1er qartile)
    for u in list(dicoUserSim.keys())[:len(dicoUserSim)//4]:
        for livre in userDF[userDF['username'] == u]['liked_books'].values[0]:
            if livre not in livresLus:
                if livre in dicoRecos:
                    dicoRecos[livre] += 1
                else:
                    dicoRecos[livre] = 1
    # si le dictionnaire est vide on continue avec les 10 utilisateurs suivants les plus similaires tant que le dictionnaire est vide
    i = 0
    while len(dicoRecos) == 0 and i < 10:
        for u in list(dicoUserSim.keys())[i*len(dicoUserSim)//4:(i+1)*len(dicoUserSim)//4]:
            for livre in userDF[userDF['username'] == u]['liked_books'].values[0]:
                if livre not in livresLus:
                    if livre in dicoRecos:
                        dicoRecos[livre] += 1
                    else:
                        dicoRecos[livre] = 1
        i += 1
    print(i)
    # On trie les livres en fonction du nombre de fois qu'ils ont ete recommandes
    dicoRecos = dict(sorted(dicoRecos.items(), key=lambda item: item[1], reverse=True))
    return dicoRecos



In [16]:
print(f"l'utilisateur 0 a pour livre prefere : {df_users.iloc[0]['favorite_books']}")
print(f"l'utilisateur 0 a pour livre lu : {df_users.iloc[0]['liked_books']}")
print(f"recomandation : {recoUserBased(df_users.iloc[0], df_users, 5)}")

l'utilisateur 0 a pour livre prefere : ['Weapon', 'Robert Pattinson: The Unauthorized Biography']
l'utilisateur 0 a pour livre lu : []
0
