In [1]:
import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.exc import OperationalError

# === Paramètres de connexion ===
host = "localhost"
port = "5432"
database = "Admission_DW_new"
username = "postgres"
password = "hela"


# === Connexion à PostgreSQL ===
db_url = f"postgresql+psycopg2://{username}:{password}@{host}:{port}/{database}"

try:
    engine = create_engine(db_url)

    # Test de connexion
    with engine.connect() as connection:
        result = connection.execute(text("SELECT version();"))
        version = result.fetchone()
        print("✅ Connexion réussie à PostgreSQL !")
        print("📦 Version PostgreSQL :", version[0])


    df_profil = pd.read_sql('SELECT * FROM "DimProfil";', engine)
        # === Import des tables nécessaires pour l’objectif ML ===
    df_localisation = pd.read_sql('SELECT * FROM "DimLocalisation";', engine)
    df_employabilite = pd.read_sql('SELECT * FROM "FactEmployabilite";', engine)
    df_offre = pd.read_sql('SELECT * FROM "DimOffre";', engine)
    df_dates = pd.read_sql("SELECT * FROM dimdates;", engine)
except OperationalError as e:
    print("❌ Erreur de connexion à la base de données :")
    print(e)


✅ Connexion réussie à PostgreSQL !
📦 Version PostgreSQL : PostgreSQL 15.10, compiled by Visual C++ build 1942, 64-bit


In [None]:
query = """
SELECT 
    p.skills,
    p.industry,
    o."secteur",
    o."competence",
    o."offre"
FROM "FactEmployabilite" f
JOIN "DimProfil" p ON f."fkProfil" = p."PkProfil"
JOIN "DimOffre" o ON f."fkOffre" = o."pkOffre";
"""
df_reco = pd.read_sql(query, engine)
df_reco.head()


In [3]:
df_reco = df_reco.fillna('')  # Remplacer les NaN par ''
df_reco['profil_description'] = (
    df_reco['secteur'] + ' ' +
    2 * df_reco['skills'] + ' ' +  # plus d’importance à industry
    df_reco['secteur'] + ' ' +
    2 * df_reco['competence']       # plus d’importance à competence
).str.lower()


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df_reco['profil_description'])


In [5]:
# Dimensions de la matrice
print("Shape of TF-IDF matrix:", tfidf_matrix.shape)

# Voir le vocabulaire appris
print("Mots utilisés (vocabulaire):")
print(vectorizer.get_feature_names_out()[:10])  # juste les 10 premiers

# Voir la matrice pour les 2 premiers profils
print("TF-IDF (2 premiers profils):")
print(tfidf_matrix[:2].toarray())


Shape of TF-IDF matrix: (37719, 1355)
Mots utilisés (vocabulaire):
['10g' '14001' '15022' '17' '20022' '2eme' '360' '360autodesk' '365' '3d']
TF-IDF (2 premiers profils):
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [None]:
from sklearn.neighbors import NearestNeighbors
import numpy as np
import matplotlib.pyplot as plt

# Tester plusieurs valeurs de k
k_values = list(range(1, 11))
avg_distances = []

for k in k_values:
    knn = NearestNeighbors(n_neighbors=k, metric='cosine')
    knn.fit(tfidf_matrix)
    
    distances, _ = knn.kneighbors(tfidf_matrix)
    avg_distance = distances.mean()
    avg_distances.append(avg_distance)

# Tracer les résultats
plt.figure(figsize=(8, 4))
plt.plot(k_values, avg_distances, marker='o')
plt.title('Évaluation de différentes valeurs de k')
plt.xlabel('Nombre de voisins (k)')
plt.ylabel('Distance moyenne (cosine)')
plt.grid(True)
plt.show()


In [None]:
def recommander_offres(input_profil, df_reco, vectorizer, tfidf_matrix, k=3):
    input_vector = vectorizer.transform([input_profil.lower()])
    knn = NearestNeighbors(n_neighbors=len(df_reco), metric='cosine')  # chercher sur tout
    knn.fit(tfidf_matrix)
    distances, indices = knn.kneighbors(input_vector)

    print(f"\n🔍 Offres recommandées pour le profil : {input_profil}\n")
    seen = set()
    count = 0
    for i, idx in enumerate(indices[0]):
        offre = df_reco.iloc[idx]
        offre_id = (offre['offre'], offre['secteur'], offre['competence'])  # tuple d’identifiants

        if offre_id not in seen:
            seen.add(offre_id)
            similarite = (1 - distances[0][i]) * 100  # similarité en pourcentage
            print(f"{count+1}. Offre : {offre['offre']} | Secteur : {offre['secteur']}  Similarité : {similarite:.2f}%")
            count += 1
            if count == k:
                break


In [None]:
profil_input = "python Data Scientist informatique teamwork"
recommander_offres(profil_input, df_reco, vectorizer, tfidf_matrix, k=3)
