# Prova de Conceito - Sistema de Recomendação (Entrega 3)

Objetivos:
* Criar modelo minimamente viável

# 1 - Configurações do ambiente e carregamento do dataset

In [43]:
from pathlib import Path

PROJECT_ROOT = Path(".").resolve()

DATA_RAW     = PROJECT_ROOT / "data" / "raw"
DATA_INT     = PROJECT_ROOT / "data" / "interim"
DATA_PROC    = PROJECT_ROOT / "data" / "processed"
MODELS_DIR   = PROJECT_ROOT / "models"
REPORTS_DIR  = PROJECT_ROOT / "reports"

for d in [DATA_RAW, DATA_INT, DATA_PROC, MODELS_DIR, REPORTS_DIR]:
    d.mkdir(parents=True, exist_ok=True)

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

pd.set_option("display.max_columns", None)
pd.set_option("display.width", 120)

raw_path = DATA_PROC / "udemy_cleaned_for_training.csv"
df = pd.read_csv(raw_path)
df.head()

Unnamed: 0,course_id,course_title,url,price,is_paid,num_subscribers,num_reviews,num_lectures,content_duration_hours,subject,level,subject_norm,level_norm,popularity_score
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,200,True,2147,23,51,1.5,Business Finance,All Levels,business finance,all levels,0.007984
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,75,True,2792,923,274,39.0,Business Finance,All Levels,business finance,all levels,0.010382
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,45,True,2174,74,51,2.5,Business Finance,Intermediate Level,business finance,intermediate level,0.008084
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,95,True,2451,11,36,3.0,Business Finance,All Levels,business finance,all levels,0.009114
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,200,True,1276,45,26,2.0,Business Finance,Intermediate Level,business finance,intermediate level,0.004745


# 2 - Criação dos corpus

In [9]:

def build_corpus_v1(df):
    # só o título
    return df["course_title"].fillna("").astype(str).tolist()

def build_corpus_v2(df):
    # título + assunto normalizado
    return (
        df["course_title"].fillna("").astype(str) + " " +
        df["subject_norm"].fillna("").astype(str)
    ).tolist()

def build_corpus_v3(df):
    # título + assunto + nível normalizado
    return (
        df["course_title"].fillna("").astype(str) + " " +
        df["subject_norm"].fillna("").astype(str) + " " +
        df["level_norm"].fillna("").astype(str)
    ).tolist()

# 3 - Criação do pipeline para treino

In [39]:
pipelines = {}
for name, corpus_fn in {
    "v1_title": build_corpus_v1,
    "v2_title_subject": build_corpus_v2,
    "v3_title_subject_level": build_corpus_v3,
}.items():
    corpus = corpus_fn(df)
    vec = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), min_df=2)
    mat = vec.fit_transform(corpus)
    pipelines[name] = {"vectorizer": vec, "matrix": mat}
    
print("Pipelines criadas:", list(pipelines.keys()))

Pipelines criadas: ['v1_title', 'v2_title_subject', 'v3_title_subject_level']


# 4 - Funções para avaliação e recomendação

In [None]:
def recomendar_cursos(texto_busca: str, df: pd.DataFrame, pipelines: dict, nome_pipeline: str, top_k: int = 5) -> pd.DataFrame:
    pipe = pipelines[nome_pipeline]
    vec  = pipe["vectorizer"]
    mat  = pipe["matrix"]

    query_vec = vec.transform([texto_busca])
    sims = cosine_similarity(query_vec, mat).flatten()
    idxs = sims.argsort()[::-1][:top_k]

    # colunas que vamos mostrar
    cols = ["course_title"]
    if "url" in df.columns:
        cols.append("url")
    if "num_subscribers" in df.columns:
        cols.append("num_subscribers")
    if "price" in df.columns:
        cols.append("price")

    recs = df.iloc[idxs][cols].copy()
    recs = recs.assign(score=sims[idxs])
    return recs


In [19]:
def avaliar_pipeline_por_popularidade(df: pd.DataFrame,
                                      pipelines: dict,
                                      nome_pipeline: str,
                                      consultas: list[str],
                                      k: int = 5) -> float:
    resultados = []
    for q in consultas:
        recs = recomendar_cursos(q, df, pipelines, nome_pipeline, top_k=k)

        if "popularity_score" in recs.columns:
            pop_mean = recs["popularity_score"].mean()
        elif "num_subscribers" in recs.columns:
            pop_mean = recs["num_subscribers"].mean()
        else:
            pop_mean = np.nan

        resultados.append(pop_mean)

    return float(np.nanmean(resultados))


# 5 - Execução da avaliação da pipeline

In [23]:
consultas_teste = [
    "python",
    "data science",
    "excel",
    "business",
    "graphic design"
]

avaliacoes = []
for nome in pipelines.keys():
    score = avaliar_pipeline_por_popularidade(
        df,
        pipelines,
        nome_pipeline=nome,
        consultas=consultas_teste,
        k=5
    )
    avaliacoes.append({
        "pipeline": nome,
        "pop_score@5": score
    })

avaliacoes_df = pd.DataFrame(avaliacoes).sort_values("pop_score@5", ascending=False)
display(avaliacoes_df)


Unnamed: 0,pipeline,pop_score@5
0,v1_title,4866.56
2,v3_title_subject_level,4067.0
1,v2_title_subject,3516.92


In [34]:
melhor_pipeline = avaliacoes_df.iloc[0]["pipeline"]
print("Melhor pipeline:", melhor_pipeline)


Melhor pipeline: v1_title


# 6 - Execução da recomendação com base no melhor modelo

In [37]:
consultas_demo = ["python", "excel", "web development"]

resultados_demo = {}
for q in consultas_teste:
    resultados_demo[q] = recomendar_cursos(
        q,
        df,
        pipelines,
        nome_pipeline=melhor_pipeline,
        top_k=5
    )


In [38]:
for q, recs in resultados_demo.items():
    print(f"\n=== Recomendação para: {q} ===")
    display(recs)



=== Recomendação para: python ===


Unnamed: 0,course_title,url,num_subscribers,price,score
2673,Python for Beginners: Python Programming Langu...,https://www.udemy.com/python-course/,6153,150,0.595088
538,Python for Trading & Investing,https://www.udemy.com/python-for-trading-inves...,638,95,0.516324
3497,Fun and creative web engineering with Python a...,https://www.udemy.com/web-engineering-with-pyt...,10917,0,0.507469
2491,Web Programming with Python,https://www.udemy.com/web-programming-with-pyt...,35267,50,0.501808
3129,Complete Python Web Course: Build 8 Python Web...,https://www.udemy.com/the-complete-python-web-...,7489,110,0.479602



=== Recomendação para: data science ===


Unnamed: 0,course_title,url,num_subscribers,price,score
480,Visualizing Data,https://www.udemy.com/visualizing-data/,149,40,1.0
3263,Big Data and Apache Hadoop for Developers - Fu...,https://www.udemy.com/learn-big-data-and-apach...,1154,30,0.551362
3566,Display and analyze GIS data on the web with L...,https://www.udemy.com/display-and-analyze-gis-...,25,100,0.476481
131,Excel functions to analyze and visualize data,https://www.udemy.com/basic-excel-functions-to...,2283,20,0.430785
3340,Implementing a Data Warehouse with SQL Server ...,https://www.udemy.com/implementing-a-data-ware...,1142,85,0.379716



=== Recomendação para: excel ===


Unnamed: 0,course_title,url,num_subscribers,price,score
1501,Photoshop: Automatiza invitaciones con bloc de...,https://www.udemy.com/curso-photoshop-automati...,86,30,0.823171
237,Excel Dashboard - Interactive Excel Dashboard ...,https://www.udemy.com/interactive-charts-in-ex...,843,30,0.752812
60,Excel Crash Course: Master Excel for Financial...,https://www.udemy.com/excel-crash-course-maste...,8121,105,0.548346
741,Financial Ratios Using Excel,https://www.udemy.com/financialratios/,1223,100,0.514421
132,Building Financial Statements in Excel,https://www.udemy.com/guide-to-building-financ...,1181,35,0.478903



=== Recomendação para: business ===


Unnamed: 0,course_title,url,num_subscribers,price,score
522,Decide whether to Invest in a Business,https://www.udemy.com/decide-whether-to-invest...,1,20,0.624116
560,Budgeting for Business,https://www.udemy.com/budgeting-for-business/,78,40,0.567339
375,Business finances,https://www.udemy.com/business-finances-learn-...,4695,75,0.525071
525,How to Obtain a Business Loan,https://www.udemy.com/how-to-obtain-a-business...,1,20,0.525071
569,Small Business Owners: Drive a Productive Busi...,https://www.udemy.com/small-business-owners-dr...,5099,20,0.509968



=== Recomendação para: graphic design ===


Unnamed: 0,course_title,url,num_subscribers,price,score
1588,Learn Real Graphic Design,https://www.udemy.com/learn-real-design/,297,20,0.813862
1261,Diventa un professionista del Graphic Design,https://www.udemy.com/diventa-un-professionist...,185,200,0.808877
1219,Graphic Design Masterclass: Learn Graphic Desi...,https://www.udemy.com/graphic-design/,6858,95,0.796267
1408,Graphic Design - An Overview of the Field,https://www.udemy.com/graphic-design-secrets/,23229,0,0.776488
1707,Graphic Design: Beginner's Graphic Design Guid...,https://www.udemy.com/how-to-use-adobe-photosh...,4550,25,0.758827


# 7 - Salvando artefatos

In [None]:
# salvar ranking de pipelines
REPORTS_DIR.mkdir(parents=True, exist_ok=True)
rank_path = REPORTS_DIR / "ranking_pipelines.csv"
avaliacoes_df.to_csv(rank_path, index=False)
print("Ranking salvo em:", rank_path)

# montar o pacote do modelo
best_model_obj = {
    "pipeline_name": melhor_pipeline,
    "vectorizer": pipelines[melhor_pipeline]["vectorizer"],
    "matrix": pipelines[melhor_pipeline]["matrix"],
    "df_columns": df.columns.tolist(),
    "metadata": {
        "consultas_usadas": consultas_teste,
        "metric": "pop_score@5",
        "score": float(avaliacoes_df.iloc[0]["pop_score@5"])
    }
}

MODELS_DIR.mkdir(parents=True, exist_ok=True)
best_model_pkl = MODELS_DIR / "best_pipeline.pkl"

with open(best_model_pkl, "wb") as f:
    pickle.dump(best_model_obj, f)

print("Modelo salvo em:", best_model_pkl)

# salvar exemplo de recomendação
exemplo = recomendar_cursos("python", df, pipelines, nome_pipeline=melhor_pipeline, top_k=5)
exemplo.to_csv(REPORTS_DIR / "exemplo_recomendacao_python.csv", index=False)
print("Exemplo de recomendação salvo em:", REPORTS_DIR / "exemplo_recomendacao_python.csv")


Ranking salvo em: C:\Users\lsbar\OneDrive\Mackenzie\Projeto Aplicado III\Entrega 3\Projeto\reports\ranking_pipelines.csv
Modelo salvo em: C:\Users\lsbar\OneDrive\Mackenzie\Projeto Aplicado III\Entrega 3\Projeto\models\best_pipeline.pkl
Exemplo de recomendação salvo em: C:\Users\lsbar\OneDrive\Mackenzie\Projeto Aplicado III\Entrega 3\Projeto\reports\exemplo_recomendacao_python.csv
