In [13]:
# ============================================
# 1. IMPORTAÇÕES E CARREGAMENTO DO DATASET
# ============================================

import os
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Cleaned dataset produced by the EDA / preparation step.
# The project directory can be overridden with the NOTEBOOKS_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset (or empty) the original location is used.
DF_PATH = (
    Path(os.environ.get("NOTEBOOKS_DIR") or "/Users/iloop/Desktop/notebooks")
    / "udemy_courses_clean.csv"
)

df = pd.read_csv(DF_PATH)
print("Dimensões do dataset:", df.shape)
df.head()


Dimensões do dataset: (3668, 23)


Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,profit,published_date,published_time,year,month,day,content_duration_hours,paid_label,published_dt,level_norm,subject_norm
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5 hours,2017-01-18T20:58:58Z,Business Finance,429400,2017-01-18,20:58:58Z,2017.0,1.0,18.0,1.5,Pago,2017-01-18 20:58:58+00:00,all levels,business finance
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39 hours,2017-03-09T16:34:20Z,Business Finance,209400,2017-03-09,16:34:20Z,2017.0,3.0,9.0,39.0,Pago,2017-03-09 16:34:20+00:00,all levels,business finance
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5 hours,2016-12-19T19:26:30Z,Business Finance,97830,2016-12-19,19:26:30Z,2016.0,12.0,19.0,2.5,Pago,2016-12-19 19:26:30+00:00,intermediate level,business finance
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3 hours,2017-05-30T20:07:24Z,Business Finance,232845,2017-05-30,20:07:24Z,2017.0,5.0,30.0,3.0,Pago,2017-05-30 20:07:24+00:00,all levels,business finance
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2 hours,2016-12-13T14:57:18Z,Business Finance,255200,2016-12-13,14:57:18Z,2016.0,12.0,13.0,2.0,Pago,2016-12-13 14:57:18+00:00,intermediate level,business finance


In [14]:
# ============================================
# 2. CONSTRUÇÃO DOS CORPORA DE TEXTO
# ============================================

def build_corpus_v1(df: pd.DataFrame) -> list[str]:
    """Build the title-only corpus: one document per row of ``df``.

    Missing titles are replaced by an empty string and every value is cast
    to ``str`` before the column is converted to a plain list.
    """
    titles = df["course_title"].fillna("")
    return titles.astype(str).tolist()

def build_corpus_v2(df: pd.DataFrame) -> list[str]:
    """Build the "title + normalized subject" corpus.

    Each document is ``"<course_title> <subject_norm>"``. Missing values in
    either column become empty strings, so the single-space separator is
    always present even when one side is empty.
    """
    titles = df["course_title"].fillna("").astype(str)
    subjects = df["subject_norm"].fillna("").astype(str)
    documents = titles + " " + subjects
    return documents.tolist()

def build_corpus_v3(df: pd.DataFrame) -> list[str]:
    """Build the "title + normalized subject + normalized level" corpus.

    Each document is ``"<course_title> <subject_norm> <level_norm>"``.
    Missing values become empty strings; the single-space separators are
    kept regardless.
    """
    columns = ("course_title", "subject_norm", "level_norm")
    pieces = [df[name].fillna("").astype(str) for name in columns]

    # Join left to right with one space between consecutive fields.
    documents = pieces[0]
    for piece in pieces[1:]:
        documents = documents + " " + piece
    return documents.tolist()


In [15]:
# ============================================
# 3. CRIAÇÃO DAS PIPELINES (TF-IDF)
# ============================================

# Corpus builder for each pipeline variant, keyed by pipeline name.
configs = {
    "v1_title": build_corpus_v1,
    "v2_title_subject": build_corpus_v2,
    "v3_title_subject_level": build_corpus_v3,
}

# Every variant gets its own vectorizer fitted on its own corpus; the fitted
# vectorizer and the resulting document-term matrix are stored together so a
# query can later be projected into the same feature space.
pipelines: dict[str, dict] = {}

for nome, fn_corpus in configs.items():
    corpus = fn_corpus(df)
    vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), min_df=2)
    matriz = vectorizer.fit_transform(corpus)
    pipelines[nome] = {"vectorizer": vectorizer, "matrix": matriz}

print("Pipelines criadas:", list(pipelines.keys()))


Pipelines criadas: ['v1_title', 'v2_title_subject', 'v3_title_subject_level']


In [16]:
# ============================================
# 4. FUNÇÃO DE RECOMENDAR CURSOS
# ============================================

def recomendar_cursos(
    texto_busca: str,
    df: pd.DataFrame,
    pipelines: dict,
    nome_pipeline: str,
    top_k: int = 5
) -> pd.DataFrame:
    """Return the ``top_k`` courses most similar to a free-text query.

    Parameters
    ----------
    texto_busca:
        Query text; it is transformed with the selected pipeline's vectorizer.
    df:
        Course table. Rows are selected by position, so row ``i`` of ``df``
        must correspond to row ``i`` of the pipeline matrix.
    pipelines:
        Mapping ``name -> {"vectorizer": ..., "matrix": ...}``.
    nome_pipeline:
        Key of the pipeline to use.
    top_k:
        Maximum number of rows to return; ``0`` yields an empty frame.

    Returns
    -------
    pd.DataFrame
        ``course_title`` plus whichever of ``url``, ``num_subscribers`` and
        ``price`` exist in ``df``, and a ``score`` column with the cosine
        similarity, ordered from most to least similar.

    Raises
    ------
    KeyError
        If ``nome_pipeline`` is not in ``pipelines``.
    ValueError
        If ``top_k`` is negative, or if ``df`` and the pipeline matrix do not
        have the same number of rows.
    """
    # A negative value would be interpreted by the slice below as "everything
    # except the last |top_k| rows", which is never what the caller wants.
    if top_k < 0:
        raise ValueError(f"top_k must be >= 0, got {top_k}")

    pipe = pipelines[nome_pipeline]
    vec = pipe["vectorizer"]
    mat = pipe["matrix"]

    # Positional lookup is only meaningful when both objects describe the
    # same rows; otherwise the wrong courses would be returned silently.
    if mat.shape[0] != len(df):
        raise ValueError(
            f"df has {len(df)} rows but pipeline '{nome_pipeline}' "
            f"was fitted on {mat.shape[0]} documents"
        )

    query_vec = vec.transform([texto_busca])
    sims = cosine_similarity(query_vec, mat).flatten()

    # Positions of the highest similarities, best first.
    idxs = sims.argsort()[::-1][:top_k]

    opcionais = ("url", "num_subscribers", "price")
    colunas_base = ["course_title"] + [c for c in opcionais if c in df.columns]

    recs = df.iloc[idxs][colunas_base].copy()
    recs["score"] = sims[idxs]

    return recs


In [17]:
# ============================================
# 5. AVALIAÇÃO DAS PIPELINES (pop_score@5)
# ============================================

def avaliar_pipeline_por_popularidade(
    df: pd.DataFrame,
    pipelines: dict,
    nome_pipeline: str,
    consultas: list[str],
    k: int = 5
) -> float:
    """Score a pipeline by the popularity of what it recommends.

    For each query the top-``k`` recommendations are fetched and the mean of
    their ``num_subscribers`` is taken; the function returns the mean of those
    per-query means, ignoring NaN entries. A query whose recommendations have
    no ``num_subscribers`` column contributes NaN.

    Note that this measures how popular the recommended courses are, not how
    relevant they are to the query.
    """
    def _media_inscritos(consulta: str) -> float:
        recomendados = recomendar_cursos(
            consulta, df, pipelines, nome_pipeline, top_k=k
        )
        if "num_subscribers" not in recomendados.columns:
            return np.nan
        return recomendados["num_subscribers"].mean()

    medias = [_media_inscritos(consulta) for consulta in consultas]
    return float(np.nanmean(medias))


In [18]:
# ============================================
# 6. EXECUÇÃO DA AVALIAÇÃO DAS PIPELINES
# ============================================

# Fixed set of probe queries applied to every pipeline.
consultas_teste = ["python", "data science", "excel", "business", "graphic design"]

# Collect one record per pipeline and build the frame once at the end.
avaliacoes = []
for nome in pipelines:
    score = avaliar_pipeline_por_popularidade(
        df, pipelines, nome_pipeline=nome, consultas=consultas_teste, k=5
    )
    avaliacoes.append({"pipeline": nome, "pop_score@5": score})

# Highest score first.
avaliacoes_df = pd.DataFrame(avaliacoes).sort_values("pop_score@5", ascending=False)
avaliacoes_df


Unnamed: 0,pipeline,pop_score@5
0,v1_title,4866.56
2,v3_title_subject_level,4142.56
1,v2_title_subject,3132.56


In [19]:
# ============================================
# 7. ESCOLHA DA MELHOR PIPELINE
# ============================================

# avaliacoes_df is sorted by pop_score@5 in descending order, so the first
# entry of its "pipeline" column is the top-ranked pipeline.
melhor_pipeline = avaliacoes_df["pipeline"].iloc[0]
print("Melhor pipeline:", melhor_pipeline)


Melhor pipeline: v1_title


In [20]:
# ============================================
# 8. EXEMPLOS DE RECOMENDAÇÃO COM A MELHOR PIPELINE
# ============================================

consultas_demo = ["python", "excel", "web development"]

# Top-5 recommendations from the selected pipeline, keyed by query text.
resultados_demo: dict[str, pd.DataFrame] = {
    consulta: recomendar_cursos(
        consulta, df, pipelines, nome_pipeline=melhor_pipeline, top_k=5
    )
    for consulta in consultas_demo
}

# Show each result table under a header naming its query.
for q, recs in resultados_demo.items():
    print(f"\n=== Recomendação para: {q} ===")
    display(recs)



=== Recomendação para: python ===


Unnamed: 0,course_title,url,num_subscribers,price,score
2673,Python for Beginners: Python Programming Langu...,https://www.udemy.com/python-course/,6153,150,0.595088
538,Python for Trading & Investing,https://www.udemy.com/python-for-trading-inves...,638,95,0.516324
3497,Fun and creative web engineering with Python a...,https://www.udemy.com/web-engineering-with-pyt...,10917,0,0.507469
2491,Web Programming with Python,https://www.udemy.com/web-programming-with-pyt...,35267,50,0.501808
3129,Complete Python Web Course: Build 8 Python Web...,https://www.udemy.com/the-complete-python-web-...,7489,110,0.479602



=== Recomendação para: excel ===


Unnamed: 0,course_title,url,num_subscribers,price,score
1501,Photoshop: Automatiza invitaciones con bloc de...,https://www.udemy.com/curso-photoshop-automati...,86,30,0.823171
237,Excel Dashboard - Interactive Excel Dashboard ...,https://www.udemy.com/interactive-charts-in-ex...,843,30,0.752812
60,Excel Crash Course: Master Excel for Financial...,https://www.udemy.com/excel-crash-course-maste...,8121,105,0.548346
741,Financial Ratios Using Excel,https://www.udemy.com/financialratios/,1223,100,0.514421
132,Building Financial Statements in Excel,https://www.udemy.com/guide-to-building-financ...,1181,35,0.478903



=== Recomendação para: web development ===


Unnamed: 0,course_title,url,num_subscribers,price,score
3081,The All-In-One Web Development Course,https://www.udemy.com/the-all-in-one-web-devel...,538,85,0.700017
2772,In Depth Web Development Made Easy,https://www.udemy.com/in-depth-web-development...,3249,25,0.669176
2600,Introduction to Web Development,https://www.udemy.com/introduction-to-web-deve...,5921,50,0.652711
3271,Introduction to Frontend Web Development For ...,https://www.udemy.com/programming-for-web-deve...,1018,145,0.635508
3553,Quick learning jQuery web development,https://www.udemy.com/quick-learning-jquery-we...,3388,150,0.623308


In [21]:
# ============================================
# 9. SALVAMENTO DOS ARTEFATOS (RANKING + MODELO)
# ============================================

# Output folders live next to the input dataset. Deriving them from DF_PATH
# keeps a single definition of the project location instead of repeating the
# absolute path in several cells.
MODELS_DIR = DF_PATH.parent / "models"
REPORTS_DIR = DF_PATH.parent / "reports"

MODELS_DIR.mkdir(parents=True, exist_ok=True)
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

# Pipeline ranking
rank_path = REPORTS_DIR / "ranking_pipelines.csv"
avaliacoes_df.to_csv(rank_path, index=False)

# Final model: the fitted vectorizer and document-term matrix of the winning
# pipeline, plus the evaluation context. Only the column names of df are
# stored, not the rows themselves.
best_model_obj = {
    "pipeline_name": melhor_pipeline,
    "vectorizer": pipelines[melhor_pipeline]["vectorizer"],
    "matrix": pipelines[melhor_pipeline]["matrix"],
    "df_columns": df.columns.tolist(),
    "metadata": {
        "metric": "pop_score@5",
        "score": float(avaliacoes_df.iloc[0]["pop_score@5"]),
        "consultas_usadas": consultas_teste,
    },
}

# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# this artifact from a trusted location.
model_path = MODELS_DIR / "best_pipeline.pkl"
with model_path.open("wb") as f:
    pickle.dump(best_model_obj, f)

# Specific example for the "python" query
exemplo_python = recomendar_cursos(
    "python", df, pipelines, nome_pipeline=melhor_pipeline, top_k=5
)
exemplo_path = REPORTS_DIR / "exemplo_recomendacao_python.csv"
exemplo_python.to_csv(exemplo_path, index=False)

print("Ranking salvo em:", rank_path)
print("Modelo salvo em:", model_path)
print("Exemplo de recomendação (python) salvo em:", exemplo_path)


Ranking salvo em: /Users/iloop/Desktop/notebooks/reports/ranking_pipelines.csv
Modelo salvo em: /Users/iloop/Desktop/notebooks/models/best_pipeline.pkl
Exemplo de recomendação (python) salvo em: /Users/iloop/Desktop/notebooks/reports/exemplo_recomendacao_python.csv
