In [3]:
from pathlib import Path
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import MinMaxScaler

# Caminho principal onde você está trabalhando
PROJECT_ROOT = Path("/Users/iloop/Desktop/notebooks")

# Caminhos dos arquivos
EDA_OUTPUT = PROJECT_ROOT / "udemy_courses_clean.csv"
PREP_OUTPUT = PROJECT_ROOT / "udemy_cleaned_for_training.csv"

pd.set_option("display.max_columns", None)
pd.set_option("display.width", 120)

# Carrega o dataset limpo vindo do EDA
df = pd.read_csv(EDA_OUTPUT)

print("Dimensões iniciais:", df.shape)
df.head()


Dimensões iniciais: (3668, 23)


Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,profit,published_date,published_time,year,month,day,content_duration_hours,paid_label,published_dt,level_norm,subject_norm
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5 hours,2017-01-18T20:58:58Z,Business Finance,429400,2017-01-18,20:58:58Z,2017.0,1.0,18.0,1.5,Pago,2017-01-18 20:58:58+00:00,all levels,business finance
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39 hours,2017-03-09T16:34:20Z,Business Finance,209400,2017-03-09,16:34:20Z,2017.0,3.0,9.0,39.0,Pago,2017-03-09 16:34:20+00:00,all levels,business finance
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5 hours,2016-12-19T19:26:30Z,Business Finance,97830,2016-12-19,19:26:30Z,2016.0,12.0,19.0,2.5,Pago,2016-12-19 19:26:30+00:00,intermediate level,business finance
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3 hours,2017-05-30T20:07:24Z,Business Finance,232845,2017-05-30,20:07:24Z,2017.0,5.0,30.0,3.0,Pago,2017-05-30 20:07:24+00:00,all levels,business finance
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2 hours,2016-12-13T14:57:18Z,Business Finance,255200,2016-12-13,14:57:18Z,2016.0,12.0,13.0,2.0,Pago,2016-12-13 14:57:18+00:00,intermediate level,business finance


In [4]:
# Preenchimento de ausências em content_duration_hours
if "content_duration_hours" in df.columns:
    df["content_duration_hours"] = df["content_duration_hours"].fillna(
        df["content_duration_hours"].median()
    )

df.head()


Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,profit,published_date,published_time,year,month,day,content_duration_hours,paid_label,published_dt,level_norm,subject_norm
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5 hours,2017-01-18T20:58:58Z,Business Finance,429400,2017-01-18,20:58:58Z,2017.0,1.0,18.0,1.5,Pago,2017-01-18 20:58:58+00:00,all levels,business finance
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39 hours,2017-03-09T16:34:20Z,Business Finance,209400,2017-03-09,16:34:20Z,2017.0,3.0,9.0,39.0,Pago,2017-03-09 16:34:20+00:00,all levels,business finance
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5 hours,2016-12-19T19:26:30Z,Business Finance,97830,2016-12-19,19:26:30Z,2016.0,12.0,19.0,2.5,Pago,2016-12-19 19:26:30+00:00,intermediate level,business finance
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3 hours,2017-05-30T20:07:24Z,Business Finance,232845,2017-05-30,20:07:24Z,2017.0,5.0,30.0,3.0,Pago,2017-05-30 20:07:24+00:00,all levels,business finance
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2 hours,2016-12-13T14:57:18Z,Business Finance,255200,2016-12-13,14:57:18Z,2016.0,12.0,13.0,2.0,Pago,2016-12-13 14:57:18+00:00,intermediate level,business finance


In [5]:
cols_keep = [
    "course_id",
    "course_title",
    "url",
    "price",
    "is_paid",
    "num_subscribers",
    "num_reviews",
    "num_lectures",
    "content_duration_hours",
    "subject",
    "level",
    "subject_norm",
    "level_norm"
]

df = df[cols_keep].copy()
df.head()


Unnamed: 0,course_id,course_title,url,price,is_paid,num_subscribers,num_reviews,num_lectures,content_duration_hours,subject,level,subject_norm,level_norm
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,200,True,2147,23,51,1.5,Business Finance,All Levels,business finance,all levels
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,75,True,2792,923,274,39.0,Business Finance,All Levels,business finance,all levels
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,45,True,2174,74,51,2.5,Business Finance,Intermediate Level,business finance,intermediate level
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,95,True,2451,11,36,3.0,Business Finance,All Levels,business finance,all levels
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,200,True,1276,45,26,2.0,Business Finance,Intermediate Level,business finance,intermediate level


In [6]:
df["course_title"] = df["course_title"].astype(str).str.strip()
df["subject_norm"] = df["subject_norm"].astype(str).str.strip().str.lower()
df["level_norm"]   = df["level_norm"].astype(str).str.strip().str.lower()

df.head()


Unnamed: 0,course_id,course_title,url,price,is_paid,num_subscribers,num_reviews,num_lectures,content_duration_hours,subject,level,subject_norm,level_norm
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,200,True,2147,23,51,1.5,Business Finance,All Levels,business finance,all levels
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,75,True,2792,923,274,39.0,Business Finance,All Levels,business finance,all levels
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,45,True,2174,74,51,2.5,Business Finance,Intermediate Level,business finance,intermediate level
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,95,True,2451,11,36,3.0,Business Finance,All Levels,business finance,all levels
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,200,True,1276,45,26,2.0,Business Finance,Intermediate Level,business finance,intermediate level


In [7]:
if "num_subscribers" in df.columns:
    min_subs = df["num_subscribers"].min()
    max_subs = df["num_subscribers"].max()
    df["popularity_score"] = (df["num_subscribers"] - min_subs) / (max_subs - min_subs)

df[["num_subscribers","popularity_score"]].head()


Unnamed: 0,num_subscribers,popularity_score
0,2147,0.007984
1,2792,0.010382
2,2174,0.008084
3,2451,0.009114
4,1276,0.004745


In [8]:
df.to_csv(PREP_OUTPUT, index=False)
print("Dataset pré-processado salvo em:", PREP_OUTPUT)


Dataset pré-processado salvo em: /Users/iloop/Desktop/notebooks/udemy_cleaned_for_training.csv


In [9]:
df = df[["course_title", "subject", "level", "price", "num_subscribers"]].copy()
df.head()


Unnamed: 0,course_title,subject,level,price,num_subscribers
0,Ultimate Investment Banking Course,Business Finance,All Levels,200,2147
1,Complete GST Course & Certification - Grow You...,Business Finance,All Levels,75,2792
2,Financial Modeling for Business Analysts and C...,Business Finance,Intermediate Level,45,2174
3,Beginner to Pro - Financial Analysis in Excel ...,Business Finance,All Levels,95,2451
4,How To Maximize Your Profits Trading Options,Business Finance,Intermediate Level,200,1276


In [10]:
df = df.drop_duplicates()

df["level"] = df["level"].fillna("Unknown")
df["price"] = df["price"].fillna(0)
df["num_subscribers"] = df["num_subscribers"].fillna(0)

df.isnull().sum()


course_title       0
subject            0
level              0
price              0
num_subscribers    0
dtype: int64

In [11]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

df["course_title"] = df["course_title"].astype(str).apply(clean_text)

df.head()


Unnamed: 0,course_title,subject,level,price,num_subscribers
0,ultimate investment banking course,Business Finance,All Levels,200,2147
1,complete gst course certification grow your ca...,Business Finance,All Levels,75,2792
2,financial modeling for business analysts and c...,Business Finance,Intermediate Level,45,2174
3,beginner to pro financial analysis in excel 2017,Business Finance,All Levels,95,2451
4,how to maximize your profits trading options,Business Finance,Intermediate Level,200,1276


In [12]:
df = pd.get_dummies(df, columns=["level"], prefix="level")
df = pd.get_dummies(df, columns=["subject"], prefix="subject")

df.head()


Unnamed: 0,course_title,price,num_subscribers,level_52,level_All Levels,level_Beginner Level,level_Expert Level,level_Intermediate Level,subject_Business Finance,subject_Graphic Design,subject_Musical Instruments,subject_Web Development
0,ultimate investment banking course,200,2147,False,True,False,False,False,True,False,False,False
1,complete gst course certification grow your ca...,75,2792,False,True,False,False,False,True,False,False,False
2,financial modeling for business analysts and c...,45,2174,False,False,False,False,True,True,False,False,False
3,beginner to pro financial analysis in excel 2017,95,2451,False,True,False,False,False,True,False,False,False
4,how to maximize your profits trading options,200,1276,False,False,False,False,True,True,False,False,False


In [13]:
df["num_subscribers_log"] = np.log1p(df["num_subscribers"])

scaler = MinMaxScaler()
df["price_norm"] = scaler.fit_transform(df[["price"]])

df[["num_subscribers", "num_subscribers_log", "price", "price_norm"]].head()


Unnamed: 0,num_subscribers,num_subscribers_log,price,price_norm
0,2147,7.672292,200,1.0
1,2792,7.934872,75,0.375
2,2174,7.684784,45,0.225
3,2451,7.804659,95,0.475
4,1276,7.152269,200,1.0


In [14]:
df.to_csv(PREP_OUTPUT, index=False)
print("Dataset final salvo em:", PREP_OUTPUT)


Dataset final salvo em: /Users/iloop/Desktop/notebooks/udemy_cleaned_for_training.csv
