In [3]:
# =====================================
# Projeto Aplicado III - Etapa 2
# Tratamento e Preparação da Base
# =====================================

# Importação das bibliotecas necessárias
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import MinMaxScaler

# Definição do caminho da base original
PATH_MAIN = "/Users/iloop/Desktop/Projeto/udemy_course_data.csv"

# Carregando a base de dados original
df = pd.read_csv(PATH_MAIN)

# Exibindo as primeiras linhas
df.head()


Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,profit,published_date,published_time,year,month,day
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5 hours,2017-01-18T20:58:58Z,Business Finance,429400,2017-01-18,20:58:58Z,2017,1,18
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39 hours,2017-03-09T16:34:20Z,Business Finance,209400,2017-03-09,16:34:20Z,2017,3,9
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5 hours,2016-12-19T19:26:30Z,Business Finance,97830,2016-12-19,19:26:30Z,2016,12,19
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3 hours,2017-05-30T20:07:24Z,Business Finance,232845,2017-05-30,20:07:24Z,2017,5,30
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2 hours,2016-12-13T14:57:18Z,Business Finance,255200,2016-12-13,14:57:18Z,2016,12,13


In [4]:
# =====================================
# 1. Seleção de atributos relevantes
# =====================================

# Mantendo apenas as colunas importantes para o sistema de recomendação
df = df[['course_title', 'subject', 'level', 'price', 'num_subscribers']]

# Conferindo resultado
df.head()


Unnamed: 0,course_title,subject,level,price,num_subscribers
0,Ultimate Investment Banking Course,Business Finance,All Levels,200,2147
1,Complete GST Course & Certification - Grow You...,Business Finance,All Levels,75,2792
2,Financial Modeling for Business Analysts and C...,Business Finance,Intermediate Level,45,2174
3,Beginner to Pro - Financial Analysis in Excel ...,Business Finance,All Levels,95,2451
4,How To Maximize Your Profits Trading Options,Business Finance,Intermediate Level,200,1276


In [5]:
# =====================================
# 2. Remover duplicados e tratar valores nulos
# =====================================

# Removendo duplicados
df = df.drop_duplicates()

# Conferindo se existem valores nulos
print(df.isnull().sum())

# Preenchendo valores nulos, se houver
df['level'] = df['level'].fillna("Unknown")
df['price'] = df['price'].fillna(0)
df['num_subscribers'] = df['num_subscribers'].fillna(0)


course_title       0
subject            0
level              0
price              0
num_subscribers    0
dtype: int64


In [6]:
# =====================================
# 3. Padronização de textos
# =====================================

# Função para limpar textos
def clean_text(text):
    text = text.lower()                        # minúsculas
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text) # remove caracteres especiais
    text = re.sub(r'\s+', ' ', text)           # remove espaços extras
    return text.strip()

# Aplicando no título dos cursos
df['course_title'] = df['course_title'].astype(str).apply(clean_text)

# Conferindo
df[['course_title']].head()


Unnamed: 0,course_title
0,ultimate investment banking course
1,complete gst course certification grow your ca...
2,financial modeling for business analysts and c...
3,beginner to pro financial analysis in excel 2017
4,how to maximize your profits trading options


In [7]:
# =====================================
# 4. Transformação de variáveis categóricas
# =====================================

# Transformando 'level' em variáveis dummies (one-hot encoding)
df = pd.get_dummies(df, columns=['level'], prefix='level')

# Transformando 'subject' em variáveis dummies também
df = pd.get_dummies(df, columns=['subject'], prefix='subject')

# Conferindo resultado
df.head()


Unnamed: 0,course_title,price,num_subscribers,level_52,level_All Levels,level_Beginner Level,level_Expert Level,level_Intermediate Level,subject_Business Finance,subject_Graphic Design,subject_Musical Instruments,subject_Web Development
0,ultimate investment banking course,200,2147,False,True,False,False,False,True,False,False,False
1,complete gst course certification grow your ca...,75,2792,False,True,False,False,False,True,False,False,False
2,financial modeling for business analysts and c...,45,2174,False,False,False,False,True,True,False,False,False
3,beginner to pro financial analysis in excel 2017,95,2451,False,True,False,False,False,True,False,False,False
4,how to maximize your profits trading options,200,1276,False,False,False,False,True,True,False,False,False


In [8]:
# =====================================
# 5. Normalização de variáveis numéricas
# =====================================

# Aplicando log1p em num_subscribers para reduzir escala
df['num_subscribers_log'] = np.log1p(df['num_subscribers'])

# Normalizando preço
scaler = MinMaxScaler()
df['price_norm'] = scaler.fit_transform(df[['price']])

# Conferindo
df[['num_subscribers', 'num_subscribers_log', 'price', 'price_norm']].head()


Unnamed: 0,num_subscribers,num_subscribers_log,price,price_norm
0,2147,7.672292,200,1.0
1,2792,7.934872,75,0.375
2,2174,7.684784,45,0.225
3,2451,7.804659,95,0.475
4,1276,7.152269,200,1.0


In [9]:
# =====================================
# 6. Salvando dataset final
# =====================================

OUTPUT_PATH = "/Users/iloop/Desktop/Projeto/udemy_cleaned_for_training.csv"

df.to_csv(OUTPUT_PATH, index=False)

print("Dataset tratado salvo em:", OUTPUT_PATH)


Dataset tratado salvo em: /Users/iloop/Desktop/Projeto/udemy_cleaned_for_training.csv
