# PRÉ-PROCESSAMENTO - ESCALONAMENTO - SEPARAÇÃO

## Importando as bibliotecas e lendo o dataset

In [60]:
import pandas as pd
import numpy as np

In [61]:
df = pd.read_csv('../data/Course_Completion_Prediction.csv')
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
Student_ID,STU100000,STU100001,STU100002,STU100003,STU100004,STU100005,STU100006,STU100007,STU100008,STU100009,...,STU199990,STU199991,STU199992,STU199993,STU199994,STU199995,STU199996,STU199997,STU199998,STU199999
Name,Vihaan Patel,Arjun Nair,Aditya Bhardwaj,Krishna Singh,Krishna Nair,Rohan Reddy,Sai Nair,Krishna Desai,Vihaan Joshi,Vivaan Nair,...,Sakshi Desai,Sakshi Gupta,Isha Reddy,Meera Desai,Vivaan Mehta,Neha Singh,Kavya Nair,Neha Nair,Pooja Sharma,Rahul Patel
Gender,Male,Female,Female,Female,Female,Female,Male,Male,Male,Male,...,Female,Female,Male,Female,Male,Female,Female,Male,Female,Male
Age,19,17,34,29,19,21,22,23,17,32,...,30,26,30,20,26,34,24,17,28,29
Education_Level,Diploma,Bachelor,Master,Diploma,Master,Bachelor,Diploma,Master,Bachelor,Bachelor,...,Master,Master,Bachelor,Bachelor,Master,Bachelor,Bachelor,Master,Bachelor,Diploma
Employment_Status,Student,Student,Student,Employed,Self-Employed,Student,Employed,Employed,Student,Student,...,Employed,Self-Employed,Unemployed,Student,Student,Student,Self-Employed,Employed,Student,Student
City,Indore,Delhi,Chennai,Surat,Lucknow,Jaipur,Chennai,Hyderabad,Nagpur,Kolkata,...,Bhopal,Nagpur,Kolkata,Kolkata,Delhi,Hyderabad,Delhi,Ahmedabad,Ahmedabad,Bengaluru
Device_Type,Laptop,Laptop,Mobile,Mobile,Laptop,Mobile,Laptop,Mobile,Tablet,Laptop,...,Laptop,Mobile,Laptop,Tablet,Mobile,Mobile,Mobile,Laptop,Mobile,Laptop
Internet_Connection_Quality,Medium,Low,Medium,High,Medium,Medium,High,Low,High,Medium,...,Medium,High,High,Medium,Low,Medium,High,Medium,High,Low
Course_ID,C102,C106,C101,C105,C106,C102,C103,C101,C103,C104,...,C104,C102,C104,C105,C102,C104,C104,C107,C104,C105


In [62]:
df_transformed = df.copy()
df_transformed.dtypes

Student_ID                       object
Name                             object
Gender                           object
Age                               int64
Education_Level                  object
Employment_Status                object
City                             object
Device_Type                      object
Internet_Connection_Quality      object
Course_ID                        object
Course_Name                      object
Category                         object
Course_Level                     object
Course_Duration_Days              int64
Instructor_Rating               float64
Login_Frequency                   int64
Average_Session_Duration_Min      int64
Video_Completion_Rate           float64
Discussion_Participation          int64
Time_Spent_Hours                float64
Days_Since_Last_Login             int64
Notifications_Checked             int64
Peer_Interaction_Score          float64
Assignments_Submitted             int64
Assignments_Missed                int64


## Preprocessamento

### Apagando colunas desnecessárias

In [63]:
lista_colunas_apagar = ['Student_ID', 'Name', 'Course_ID', 'Enrollment_Date', 'City', 'Assignments_Missed', 'Rewatch_Count']
lista_unicos = set(lista_colunas_apagar)
print(f'{len(lista_unicos)} colunas unicas | {len(lista_colunas_apagar)} colunas totais')
#print(lista_unicos)

7 colunas unicas | 7 colunas totais


In [64]:
df_transformed = df_transformed.drop(columns=lista_colunas_apagar)
#df_transformed.head()

In [65]:
df_unicos = (
    df_transformed.nunique()
      .reset_index()
      .rename(columns={'index': 'coluna', 0: 'valores_unicos'})
      #.query('valores_unicos > 8')
      .sort_values('coluna')
)
df_unicos

Unnamed: 0,coluna,valores_unicos
1,Age,34
28,App_Usage_Percentage,101
19,Assignments_Submitted,11
12,Average_Session_Duration_Min,73
7,Category,5
32,Completed,2
9,Course_Duration_Days,8
8,Course_Level,3
6,Course_Name,8
16,Days_Since_Last_Login,76


In [66]:
df_transformed.dtypes

Gender                           object
Age                               int64
Education_Level                  object
Employment_Status                object
Device_Type                      object
Internet_Connection_Quality      object
Course_Name                      object
Category                         object
Course_Level                     object
Course_Duration_Days              int64
Instructor_Rating               float64
Login_Frequency                   int64
Average_Session_Duration_Min      int64
Video_Completion_Rate           float64
Discussion_Participation          int64
Time_Spent_Hours                float64
Days_Since_Last_Login             int64
Notifications_Checked             int64
Peer_Interaction_Score          float64
Assignments_Submitted             int64
Quiz_Attempts                     int64
Quiz_Score_Avg                  float64
Project_Grade                   float64
Progress_Percentage             float64
Payment_Mode                     object


In [67]:
pd.DataFrame(df_transformed)

Unnamed: 0,Gender,Age,Education_Level,Employment_Status,Device_Type,Internet_Connection_Quality,Course_Name,Category,Course_Level,Course_Duration_Days,...,Progress_Percentage,Payment_Mode,Fee_Paid,Discount_Used,Payment_Amount,App_Usage_Percentage,Reminder_Emails_Clicked,Support_Tickets_Raised,Satisfaction_Rating,Completed
0,Male,19,Diploma,Student,Laptop,Medium,Data Analysis with Python,Programming,Intermediate,60,...,70.8,Scholarship,No,No,1740,49,3,4,3.5,Completed
1,Female,17,Bachelor,Student,Laptop,Low,Machine Learning A-Z,Programming,Advanced,90,...,55.6,Credit Card,Yes,No,6147,86,0,0,4.5,Not Completed
2,Female,34,Master,Student,Mobile,Medium,Python Basics,Programming,Beginner,45,...,78.8,NetBanking,Yes,No,4280,85,1,0,5.0,Completed
3,Female,29,Diploma,Employed,Mobile,High,UI/UX Design Fundamentals,Design,Beginner,40,...,24.7,UPI,Yes,No,3812,42,2,3,3.8,Completed
4,Female,19,Master,Self-Employed,Laptop,Medium,Machine Learning A-Z,Programming,Advanced,90,...,64.9,Debit Card,Yes,Yes,5486,91,3,0,4.0,Completed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,Female,34,Bachelor,Student,Mobile,Medium,Digital Marketing Essentials,Marketing,Beginner,30,...,72.0,UPI,Yes,No,3285,100,3,2,3.8,Completed
99996,Female,24,Bachelor,Self-Employed,Mobile,High,Digital Marketing Essentials,Marketing,Beginner,30,...,62.7,UPI,Yes,No,3233,71,2,0,3.0,Not Completed
99997,Male,17,Master,Employed,Laptop,Medium,Statistics for Data Science,Math,Intermediate,50,...,57.1,UPI,Yes,Yes,3820,61,2,0,4.1,Not Completed
99998,Female,28,Bachelor,Student,Mobile,High,Digital Marketing Essentials,Marketing,Beginner,30,...,47.4,Credit Card,Yes,No,3347,78,1,0,4.7,Not Completed


### Transformando variáveis categóricas em numéricas

In [68]:
df_categoricas = df_transformed.select_dtypes(include=['object', 'category'])
df_categoricas.dtypes

Gender                         object
Education_Level                object
Employment_Status              object
Device_Type                    object
Internet_Connection_Quality    object
Course_Name                    object
Category                       object
Course_Level                   object
Payment_Mode                   object
Fee_Paid                       object
Discount_Used                  object
Completed                      object
dtype: object

In [69]:
df_unicos = (
    df_categoricas.nunique()
      .reset_index()
      .rename(columns={'index': 'coluna', 0: 'valores_unicos'})
      #.query('valores_unicos > 8')
      #.sort_values('coluna')
)
df_unicos

Unnamed: 0,coluna,valores_unicos
0,Gender,3
1,Education_Level,5
2,Employment_Status,4
3,Device_Type,3
4,Internet_Connection_Quality,3
5,Course_Name,8
6,Category,5
7,Course_Level,3
8,Payment_Mode,6
9,Fee_Paid,2


In [70]:
lista_unicos = [col for col in df_categoricas.columns]

print(f'{lista_unicos} {len(lista_unicos)}')



['Gender', 'Education_Level', 'Employment_Status', 'Device_Type', 'Internet_Connection_Quality', 'Course_Name', 'Category', 'Course_Level', 'Payment_Mode', 'Fee_Paid', 'Discount_Used', 'Completed'] 12


In [71]:
for col in lista_unicos:
   print(f'{col}: {df_transformed[col].unique()}')

Gender: ['Male' 'Female' 'Other']
Education_Level: ['Diploma' 'Bachelor' 'Master' 'HighSchool' 'PhD']
Employment_Status: ['Student' 'Employed' 'Self-Employed' 'Unemployed']
Device_Type: ['Laptop' 'Mobile' 'Tablet']
Internet_Connection_Quality: ['Medium' 'Low' 'High']
Course_Name: ['Data Analysis with Python' 'Machine Learning A-Z' 'Python Basics'
 'UI/UX Design Fundamentals' 'Introduction to AI'
 'Digital Marketing Essentials' 'Statistics for Data Science'
 'Excel for Business']
Category: ['Programming' 'Design' 'Marketing' 'Math' 'Business']
Course_Level: ['Intermediate' 'Advanced' 'Beginner']
Payment_Mode: ['Scholarship' 'Credit Card' 'NetBanking' 'UPI' 'Debit Card' 'Free']
Fee_Paid: ['No' 'Yes']
Discount_Used: ['No' 'Yes']
Completed: ['Completed' 'Not Completed']


In [72]:
import pickle
import os

os.makedirs('../pickles', exist_ok=True)

mapeamentos = {}
for col in lista_unicos:
    valores = sorted(df_transformed[col].dropna().unique()) # para o completed e o yes forem igual a 1 e o not completed e o no igual a 0
    mapeamentos[col] = {valor: i for i, valor in enumerate(valores)}

os.makedirs('../pickles/mapeamentos', exist_ok=True)

mapeamentos['Completed'] = {'Not Completed': 0, 'Completed': 1}
print(mapeamentos)
with open('../pickles/mapeamentos/mapeamentos.pkl', 'wb') as f:
    pickle.dump(mapeamentos, f)


{'Gender': {'Female': 0, 'Male': 1, 'Other': 2}, 'Education_Level': {'Bachelor': 0, 'Diploma': 1, 'HighSchool': 2, 'Master': 3, 'PhD': 4}, 'Employment_Status': {'Employed': 0, 'Self-Employed': 1, 'Student': 2, 'Unemployed': 3}, 'Device_Type': {'Laptop': 0, 'Mobile': 1, 'Tablet': 2}, 'Internet_Connection_Quality': {'High': 0, 'Low': 1, 'Medium': 2}, 'Course_Name': {'Data Analysis with Python': 0, 'Digital Marketing Essentials': 1, 'Excel for Business': 2, 'Introduction to AI': 3, 'Machine Learning A-Z': 4, 'Python Basics': 5, 'Statistics for Data Science': 6, 'UI/UX Design Fundamentals': 7}, 'Category': {'Business': 0, 'Design': 1, 'Marketing': 2, 'Math': 3, 'Programming': 4}, 'Course_Level': {'Advanced': 0, 'Beginner': 1, 'Intermediate': 2}, 'Payment_Mode': {'Credit Card': 0, 'Debit Card': 1, 'Free': 2, 'NetBanking': 3, 'Scholarship': 4, 'UPI': 5}, 'Fee_Paid': {'No': 0, 'Yes': 1}, 'Discount_Used': {'No': 0, 'Yes': 1}, 'Completed': {'Not Completed': 0, 'Completed': 1}}


In [73]:
os.makedirs('../mapeamentos', exist_ok=True)

dados = []
with open('../pickles/mapeamentos/mapeamentos.pkl', 'rb') as f:
    mapeamentos = pickle.load(f)

for col, mapa in mapeamentos.items():
    for valor, codigo in mapa.items():
        dados.append({
            'coluna': col,
            'valor_original': valor,
            'codigo_atribuido': codigo
        })

df_mapeamentos = pd.DataFrame(dados)
df_mapeamentos.to_csv('../mapeamentos/mapeamentos.csv', index=False)
df_mapeamentos


Unnamed: 0,coluna,valor_original,codigo_atribuido
0,Gender,Female,0
1,Gender,Male,1
2,Gender,Other,2
3,Education_Level,Bachelor,0
4,Education_Level,Diploma,1
5,Education_Level,HighSchool,2
6,Education_Level,Master,3
7,Education_Level,PhD,4
8,Employment_Status,Employed,0
9,Employment_Status,Self-Employed,1


In [74]:
with open('../pickles/mapeamentos/mapeamentos.pkl', 'rb') as f:
    mapeamentos = pickle.load(f)

df_transformed_numeric = df_transformed.copy()
#df_transformed_numerico = df_transformed.copy()
for col, mapa in mapeamentos.items():
    df_transformed_numeric[col] = df_transformed_numeric[col].map(mapa)
df_transformed_numeric


Unnamed: 0,Gender,Age,Education_Level,Employment_Status,Device_Type,Internet_Connection_Quality,Course_Name,Category,Course_Level,Course_Duration_Days,...,Progress_Percentage,Payment_Mode,Fee_Paid,Discount_Used,Payment_Amount,App_Usage_Percentage,Reminder_Emails_Clicked,Support_Tickets_Raised,Satisfaction_Rating,Completed
0,1,19,1,2,0,2,0,4,2,60,...,70.8,4,0,0,1740,49,3,4,3.5,1
1,0,17,0,2,0,1,4,4,0,90,...,55.6,0,1,0,6147,86,0,0,4.5,0
2,0,34,3,2,1,2,5,4,1,45,...,78.8,3,1,0,4280,85,1,0,5.0,1
3,0,29,1,0,1,0,7,1,1,40,...,24.7,5,1,0,3812,42,2,3,3.8,1
4,0,19,3,1,0,2,4,4,0,90,...,64.9,1,1,1,5486,91,3,0,4.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,34,0,2,1,2,1,2,1,30,...,72.0,5,1,0,3285,100,3,2,3.8,1
99996,0,24,0,1,1,0,1,2,1,30,...,62.7,5,1,0,3233,71,2,0,3.0,0
99997,1,17,3,0,0,2,6,3,2,50,...,57.1,5,1,1,3820,61,2,0,4.1,0
99998,0,28,0,2,1,0,1,2,1,30,...,47.4,0,1,0,3347,78,1,0,4.7,0


In [75]:
df_transformed_numeric.dtypes

Gender                            int64
Age                               int64
Education_Level                   int64
Employment_Status                 int64
Device_Type                       int64
Internet_Connection_Quality       int64
Course_Name                       int64
Category                          int64
Course_Level                      int64
Course_Duration_Days              int64
Instructor_Rating               float64
Login_Frequency                   int64
Average_Session_Duration_Min      int64
Video_Completion_Rate           float64
Discussion_Participation          int64
Time_Spent_Hours                float64
Days_Since_Last_Login             int64
Notifications_Checked             int64
Peer_Interaction_Score          float64
Assignments_Submitted             int64
Quiz_Attempts                     int64
Quiz_Score_Avg                  float64
Project_Grade                   float64
Progress_Percentage             float64
Payment_Mode                      int64


### Previsores e target


#### Feito manualmente

In [76]:
previsores0 = df_transformed_numeric.iloc[:, 0:len(df_transformed_numeric.columns) - 1].values

In [77]:
pd.DataFrame(previsores0).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
1,19.0,17.0,34.0,29.0,19.0,21.0,22.0,23.0,17.0,32.0,...,30.0,26.0,30.0,20.0,26.0,34.0,24.0,17.0,28.0,29.0
2,1.0,0.0,3.0,1.0,3.0,0.0,1.0,3.0,0.0,0.0,...,3.0,3.0,0.0,0.0,3.0,0.0,0.0,3.0,0.0,1.0
3,2.0,2.0,2.0,0.0,1.0,2.0,0.0,0.0,2.0,2.0,...,0.0,1.0,3.0,2.0,2.0,2.0,1.0,0.0,2.0,2.0
4,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,2.0,0.0,...,0.0,1.0,0.0,2.0,1.0,1.0,1.0,0.0,1.0,0.0
5,2.0,1.0,2.0,0.0,2.0,2.0,0.0,1.0,0.0,2.0,...,2.0,0.0,0.0,2.0,1.0,2.0,0.0,2.0,0.0,1.0
6,0.0,4.0,5.0,7.0,4.0,0.0,3.0,5.0,3.0,1.0,...,1.0,0.0,1.0,7.0,0.0,1.0,1.0,6.0,1.0,7.0
7,4.0,4.0,4.0,1.0,4.0,4.0,4.0,4.0,4.0,2.0,...,2.0,4.0,2.0,1.0,4.0,2.0,2.0,3.0,2.0,1.0
8,2.0,0.0,1.0,1.0,0.0,2.0,2.0,1.0,2.0,1.0,...,1.0,2.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0
9,60.0,90.0,45.0,40.0,90.0,60.0,75.0,45.0,75.0,30.0,...,30.0,60.0,30.0,40.0,60.0,30.0,30.0,50.0,30.0,40.0


In [78]:
previsores0.shape

(100000, 32)

In [79]:
pd.DataFrame(previsores0).dtypes

0     float64
1     float64
2     float64
3     float64
4     float64
5     float64
6     float64
7     float64
8     float64
9     float64
10    float64
11    float64
12    float64
13    float64
14    float64
15    float64
16    float64
17    float64
18    float64
19    float64
20    float64
21    float64
22    float64
23    float64
24    float64
25    float64
26    float64
27    float64
28    float64
29    float64
30    float64
31    float64
dtype: object

#### LabelEncoder e OneHotEncoder

In [80]:
from sklearn.preprocessing import LabelEncoder

In [81]:
print(len(df_transformed.columns))

33


##### LabelEncoder

Criar LabelEncoder para as variáveis categóricas se tornarem numéricas. Após isso, escalonar as variáveis numéricas.  
Não é necessário para variáveis numéricas.

In [82]:
df_previsores1 = df_transformed.iloc[:, 0:len(df_transformed.columns) - 1]
df_previsores1

Unnamed: 0,Gender,Age,Education_Level,Employment_Status,Device_Type,Internet_Connection_Quality,Course_Name,Category,Course_Level,Course_Duration_Days,...,Project_Grade,Progress_Percentage,Payment_Mode,Fee_Paid,Discount_Used,Payment_Amount,App_Usage_Percentage,Reminder_Emails_Clicked,Support_Tickets_Raised,Satisfaction_Rating
0,Male,19,Diploma,Student,Laptop,Medium,Data Analysis with Python,Programming,Intermediate,60,...,71.2,70.8,Scholarship,No,No,1740,49,3,4,3.5
1,Female,17,Bachelor,Student,Laptop,Low,Machine Learning A-Z,Programming,Advanced,90,...,42.5,55.6,Credit Card,Yes,No,6147,86,0,0,4.5
2,Female,34,Master,Student,Mobile,Medium,Python Basics,Programming,Beginner,45,...,87.9,78.8,NetBanking,Yes,No,4280,85,1,0,5.0
3,Female,29,Diploma,Employed,Mobile,High,UI/UX Design Fundamentals,Design,Beginner,40,...,51.4,24.7,UPI,Yes,No,3812,42,2,3,3.8
4,Female,19,Master,Self-Employed,Laptop,Medium,Machine Learning A-Z,Programming,Advanced,90,...,93.0,64.9,Debit Card,Yes,Yes,5486,91,3,0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,Female,34,Bachelor,Student,Mobile,Medium,Digital Marketing Essentials,Marketing,Beginner,30,...,74.1,72.0,UPI,Yes,No,3285,100,3,2,3.8
99996,Female,24,Bachelor,Self-Employed,Mobile,High,Digital Marketing Essentials,Marketing,Beginner,30,...,89.6,62.7,UPI,Yes,No,3233,71,2,0,3.0
99997,Male,17,Master,Employed,Laptop,Medium,Statistics for Data Science,Math,Intermediate,50,...,84.5,57.1,UPI,Yes,Yes,3820,61,2,0,4.1
99998,Female,28,Bachelor,Student,Mobile,High,Digital Marketing Essentials,Marketing,Beginner,30,...,71.7,47.4,Credit Card,Yes,No,3347,78,1,0,4.7


In [83]:
df_previsores1_categoricos = df_previsores1.select_dtypes(include=['object', 'category'])
df_previsores1_categoricos.columns

Index(['Gender', 'Education_Level', 'Employment_Status', 'Device_Type',
       'Internet_Connection_Quality', 'Course_Name', 'Category',
       'Course_Level', 'Payment_Mode', 'Fee_Paid', 'Discount_Used'],
      dtype='object')

In [84]:
# Lista de índices das colunas que são categóricas
colunas = list(df_previsores1.columns)
colunas_categoricas = list(df_previsores1_categoricos.columns)

print(colunas)
print(colunas_categoricas)

['Gender', 'Age', 'Education_Level', 'Employment_Status', 'Device_Type', 'Internet_Connection_Quality', 'Course_Name', 'Category', 'Course_Level', 'Course_Duration_Days', 'Instructor_Rating', 'Login_Frequency', 'Average_Session_Duration_Min', 'Video_Completion_Rate', 'Discussion_Participation', 'Time_Spent_Hours', 'Days_Since_Last_Login', 'Notifications_Checked', 'Peer_Interaction_Score', 'Assignments_Submitted', 'Quiz_Attempts', 'Quiz_Score_Avg', 'Project_Grade', 'Progress_Percentage', 'Payment_Mode', 'Fee_Paid', 'Discount_Used', 'Payment_Amount', 'App_Usage_Percentage', 'Reminder_Emails_Clicked', 'Support_Tickets_Raised', 'Satisfaction_Rating']
['Gender', 'Education_Level', 'Employment_Status', 'Device_Type', 'Internet_Connection_Quality', 'Course_Name', 'Category', 'Course_Level', 'Payment_Mode', 'Fee_Paid', 'Discount_Used']


In [85]:
indices_categoricos = []
for col in colunas_categoricas:
    if col in colunas:
        indices_categoricos.append(colunas.index(col))
print(indices_categoricos)

[0, 2, 3, 4, 5, 6, 7, 8, 24, 25, 26]


In [86]:
previsores1 = df_previsores1.values

In [87]:
previsores1

array([['Male', 19, 'Diploma', ..., 3, 4, 3.5],
       ['Female', 17, 'Bachelor', ..., 0, 0, 4.5],
       ['Female', 34, 'Master', ..., 1, 0, 5.0],
       ...,
       ['Male', 17, 'Master', ..., 2, 0, 4.1],
       ['Female', 28, 'Bachelor', ..., 1, 0, 4.7],
       ['Male', 29, 'Diploma', ..., 1, 0, 3.9]],
      shape=(100000, 32), dtype=object)

In [88]:
for i in indices_categoricos:
   previsores1[:,i] = LabelEncoder().fit_transform(previsores1[:,i])
previsores1

array([[1, 19, 1, ..., 3, 4, 3.5],
       [0, 17, 0, ..., 0, 0, 4.5],
       [0, 34, 3, ..., 1, 0, 5.0],
       ...,
       [1, 17, 3, ..., 2, 0, 4.1],
       [0, 28, 0, ..., 1, 0, 4.7],
       [1, 29, 1, ..., 1, 0, 3.9]], shape=(100000, 32), dtype=object)

In [89]:
pd.DataFrame(previsores1).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,1,19,1,2,0,2,0,4,2,60,...,71.2,70.8,4,0,0,1740,49,3,4,3.5
1,0,17,0,2,0,1,4,4,0,90,...,42.5,55.6,0,1,0,6147,86,0,0,4.5
2,0,34,3,2,1,2,5,4,1,45,...,87.9,78.8,3,1,0,4280,85,1,0,5.0
3,0,29,1,0,1,0,7,1,1,40,...,51.4,24.7,5,1,0,3812,42,2,3,3.8
4,0,19,3,1,0,2,4,4,0,90,...,93.0,64.9,1,1,1,5486,91,3,0,4.0


##### **OneHotEncoder - Criação de variáveis Dummy (fictícia)**

>⚠️**Cuidado** com multicolinearidade (variáveis altamente corelacionadas entre si)

Variáveis dummy (fictícias) são criadas para evitar multicolinearidade. Multicolinearidade ocorre quando duas ou mais variáveis independentes em um modelo de regressão estão altamente correlacionadas, o que pode distorcer os resultados do modelo. Assim é possível eliminar uma das variáveis dummy para cada categoria, reduzindo a redundância e melhorando a interpretabilidade do modelo.

<p><strong>Cuidado com a multicolinearidade</strong> (variáveis altamente correlacionadas entre si).</p>

<table style="border:1px solid #ccc; border-collapse:collapse;">
<tr>
<td style="padding:10px; vertical-align:top;">
<strong>Você faz atividade física?</strong>
</td>
<td style="padding:10px;">
<table style="border-collapse:collapse;">
<tr><td><b>A = 0</b></td><td>Não.</td></tr>
<tr><td><b>B = 1</b></td><td>Sim, um ou dois dias por semana.</td></tr>
<tr><td><b>C = 2</b></td><td>Sim, três ou quatro dias por semana.</td></tr>
<tr><td><b>D = 3</b></td><td>Sim, pelo menos cinco dias por semana.</td></tr>
</table>
</td>
</tr>
</table>

<pre>
A B C D
1 0 0 0
0 1 0 0
0 0 1 0
0 0 0 1
</pre>



In [90]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

**ColumnTransformer**  
É uma classe do scikit-learn que permite aplicar transformações diferentes para diferentes colunas de um DataFrame. 

Parâmetros principais:  
* name: nome da transformação
* transformer: tipo de transformador (OneHotEncoder, LabelEncoder, StandardScaler, etc.)
* columns: colunas que serão transformadas
* remainder: como lidar com colunas que não foram transformadas
  * 'drop': remove colunas que não foram transformadas (default)
  * 'passthrough': mantém colunas que não foram transformadas
* sparse: parâmentro de classificação de matriz esparsa (default=0.3)
* n_jobs: número de processos para paralelizar a transformação (default=None)
* transformes_weights: pesos das transformações
* verbose: default = False, se True, exibe a execução na tela.

In [91]:
previsores2 = ColumnTransformer(transformers=[('OneHot',OneHotEncoder(),indices_categoricos)], remainder='passthrough',verbose=True).fit_transform(previsores1)

[ColumnTransformer] ........ (1 of 2) Processing OneHot, total=   0.2s
[ColumnTransformer] ..... (2 of 2) Processing remainder, total=   0.0s


In [92]:
pd.DataFrame(previsores2).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
4,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,1740,6147,4280,3812,5486,4526,5682,4590,5055,2951,...,3537,0,0,4158,4709,3285,3233,3820,3347,4120
61,49,86,85,42,91,74,83,68,78,48,...,65,54,63,54,87,100,71,61,78,55
62,3,0,1,2,3,5,2,0,4,1,...,2,2,5,3,1,3,2,2,1,1
63,4,0,0,3,0,2,0,0,2,2,...,0,1,0,0,1,2,0,0,0,0


## Escalonamento

* Padronização: Utiliza a média e desvio padrão para transformar os dados em uma distribuição normal.
* Normalização: Utiliza valores máximos e mínimos para transformar os dados para que fiquem entre 0 e 1.

Escalonar no machine learning serve para colocar todas as variáveis na mesma escala, evitando que atributos com valores numericamente maiores tenham mais influência no modelo apenas por causa do tamanho dos números. Isso é essencial em algoritmos que usam distância ou gradiente, como KNN, SVM, regressões e redes neurais, pois melhora a estabilidade, a velocidade de treinamento e a qualidade das soluções encontradas. Sem escalonamento, o modelo pode aprender de forma distorcida, confundindo magnitude com importância real, enquanto em modelos baseados em árvores esse cuidado costuma ser menos necessário.


In [93]:
from sklearn.preprocessing import StandardScaler

In [94]:
previsores0_esc = StandardScaler().fit_transform(previsores0)

In [95]:
previsores0_esc

array([[ 0.89561131, -1.19488412, -0.05980173, ...,  0.42114253,
         3.28829207, -0.90189184],
       [-0.96277333, -1.55105614, -0.80666984, ..., -1.47205832,
        -0.91531426,  0.52486325],
       [-0.96277333,  1.47640604,  1.43393448, ..., -0.84099137,
        -0.91531426,  1.23824079],
       ...,
       [ 0.89561131, -1.55105614,  1.43393448, ..., -0.20992442,
        -0.91531426, -0.04583879],
       [-0.96277333,  0.40788998, -0.80666984, ..., -0.84099137,
        -0.91531426,  0.81021426],
       [ 0.89561131,  0.58597599, -0.05980173, ..., -0.84099137,
        -0.91531426, -0.3311898 ]], shape=(100000, 32))

In [96]:
pd.DataFrame(previsores0_esc).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
0,0.895611,-0.962773,-0.962773,-0.962773,-0.962773,-0.962773,0.895611,0.895611,0.895611,0.895611,...,-0.962773,-0.962773,0.895611,-0.962773,0.895611,-0.962773,-0.962773,0.895611,-0.962773,0.895611
1,-1.194884,-1.551056,1.476406,0.585976,-1.194884,-0.838712,-0.660626,-0.48254,-1.551056,1.120234,...,0.764062,0.051718,0.764062,-1.016798,0.051718,1.476406,-0.304454,-1.551056,0.40789,0.585976
2,-0.059802,-0.80667,1.433934,-0.059802,1.433934,-0.80667,-0.059802,1.433934,-0.80667,-0.80667,...,1.433934,1.433934,-0.80667,-0.80667,1.433934,-0.80667,-0.80667,1.433934,-0.80667,-0.059802
3,0.866303,0.866303,0.866303,-1.051655,-0.092676,0.866303,-1.051655,-1.051655,0.866303,0.866303,...,-1.051655,-0.092676,1.825283,0.866303,0.866303,0.866303,-0.092676,-1.051655,0.866303,0.866303
4,-1.257334,-1.257334,0.540321,0.540321,-1.257334,0.540321,-1.257334,0.540321,2.337976,-1.257334,...,-1.257334,0.540321,-1.257334,2.337976,0.540321,0.540321,0.540321,-1.257334,0.540321,-1.257334
5,0.934636,-0.164716,0.934636,-1.264068,0.934636,0.934636,-1.264068,-0.164716,-1.264068,0.934636,...,0.934636,-1.264068,-1.264068,0.934636,-0.164716,0.934636,-1.264068,0.934636,-1.264068,-0.164716
6,-1.449435,0.337603,0.784362,1.677881,0.337603,-1.449435,-0.109157,0.784362,-0.109157,-1.002675,...,-1.002675,-1.449435,-1.002675,1.677881,-1.449435,-1.002675,-1.002675,1.231121,-1.002675,1.677881
7,0.75927,0.75927,0.75927,-1.285828,0.75927,0.75927,0.75927,0.75927,0.75927,-0.604129,...,-0.604129,0.75927,-0.604129,-1.285828,0.75927,-0.604129,-0.604129,0.077571,-0.604129,-1.285828
8,1.125125,-1.957156,-0.416015,-0.416015,-1.957156,1.125125,1.125125,-0.416015,1.125125,-0.416015,...,-0.416015,1.125125,-0.416015,-0.416015,1.125125,-0.416015,-0.416015,1.125125,-0.416015,-0.416015
9,0.402599,1.878635,-0.335419,-0.581426,1.878635,0.402599,1.140617,-0.335419,1.140617,-1.073438,...,-1.073438,0.402599,-1.073438,-0.581426,0.402599,-1.073438,-1.073438,-0.089413,-1.073438,-0.581426


In [97]:
previsores1_esc = StandardScaler().fit_transform(previsores1)
previsores1_esc

array([[ 0.89561131, -1.19488412, -0.05980173, ...,  0.42114253,
         3.28829207, -0.90189184],
       [-0.96277333, -1.55105614, -0.80666984, ..., -1.47205832,
        -0.91531426,  0.52486325],
       [-0.96277333,  1.47640604,  1.43393448, ..., -0.84099137,
        -0.91531426,  1.23824079],
       ...,
       [ 0.89561131, -1.55105614,  1.43393448, ..., -0.20992442,
        -0.91531426, -0.04583879],
       [-0.96277333,  0.40788998, -0.80666984, ..., -0.84099137,
        -0.91531426,  0.81021426],
       [ 0.89561131,  0.58597599, -0.05980173, ..., -0.84099137,
        -0.91531426, -0.3311898 ]], shape=(100000, 32))

In [98]:
previsores2_esc = StandardScaler().fit_transform(previsores2)
previsores2_esc

array([[-1.00374702,  1.04461427, -0.14263833, ...,  0.42114253,
         3.28829207, -0.90189184],
       [ 0.99626697, -0.95729115, -0.14263833, ..., -1.47205832,
        -0.91531426,  0.52486325],
       [ 0.99626697, -0.95729115, -0.14263833, ..., -0.84099137,
        -0.91531426,  1.23824079],
       ...,
       [-1.00374702,  1.04461427, -0.14263833, ..., -0.20992442,
        -0.91531426, -0.04583879],
       [ 0.99626697, -0.95729115, -0.14263833, ..., -0.84099137,
        -0.91531426,  0.81021426],
       [-1.00374702,  1.04461427, -0.14263833, ..., -0.84099137,
        -0.91531426, -0.3311898 ]], shape=(100000, 65))

In [99]:
target = df_transformed_numeric.iloc[:, len(df_transformed_numeric.columns) - 1].values
target

array([1, 0, 1, ..., 0, 0, 0], shape=(100000,))

In [100]:
lista_previsores_sem_esc = [previsores0,previsores1,previsores2]
lista_previsores_esc = [previsores0_esc,previsores1_esc, previsores2_esc]

## Redução de Dimensionalidade

A redução de dimensionalidade em machine learning busca diminuir o número de variáveis de um conjunto de dados sem perder as informações mais relevantes. Isso torna o treinamento mais rápido, reduz o uso de recursos computacionais e ajuda a evitar overfitting, já que o modelo passa a focar no que realmente importa. Além disso, esse processo lida melhor com variáveis redundantes ou altamente correlacionadas, reduz o ruído e pode melhorar tanto a generalização quanto a interpretação dos resultados, principalmente quando se trabalha com bases grandes e complexas.


### Análise dos Componentes Principais (PCA)

* **Seleção das características:** seleciona os melhores atributos e utiliza sem transformações.
* **Extração de características:** Encontra relacionamentos dos melhores atributos e cria novos atributos.

É um algoritmo de aprendizado não supervisionado. Aplica-se em dados linearmente separáveis.

In [101]:
from sklearn.decomposition import PCA

In [102]:
pca = PCA(n_components=10)

In [103]:
lista_pca_sem_esc = []
lista_pca_esc = []

for i in range(len(lista_previsores_sem_esc)):
   if i == 0:
      previsores0_pca = pca.fit_transform(previsores0,)
      lista_pca_sem_esc.append(previsores0_pca)
   elif i == 1:
      previsores1_pca = pca.fit_transform(previsores1)
      lista_pca_sem_esc.append(previsores1_pca)
   elif i == 2:
      previsores2_pca = pca.fit_transform(previsores2)
      lista_pca_sem_esc.append(previsores2_pca)

for i in range(len(lista_previsores_esc)):
   if i == 0:
      previsores0_esc_pca = pca.fit_transform(previsores0_esc)
      lista_pca_esc.append(previsores0_esc_pca)
   elif i == 1:
      previsores1_esc_pca = pca.fit_transform(previsores1_esc)
      lista_pca_esc.append(previsores1_esc_pca)
   elif i == 2:
      previsores2_esc_pca = pca.fit_transform(previsores2_esc)
      lista_pca_esc.append(previsores2_esc_pca)

In [104]:
previsores2_esc_pca.shape

(100000, 10)

In [105]:
pca.explained_variance_ratio_

array([0.08299716, 0.06116598, 0.04779294, 0.04036857, 0.03663379,
       0.03457196, 0.03412799, 0.03085378, 0.030157  , 0.0293362 ])

In [106]:
print(f'O modelo explicou {pca.explained_variance_ratio_.sum()}% das {previsores0.shape[1]} variáveis originais.')

pca.explained_variance_ratio_.sum()

O modelo explicou 0.4280053817161597% das 32 variáveis originais.


np.float64(0.4280053817161597)

In [107]:
previsores1_pca = pca.fit_transform(previsores1)

In [108]:
print(f'O modelo explicou {pca.explained_variance_ratio_.sum()}% das {previsores1.shape[1]} variáveis originais.')

pca.explained_variance_ratio_.sum()

O modelo explicou 0.9999882477185812% das 32 variáveis originais.


np.float64(0.9999882477185812)

In [109]:
previsores2_pca = pca.fit_transform(previsores2)

In [110]:
print(f'O modelo explicou {pca.explained_variance_ratio_.sum()}% das {previsores2.shape[1]} variáveis originais.')

pca.explained_variance_ratio_.sum()

O modelo explicou 0.9999902788145614% das 65 variáveis originais.


np.float64(0.9999902788145614)

### Karnel PCA

É um algoritmo de aprendizado não supervisionado. Aplica-se em dados linearmente separáveis.

In [111]:
from sklearn.decomposition import KernelPCA

In [112]:
#kpca = KernelPCA(n_components=4, kernel='linear')

In [113]:
#previsores1_kernel = kpca.fit_transform(previsores1)

### Análise do Discriminante Linear (LDA)

Algoritmo de aprendizagem supervisionado, pois utiliza a classe como referência para seleção. Aplicado em situações com muitos atributos previsores e também com target com muitas classes.

In [114]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis(n_components=1)

previsores0_lda = lda.fit_transform(previsores0, target)


In [115]:
lista_lda_sem_esc = []
lista_lda_esc = []

for i in range(len(lista_previsores_sem_esc)):
   if i == 0:
      previsores0_lda = lda.fit_transform(previsores0, target)
      lista_lda_sem_esc.append(previsores0_lda)
   elif i == 1:
      previsores1_lda = lda.fit_transform(previsores1, target)
      lista_lda_sem_esc.append(previsores1_lda)
   else:
      previsores2_lda = lda.fit_transform(previsores2, target)
      lista_lda_sem_esc.append(previsores2_lda)

for i in range(len(lista_previsores_esc)):
   if i == 0:
      previsores0_lda_esc = lda.fit_transform(previsores0_esc, target)
      lista_lda_esc.append(previsores0_lda_esc)
   elif i == 1:
      previsores1_lda_esc = lda.fit_transform(previsores1_esc, target)
      lista_lda_esc.append(previsores1_lda_esc)
   else:
      previsores2_lda_esc = lda.fit_transform(previsores2_esc, target)
      lista_lda_esc.append(previsores2_lda_esc)
      
   

In [116]:
previsores0_lda

array([[ 0.15351522],
       [ 0.57008873],
       [ 1.71195477],
       ...,
       [ 0.19547673],
       [-0.47193456],
       [ 0.60342709]], shape=(100000, 1))

In [117]:
lda.explained_variance_ratio_

array([1.])

## Salvar os dados

In [118]:
import pickle
import os
lista_previsores = [previsores0,previsores1,previsores2]
lista_previsores_esc = [previsores0_esc,previsores1_esc,previsores2_esc]
os.makedirs('../pickles/previsores/sem_reducao/sem_escalonamento', exist_ok=True)
os.makedirs('../pickles/target', exist_ok=True)
os.makedirs('../pickles/previsores/sem_reducao/escalonados', exist_ok=True)
os.makedirs('../pickles/previsores/reduzidos/lda/sem_escalonamento', exist_ok=True)
os.makedirs('../pickles/previsores/reduzidos/pca/sem_escalonamento', exist_ok=True)
os.makedirs('../pickles/previsores/reduzidos/lda/escalonados', exist_ok=True)
os.makedirs('../pickles/previsores/reduzidos/pca/escalonados', exist_ok=True)

for i in range(len(lista_previsores_sem_esc)):
    with open(f'../pickles/previsores/sem_reducao/sem_escalonamento/previsores{str(i)}.pkl', 'wb') as f:
        pickle.dump(lista_previsores[i], f)

for i in range(len(lista_previsores_esc)):
    with open(f'../pickles/previsores/sem_reducao/escalonados/previsores{str(i)}_esc.pkl', 'wb') as f:
        pickle.dump(lista_previsores_esc[i], f)

for i in range(len(lista_pca_sem_esc)):
    with open(f'../pickles/previsores/reduzidos/pca/sem_escalonamento/previsores{str(i)}_red_pca.pkl', 'wb') as f:
        pickle.dump(lista_pca_sem_esc[i], f)

for i in range(len(lista_pca_esc)):
    with open(f'../pickles/previsores/reduzidos/pca/escalonados/previsores{str(i)}_red_pca_esc.pkl', 'wb') as f:
        pickle.dump(lista_pca_esc[i], f)

for i in range(len(lista_lda_sem_esc)):
    with open(f'../pickles/previsores/reduzidos/lda/sem_escalonamento/previsores{str(i)}_red_lda.pkl', 'wb') as f:
        pickle.dump(lista_lda_sem_esc[i], f)

for i in range(len(lista_lda_esc)):
    with open(f'../pickles/previsores/reduzidos/lda/escalonados/previsores{str(i)}_red_lda_esc.pkl', 'wb') as f:
        pickle.dump(lista_lda_esc[i], f)

with open('../pickles/target/target.pkl', 'wb') as f:
    pickle.dump(target, f)

**RESUMO DO PRÉ-PROCESSAMENTO, ESCALONAMENTO E SEPARAÇÃO**
* Target: variável de interesse, que se pretende prever;
* previsores: conjunto de variáveis previsoras com variáveis categóricas transformadas em numéricas manualmente e sem escalonamento;
* previsores0_esc: conjunto de variáveis previsoras com variáveis categóricas transformadas em numéricas manualmente e escalonadas;
* previsores1: conjunto de variáveis previsoras com variáveis categóricas transformadas em numéricas com LabelEncoder e sem escalonamento;
* previsores1_esc: conjunto de variáveis previsoras com variáveis categóricas transformadas em numéricas com LabelEncoder e escalonadas;
* previsores2: conjunto de variáveis previsoras transformadas com LabelEncoder e OneHotEncoder e sem escalonamento;
* previsores2_esc: conjunto de variáveis previsoras transformadas com LabelEncoder e OneHotEncoder e escalonadas;
* escalonamento: transformação dos dados para que fiquem na mesma escala;
