##  Tradução e limpeza
Objetivo:
Para facilitar a interface do usuário no Streamlit, vou traduzir o dataframe para o português e remover as colunas desnecessárias.

In [1]:
import matplotlib as matplotlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
import sklearn

from platform import python_version
from ydata_profiling import ProfileReport

from src.config import DADOS_ORIGINAIS, DADOS_LIMPOS
from src.graficos import PALETTE

# Default seaborn theme used by the plots in this notebook
sns.set_theme(palette="terrain")

# Show every column when a DataFrame is rendered
pd.set_option("display.max_columns", None)
# Render floats with two decimals instead of scientific notation
pd.set_option("display.float_format", lambda valor: f"{valor:.2f}")


# Libraries whose versions are reported below, keyed by display name
bibliotecas = {
    "Pandas": pd,
    "Matplotlib": matplotlib,
    "Seaborn": sns,
    "NumPy": np,
    "Scikit-Learn": sklearn,
}

# Title and table header, emitted one item per line
print(
    "Versões das bibliotecas:\n",
    f"{'':-^20} | {'':-^10}",
    f"{'Biblioteca':^20} | {'Versão':^10}",
    f"{'':-^20} | {'':-^10}",
    sep="\n",
)

# One row per library, ordered alphabetically by display name
for nome, biblioteca in sorted(bibliotecas.items()):
    print(f"{nome:<20} | {biblioteca.__version__:>10}")

print()
print(f"Versão do Python: {python_version()}")



Versões das bibliotecas:

-------------------- | ----------
     Biblioteca      |   Versão  
-------------------- | ----------
Matplotlib           |      3.9.2
NumPy                |     1.26.4
Pandas               |      2.2.3
Scikit-Learn         |      1.5.1
Seaborn              |     0.13.2

Versão do Python: 3.12.3


In [2]:
# Read the original dataset (CSV) from the path configured in src.config
df = pd.read_csv(DADOS_ORIGINAIS)
# Preview the first rows
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2


In [3]:
# Column dtypes and non-null counts of the original frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [4]:
# Summary statistics of the numeric columns
df.describe()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,MonthlyRate,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92,802.49,9.19,2.91,1.0,1024.87,2.72,65.89,2.73,2.06,2.73,6502.93,14313.1,2.69,15.21,3.15,2.71,80.0,0.79,11.28,2.8,2.76,7.01,4.23,2.19,4.12
std,9.14,403.51,8.11,1.02,0.0,602.02,1.09,20.33,0.71,1.11,1.1,4707.96,7117.79,2.5,3.66,0.36,1.08,0.0,0.85,7.78,1.29,0.71,6.13,3.62,3.22,3.57
min,18.0,102.0,1.0,1.0,1.0,1.0,1.0,30.0,1.0,1.0,1.0,1009.0,2094.0,0.0,11.0,3.0,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,465.0,2.0,2.0,1.0,491.25,2.0,48.0,2.0,1.0,2.0,2911.0,8047.0,1.0,12.0,3.0,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,802.0,7.0,3.0,1.0,1020.5,3.0,66.0,3.0,2.0,3.0,4919.0,14235.5,2.0,14.0,3.0,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,1157.0,14.0,4.0,1.0,1555.75,4.0,83.75,3.0,3.0,4.0,8379.0,20461.5,4.0,18.0,3.0,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1499.0,29.0,5.0,1.0,2068.0,4.0,100.0,4.0,5.0,4.0,19999.0,26999.0,9.0,25.0,4.0,4.0,80.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


In [5]:
# Summary (count / unique / top / freq) of the non-numeric columns
df.describe(exclude="number")

Unnamed: 0,Attrition,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,Over18,OverTime
count,1470,1470,1470,1470,1470,1470,1470,1470,1470
unique,2,3,3,6,2,9,3,1,2
top,No,Travel_Rarely,Research & Development,Life Sciences,Male,Sales Executive,Married,Y,No
freq,1233,1043,961,606,882,326,673,1470,1054


In [6]:
# Number of missing values per column
df.isnull().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [7]:
# Rows that are exact duplicates of an earlier row (the saved output is empty)
df[df.duplicated()]

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager


In [8]:
# Rows whose EmployeeNumber repeats the value of an earlier row (the saved output is empty)
df[df.duplicated('EmployeeNumber')]

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager


In [9]:
# Original (English) column names
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [10]:
# Column-name mapping (English -> Portuguese, abbreviated labels)
rename_columns = {
    "Age": "Idade",
    "Attrition": "Attrition",
    "BusinessTravel": "Viagens trab",
    "DailyRate": "Tarifa diária",
    "Department": "Departamento",
    "DistanceFromHome": "Distância casa",
    "Education": "Formação acad",
    "EducationField": "Área form",
    "EmployeeCount": "Cont func",
    "EmployeeNumber": "Nº func",
    "EnvironmentSatisfaction": "Satisf amb",
    "Gender": "Gênero",
    "HourlyRate": "Tarifa hora",
    "JobInvolvement": "Envolv trab",
    "JobLevel": "Nível cargo",
    "JobRole": "Cargo",
    "JobSatisfaction": "Satisf trab",
    "MaritalStatus": "Estado civil",
    "MonthlyIncome": "Renda mensal",
    "MonthlyRate": "Tarifa mensal",
    "NumCompaniesWorked": "Nº empresas trab",
    "Over18": "Maior de 18",
    "OverTime": "Hora extra",
    "PercentSalaryHike": "% aumento sal",
    "PerformanceRating": "Avaliação desemp",
    "RelationshipSatisfaction": "Satisf relac",
    "StandardHours": "Horas padrão",
    "StockOptionLevel": "Opc ações",
    "TotalWorkingYears": "Anos traba",
    "TrainingTimesLastYear": "Treinam ultm ano",
    "WorkLifeBalance": "Equil vida-trab",
    "YearsAtCompany": "Anos empresa",
    "YearsInCurrentRole": "Anos cargo atual",
    "YearsSinceLastPromotion": "Anos ult prom",
    "YearsWithCurrManager": "Anos gerente atual",
}

# `rename` already returns a new frame, so `df` stays untouched without an
# explicit copy. `errors="raise"` makes a mistyped key fail loudly instead of
# silently leaving that column untranslated.
df_port = df.rename(columns=rename_columns, errors="raise")

In [11]:
# Preview the frame after renaming the columns
df_port.head()

Unnamed: 0,Idade,Attrition,Viagens trab,Tarifa diária,Departamento,Distância casa,Formação acad,Área form,Cont func,Nº func,Satisf amb,Gênero,Tarifa hora,Envolv trab,Nível cargo,Cargo,Satisf trab,Estado civil,Renda mensal,Tarifa mensal,Nº empresas trab,Maior de 18,Hora extra,% aumento sal,Avaliação desemp,Satisf relac,Horas padrão,Opc ações,Anos traba,Treinam ultm ano,Equil vida-trab,Anos empresa,Anos cargo atual,Anos ult prom,Anos gerente atual
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2


In [12]:
# Column dtypes and non-null counts after the rename
df_port.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Idade               1470 non-null   int64 
 1   Attrition           1470 non-null   object
 2   Viagens trab        1470 non-null   object
 3   Tarifa diária       1470 non-null   int64 
 4   Departamento        1470 non-null   object
 5   Distância casa      1470 non-null   int64 
 6   Formação acad       1470 non-null   int64 
 7   Área form           1470 non-null   object
 8   Cont func           1470 non-null   int64 
 9   Nº func             1470 non-null   int64 
 10  Satisf amb          1470 non-null   int64 
 11  Gênero              1470 non-null   object
 12  Tarifa hora         1470 non-null   int64 
 13  Envolv trab         1470 non-null   int64 
 14  Nível cargo         1470 non-null   int64 
 15  Cargo               1470 non-null   object
 16  Satisf trab         1470

In [13]:
# Number of distinct values per column
df_port.nunique()

Idade                   43
Attrition                2
Viagens trab             3
Tarifa diária          886
Departamento             3
Distância casa          29
Formação acad            5
Área form                6
Cont func                1
Nº func               1470
Satisf amb               4
Gênero                   2
Tarifa hora             71
Envolv trab              4
Nível cargo              5
Cargo                    9
Satisf trab              4
Estado civil             3
Renda mensal          1349
Tarifa mensal         1427
Nº empresas trab        10
Maior de 18              1
Hora extra               2
% aumento sal           15
Avaliação desemp         2
Satisf relac             4
Horas padrão             1
Opc ações                4
Anos traba              40
Treinam ultm ano         7
Equil vida-trab          4
Anos empresa            37
Anos cargo atual        19
Anos ult prom           16
Anos gerente atual      18
dtype: int64

In [14]:
# Find the columns that hold a single distinct value
col_valores_unicos = [
    coluna
    for coluna, qtd_valores in df_port.nunique().items()
    if qtd_valores == 1
]
col_valores_unicos

['Cont func', 'Maior de 18', 'Horas padrão']

In [15]:
# Columns to discard because they add no relevant information:
# the single-valued columns plus 'Nº func'
col_drop = [*col_valores_unicos, "Nº func"]


In [16]:
# Remove the uninformative columns. `errors="ignore"` keeps this cell
# re-runnable: df_port overwrites itself, so a second run would otherwise
# raise KeyError because the columns are already gone.
df_port = df_port.drop(columns=col_drop, errors="ignore")

In [17]:
# Check the remaining columns after the drop
df_port.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 31 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Idade               1470 non-null   int64 
 1   Attrition           1470 non-null   object
 2   Viagens trab        1470 non-null   object
 3   Tarifa diária       1470 non-null   int64 
 4   Departamento        1470 non-null   object
 5   Distância casa      1470 non-null   int64 
 6   Formação acad       1470 non-null   int64 
 7   Área form           1470 non-null   object
 8   Satisf amb          1470 non-null   int64 
 9   Gênero              1470 non-null   object
 10  Tarifa hora         1470 non-null   int64 
 11  Envolv trab         1470 non-null   int64 
 12  Nível cargo         1470 non-null   int64 
 13  Cargo               1470 non-null   object
 14  Satisf trab         1470 non-null   int64 
 15  Estado civil        1470 non-null   object
 16  Renda mensal        1470

In [18]:
# Names of the object-dtype (text) columns
colunas_str = df_port.select_dtypes(include="object").columns
colunas_str

Index(['Attrition', 'Viagens trab', 'Departamento', 'Área form', 'Gênero',
       'Cargo', 'Estado civil', 'Hora extra'],
      dtype='object')

In [19]:
# Distinct values of 'Attrition'
df_port["Attrition"].unique()

array(['Yes', 'No'], dtype=object)

In [20]:
# Distinct values of 'Hora extra'
df_port['Hora extra'].unique()

array(['Yes', 'No'], dtype=object)

In [21]:
# Translate the Yes/No columns to Portuguese.
colunas_s_n = ["Attrition", "Hora extra"]
s_n = {
    "Yes": "Sim",
    "No": "Não",
}
for coluna in colunas_s_n:
    # `replace` (rather than `map`) leaves values that are not keys of the
    # mapping untouched, so re-running this cell on already-translated data
    # is a no-op instead of turning every value into NaN.
    df_port[coluna] = df_port[coluna].replace(s_n)
df_port.head()

Unnamed: 0,Idade,Attrition,Viagens trab,Tarifa diária,Departamento,Distância casa,Formação acad,Área form,Satisf amb,Gênero,Tarifa hora,Envolv trab,Nível cargo,Cargo,Satisf trab,Estado civil,Renda mensal,Tarifa mensal,Nº empresas trab,Hora extra,% aumento sal,Avaliação desemp,Satisf relac,Opc ações,Anos traba,Treinam ultm ano,Equil vida-trab,Anos empresa,Anos cargo atual,Anos ult prom,Anos gerente atual
0,41,Sim,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Sim,11,3,1,0,8,0,1,6,4,0,5
1,49,Não,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Não,23,4,4,1,10,3,3,10,7,1,7
2,37,Sim,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Sim,15,3,2,0,7,3,3,0,0,0,0
3,33,Não,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Sim,11,3,3,0,8,3,3,8,7,3,0
4,27,Não,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Não,12,3,4,1,6,3,3,2,2,2,2


In [22]:
# Distinct values of 'Viagens trab'
df_port["Viagens trab"].unique()

array(['Travel_Rarely', 'Travel_Frequently', 'Non-Travel'], dtype=object)

In [23]:
# Business-travel frequency labels (English -> Portuguese).
# NOTE(review): the saved output of the final preview shows 'Viaja_raramente' /
# 'Viaja_frequentemente', which this mapping does not produce — the stored
# outputs look stale; confirm which labels downstream code expects.
viagens = {
    "Travel_Rarely": "Raramente",
    "Travel_Frequently": "Frequentemente",
    "Non-Travel": "Não_viaja",
}

# `replace` (rather than `map`) leaves values that are not keys of the mapping
# untouched, so re-running this cell does not turn the column into NaN.
df_port["Viagens trab"] = df_port["Viagens trab"].replace(viagens)

In [24]:
# Distinct values of 'Departamento'
df_port['Departamento'].unique() 

array(['Sales', 'Research & Development', 'Human Resources'], dtype=object)

In [25]:
# Department labels (English -> Portuguese)
departamentos = {
    "Sales": "Vendas",
    "Research & Development": "Pesquisa e Desenvolvimento",
    "Human Resources": "Recursos Humanos",
}

# `replace` (rather than `map`) leaves values that are not keys of the mapping
# untouched, so re-running this cell does not turn the column into NaN.
df_port["Departamento"] = df_port["Departamento"].replace(departamentos)


In [26]:
# Distinct values of 'Área form'
df_port['Área form'].unique() 

array(['Life Sciences', 'Other', 'Medical', 'Marketing',
       'Technical Degree', 'Human Resources'], dtype=object)

In [27]:
# Education-field labels (English -> Portuguese)
area = {
    "Life Sciences": "Ciências biológicas",
    "Other": "Outros",
    "Medical": "Ciências médicas",
    "Marketing": "Marketing",
    "Technical Degree": "Formação técnica",
    "Human Resources": "Recursos Humanos",
}

# `replace` (rather than `map`) leaves values that are not keys of the mapping
# untouched, so re-running this cell does not turn the column into NaN.
df_port["Área form"] = df_port["Área form"].replace(area)


In [28]:
# Distinct values of 'Gênero'
df_port['Gênero'].unique() 

array(['Female', 'Male'], dtype=object)

In [29]:
# Gender labels (English -> Portuguese)
genero = {
    "Female": "Feminino",
    "Male": "Masculino",
}

# `replace` (rather than `map`) leaves values that are not keys of the mapping
# untouched, so re-running this cell does not turn the column into NaN.
df_port["Gênero"] = df_port["Gênero"].replace(genero)


In [30]:
# Distinct values of 'Cargo'
df_port["Cargo"].unique()

array(['Sales Executive', 'Research Scientist', 'Laboratory Technician',
       'Manufacturing Director', 'Healthcare Representative', 'Manager',
       'Sales Representative', 'Research Director', 'Human Resources'],
      dtype=object)

In [31]:
# Job-role labels (English -> Portuguese)
cargo = {
    "Sales Executive": "Executivo de vendas",
    "Research Scientist": "Cientista pesquisador",
    "Laboratory Technician": "Técnico de laboratório",
    "Manufacturing Director": "Diretor de produção",
    "Healthcare Representative": "Representante de saúde",
    "Manager": "Gerente",
    "Sales Representative": "Representante de vendas",
    "Research Director": "Diretor de pesquisa",
    "Human Resources": "Recursos Humanos",
}

# `replace` (rather than `map`) leaves values that are not keys of the mapping
# untouched, so re-running this cell does not turn the column into NaN.
df_port["Cargo"] = df_port["Cargo"].replace(cargo)


In [32]:
# Distinct values of 'Estado civil'
df_port["Estado civil"].unique()

array(['Single', 'Married', 'Divorced'], dtype=object)

In [33]:
# Marital-status labels (English -> Portuguese)
estado_civil = {
    "Single": "Solteiro",
    "Married": "Casado",
    "Divorced": "Divorciado",
}

# `replace` (rather than `map`) leaves values that are not keys of the mapping
# untouched, so re-running this cell does not turn the column into NaN.
df_port["Estado civil"] = df_port["Estado civil"].replace(estado_civil)


In [34]:
# Summary statistics of 'Distância casa' before the unit conversion
df_port["Distância casa"].describe()

count   1470.00
mean       9.19
std        8.11
min        1.00
25%        2.00
50%        7.00
75%       14.00
max       29.00
Name: Distância casa, dtype: float64

In [35]:
# Convert the home distance from miles to kilometres.
# NOTE(review): this assumes DistanceFromHome is recorded in miles — confirm
# against the dataset documentation.
KM_POR_MILHA = 1.60934
# Derive the value from the untouched original frame instead of rescaling
# df_port in place, so re-running this cell cannot apply the factor twice.
# df and df_port share the same row index, so the assignment aligns row by row.
df_port["Distância casa"] = df["DistanceFromHome"] * KM_POR_MILHA

In [36]:
# Summary statistics of 'Distância casa' after the unit conversion
df_port["Distância casa"].describe()

count   1470.00
mean      14.79
std       13.05
min        1.61
25%        3.22
50%       11.27
75%       22.53
max       46.67
Name: Distância casa, dtype: float64

In [37]:
# Preview the translated frame before saving
df_port.head()

Unnamed: 0,Idade,Attrition,Viagens trab,Tarifa diária,Departamento,Distância casa,Formação acad,Área form,Satisf amb,Gênero,Tarifa hora,Envolv trab,Nível cargo,Cargo,Satisf trab,Estado civil,Renda mensal,Tarifa mensal,Nº empresas trab,Hora extra,% aumento sal,Avaliação desemp,Satisf relac,Opc ações,Anos traba,Treinam ultm ano,Equil vida-trab,Anos empresa,Anos cargo atual,Anos ult prom,Anos gerente atual
0,41,Sim,Viaja_raramente,1102,Vendas,1.61,2,Ciências biológicas,2,Feminino,94,3,2,Executivo de vendas,4,Solteiro,5993,19479,8,Sim,11,3,1,0,8,0,1,6,4,0,5
1,49,Não,Viaja_frequentemente,279,Pesquisa e Desenvolvimento,12.87,1,Ciências biológicas,3,Masculino,61,2,2,Cientista pesquisador,2,Casado,5130,24907,1,Não,23,4,4,1,10,3,3,10,7,1,7
2,37,Sim,Viaja_raramente,1373,Pesquisa e Desenvolvimento,3.22,2,Outros,4,Masculino,92,2,1,Técnico de laboratório,3,Solteiro,2090,2396,6,Sim,15,3,2,0,7,3,3,0,0,0,0
3,33,Não,Viaja_frequentemente,1392,Pesquisa e Desenvolvimento,4.83,4,Ciências biológicas,4,Feminino,56,3,1,Cientista pesquisador,3,Casado,2909,23159,1,Sim,11,3,3,0,8,3,3,8,7,3,0
4,27,Não,Viaja_raramente,591,Pesquisa e Desenvolvimento,3.22,1,Ciências médicas,1,Masculino,40,3,1,Técnico de laboratório,2,Casado,3468,16632,9,Não,12,3,4,1,6,3,3,2,2,2,2


In [38]:
# Write the translated frame as Parquet to the path configured in src.config, without the index
df_port.to_parquet(DADOS_LIMPOS, index=False)