#### 1. Importação de Bibliotecas e Carregamento de Dados

In [None]:
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

# Location of the raw obesity CSV (hosted on GitHub)
DATA_URL = 'https://raw.githubusercontent.com/erpereira/postech_fase4/refs/heads/main/data/Obesity.csv'

# Read the remote CSV into the working DataFrame used by the rest of the notebook
df = pd.read_csv(DATA_URL)

#### 2. Análise dos Dados e ETL

In [None]:
# Preview the first rows
display(df.head())

# Column dtypes and non-null counts.
# info() prints its report directly and returns None, so it must not be
# wrapped in display() (that would render a stray "None" below the report).
df.info()

# Basic descriptive statistics of the numeric columns
display(df.describe())

# Dataset size as (rows, columns)
display(df.shape)

# Total number of missing values across the whole frame
print(f"Possuem {df.isnull().sum().sum()} dados faltantes (nulos)")

# Frequency of each distinct Height value (shown as the cell result)
df['Height'].value_counts()

In [66]:
# Plain-text preview of the first rows
print(df.head())
# Frequency of each distinct Weight value; value_counts must be *called*,
# otherwise print() only shows the bound-method repr instead of the counts.
print(df['Weight'].value_counts())

   Gender  Age  Height  Weight family_history FAVC  FCVC  NCP       CAEC  \
0  Female   21    1.62    64.0            yes   no   2.0  3.0  Sometimes   
1  Female   21    1.52    56.0            yes   no   3.0  3.0  Sometimes   
2    Male   23    1.80    77.0            yes   no   2.0  3.0  Sometimes   
3    Male   27    1.80    87.0             no   no   3.0  3.0  Sometimes   
4    Male   22    1.78    89.8             no   no   2.0  1.0  Sometimes   

  SMOKE  CH2O  SCC  FAF  TUE        CALC                 MTRANS  \
0    no   2.0   no  0.0  1.0          no  Public_Transportation   
1   yes   3.0  yes  3.0  0.0   Sometimes  Public_Transportation   
2    no   2.0   no  2.0  1.0  Frequently  Public_Transportation   
3    no   2.0   no  2.0  0.0  Frequently                Walking   
4    no   2.0   no  0.0  0.0   Sometimes  Public_Transportation   

               Obesity  
0        Normal_Weight  
1        Normal_Weight  
2        Normal_Weight  
3   Overweight_Level_I  
4  Overweight_L

In [None]:
# [Age] Age in years.
# Cast the column to 64-bit integers. NOTE(review): if the raw column holds
# fractional values, astype('int64') drops the fraction rather than rounding —
# confirm that is the intended treatment.
df['Age'] = df['Age'].astype('int64')

In [None]:
# Round the body-measurement columns to two decimal places, element by element
column_names = ["Height", "Weight"]
for column_name in column_names:
    rounded_series = df[column_name].map(lambda measurement: round(measurement, 2))
    df[column_name] = rounded_series

In [None]:
# Encode the yes/no categorical columns as binary flags (yes -> 1, no -> 0)
map_yesno = {'no':0,'yes':1}
column_names = ["family_history","FAVC","SMOKE","SCC"]
for column_name in column_names:
    # Keep the cell idempotent: once a column is numeric it has already been
    # encoded, and mapping 0/1 through map_yesno again would turn every value
    # into NaN (none of them match the 'yes'/'no' keys).
    if pd.api.types.is_numeric_dtype(df[column_name]):
        continue
    df[column_name] = df[column_name].map(map_yesno)


#### 3. Criação de classes utilizadas na Pipeline

In [75]:
# Render the DataFrame in its current state to inspect the converted columns
display(df)

Unnamed: 0,Gender,Age,Height,Weight,family_history,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,Obesity
0,Female,21,1.62,64.00,1,0,2.0,3.0,Sometimes,0,2.000000,0,0.000000,1.000000,no,Public_Transportation,Normal_Weight
1,Female,21,1.52,56.00,1,0,3.0,3.0,Sometimes,1,3.000000,1,3.000000,0.000000,Sometimes,Public_Transportation,Normal_Weight
2,Male,23,1.80,77.00,1,0,2.0,3.0,Sometimes,0,2.000000,0,2.000000,1.000000,Frequently,Public_Transportation,Normal_Weight
3,Male,27,1.80,87.00,0,0,3.0,3.0,Sometimes,0,2.000000,0,2.000000,0.000000,Frequently,Walking,Overweight_Level_I
4,Male,22,1.78,89.80,0,0,2.0,1.0,Sometimes,0,2.000000,0,0.000000,0.000000,Sometimes,Public_Transportation,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Female,20,1.71,131.41,1,1,3.0,3.0,Sometimes,0,1.728139,0,1.676269,0.906247,Sometimes,Public_Transportation,Obesity_Type_III
2107,Female,21,1.75,133.74,1,1,3.0,3.0,Sometimes,0,2.005130,0,1.341390,0.599270,Sometimes,Public_Transportation,Obesity_Type_III
2108,Female,22,1.75,133.69,1,1,3.0,3.0,Sometimes,0,2.054193,0,1.414209,0.646288,Sometimes,Public_Transportation,Obesity_Type_III
2109,Female,24,1.74,133.35,1,1,3.0,3.0,Sometimes,0,2.852339,0,1.139107,0.586035,Sometimes,Public_Transportation,Obesity_Type_III


#### 3. Análise Univariada (Distribuições e Histogramas)

In [None]:
# Histograms of the numeric variables (one panel per column)
num_cols = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
df[num_cols].hist(bins=30, figsize=(12,8))
# Title spelling fixed: "Variáveis" (was "Varáveis")
plt.suptitle('Distribuição de Variáveis Numéricas')
plt.show()

# Boxplots to spot outliers: a 2x4 grid, one Axes per numeric column.
# The explicit fig/axes interface ties each plot to its own panel instead of
# relying on pyplot's implicit "current axes" state.
fig, axes = plt.subplots(2, 4, figsize=(12, 6))
for ax, col in zip(axes.flat, num_cols):
    sns.boxplot(y=df[col], ax=ax)
    ax.set_title(f'Boxplot de {col}')
fig.tight_layout()
plt.show()

# Skewness and kurtosis of each numeric column
print("Skewness:\n", df[num_cols].skew())
print("Kurtosis:\n", df[num_cols].kurtosis())

#### 4. Análise Categórica

In [None]:
# List the column labels of the DataFrame (shown as the cell result)
df.columns

In [None]:
# Categorical columns to break down by obesity class (adjust the list as needed)
cat_cols = ['Gender', 'family_history', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS', 'Obesity']
for col in cat_cols:
    # One count plot per column, bars split by the Obesity class
    plt.figure(figsize=(8, 4))
    sns.countplot(x=col, hue='Obesity', data=df)
    chart_title = f'Distribuição de {col} por Obesidade'
    plt.title(chart_title)
    plt.xticks(rotation=45)
    plt.show()

    # Raw frequencies of the same column, printed under its chart
    category_counts = df[col].value_counts()
    print(category_counts)


In [None]:
# One-hot encode the target with pandas.get_dummies(): each obesity class
# becomes its own 0/1 indicator column prefixed with "Obesity_".
df_encoded = pd.get_dummies(df, columns=['Obesity'], prefix='Obesity', dtype=int)
print("\nDataset após One-Hot Encoding:")
print(df_encoded.head())

# Collect every generated indicator column instead of hard-coding the names,
# so no class is left out of the correlation matrix (the previous fixed list
# omitted the Overweight_Level_I / Overweight_Level_II indicators).
obesity_dummy_cols = [name for name in df_encoded.columns if name.startswith('Obesity_')]

# Correlation matrix between numeric features and the class indicators
plt.figure(figsize=(10,8))
correlation_matrix = df_encoded[num_cols + obesity_dummy_cols].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm',center=0)
plt.title('Matriz de Correlação')
plt.show()

# Scatterplots for the numeric pairs, coloured by obesity class.
# The hue column must be present in the data passed to pairplot; the encoded
# frame no longer has 'Obesity' (get_dummies replaced it), so the original
# frame is used with the target column included explicitly.
sns.pairplot(df[num_cols + ['Obesity']], hue='Obesity')
plt.suptitle('Pairplot por Classe de Obesidade', y=1.02)
plt.show()