In [None]:
### Questão 7 - Considere o powerlifting database (powerlifting-database)
##### a) Vetorize as variáveis categóricas usando One-hot Encoding. Apresente os resultados obtidos.

In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder


def carregar_dataset(caminho="../powerlift/openpowerlifting.csv"):
    """Load a CSV file into a DataFrame and print its first three rows.

    Args:
        caminho: Path of the CSV file to read. Defaults to the powerlifting
            CSV used by this notebook, so ``carregar_dataset()`` keeps
            working exactly as before.

    Returns:
        The DataFrame produced by ``pd.read_csv(caminho)``.
    """
    df = pd.read_csv(caminho)
    print("\n=== Amostra Original ===")
    print(df.head(3))
    return df


def aplicar_one_hot_encoding(df, colunas_categoricas):
    """One-hot encode the given columns and return the combined frame.

    Every category of every column in ``colunas_categoricas`` becomes its
    own indicator column (``drop=None`` keeps all levels). The encoded
    columns replace the originals and are appended after the remaining
    columns. The first three rows of the result are printed.

    Args:
        df: Input DataFrame; it is not modified.
        colunas_categoricas: List of column names to encode.

    Returns:
        A new DataFrame with a fresh 0..n-1 index (the input index is
        discarded by ``reset_index(drop=True)``).
    """
    codificador = OneHotEncoder(drop=None, sparse_output=False)
    matriz = codificador.fit_transform(df[colunas_categoricas])
    nomes = codificador.get_feature_names_out(colunas_categoricas)
    parte_codificada = pd.DataFrame(matriz, columns=nomes)

    # Reset the index so both parts align row by row in the concat below.
    parte_restante = df.drop(columns=colunas_categoricas).reset_index(drop=True)
    df_resultado = pd.concat([parte_restante, parte_codificada], axis=1)

    print("\n=== Após One Hot Encoding ===", df_resultado.head(3), sep="\n")
    return df_resultado


# Item (a): one-hot encode 'Sex' and 'Equipment' on the first 500 rows only.
df = carregar_dataset().head(n=500)
colunas_categoricas = ['Sex', 'Equipment']
df_one_hot = aplicar_one_hot_encoding(df=df, colunas_categoricas=colunas_categoricas)



=== Amostra Original ===
   MeetID              Name Sex   Equipment   Age     Division  BodyweightKg  \
0       0  Angie Belk Terry   F       Wraps  47.0    Mst 45-49         59.60   
1       0       Dawn Bogart   F  Single-ply  42.0    Mst 40-44         58.51   
2       0       Dawn Bogart   F  Single-ply  42.0  Open Senior         58.51   

  WeightClassKg  Squat4Kg  BestSquatKg  Bench4Kg  BestBenchKg  Deadlift4Kg  \
0            60       NaN        47.63       NaN        20.41          NaN   
1            60       NaN       142.88       NaN        95.25          NaN   
2            60       NaN       142.88       NaN        95.25          NaN   

   BestDeadliftKg  TotalKg Place   Wilks  
0           70.31   138.35     1  155.05  
1          163.29   401.42     1  456.38  
2          163.29   401.42     1  456.38  

=== Após One Hot Encoding ===
   MeetID              Name   Age     Division  BodyweightKg WeightClassKg  \
0       0  Angie Belk Terry  47.0    Mst 45-49         59.6

### Questão 7 - Considere o powerlifting database (powerlifting-database)
##### b) Vetorize as variáveis categóricas usando Dummy Coding. Compare os resultados desta vetorização com aqueles obtidos no item (a)

In [3]:

def aplicar_dummy_encoding(df, colunas_categoricas):
    """Dummy-code the given columns, dropping the first level of each.

    Uses ``pd.get_dummies`` with ``drop_first=True``, so a column with k
    categories yields k-1 indicator columns. The first three rows of the
    result are printed.

    Args:
        df: Input DataFrame; it is not modified.
        colunas_categoricas: List of column names to encode.

    Returns:
        A new DataFrame with the listed columns replaced by their
        indicator columns.
    """
    resultado = pd.get_dummies(data=df, columns=colunas_categoricas, drop_first=True)
    print("\n=== Após Dummy Encoding ===", resultado.head(3), sep="\n")
    return resultado


# Item (b): dummy-code 'Sex', 'Equipment' and 'Place' on the full dataset.
# NOTE(review): item (a) encodes only 'Sex' and 'Equipment' on the first 500
# rows, so the two results are not directly comparable -- confirm whether the
# same rows and columns should be used here.
df = carregar_dataset()
colunas_categoricas = ['Sex', 'Equipment', 'Place']
df_dummy = aplicar_dummy_encoding(df=df, colunas_categoricas=colunas_categoricas)


=== Amostra Original ===
   MeetID              Name Sex   Equipment   Age     Division  BodyweightKg  \
0       0  Angie Belk Terry   F       Wraps  47.0    Mst 45-49         59.60   
1       0       Dawn Bogart   F  Single-ply  42.0    Mst 40-44         58.51   
2       0       Dawn Bogart   F  Single-ply  42.0  Open Senior         58.51   

  WeightClassKg  Squat4Kg  BestSquatKg  Bench4Kg  BestBenchKg  Deadlift4Kg  \
0            60       NaN        47.63       NaN        20.41          NaN   
1            60       NaN       142.88       NaN        95.25          NaN   
2            60       NaN       142.88       NaN        95.25          NaN   

   BestDeadliftKg  TotalKg Place   Wilks  
0           70.31   138.35     1  155.05  
1          163.29   401.42     1  456.38  
2          163.29   401.42     1  456.38  

=== Após Dummy Encoding ===
   MeetID              Name   Age     Division  BodyweightKg WeightClassKg  \
0       0  Angie Belk Terry  47.0    Mst 45-49         59.60 

# Questão 10 - Lung Cancer prediction dataset - Aplique o algoritmo de PCA e selecione os componentes principais (aqui, 2 componentes)

In [5]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from IPython.display import display

def carregar_dataset(caminho="../lung_db/LungCancerDataset.csv"):
    """Load a CSV file into a DataFrame and display its first three rows.

    NOTE: this definition reuses the name of the loader defined in the
    powerlifting cell earlier in the notebook. Once this cell has run,
    ``carregar_dataset()`` with no argument reads the lung cancer CSV.

    Args:
        caminho: Path of the CSV file to read. Defaults to the lung cancer
            CSV used by this notebook, so ``carregar_dataset()`` keeps
            working exactly as before.

    Returns:
        The DataFrame produced by ``pd.read_csv(caminho)``.
    """
    df = pd.read_csv(caminho)
    print("\n=== Amostra Original ===")
    display(df.head(3))
    return df

def aplicar_pca(df, n_componentes):
    """Standardize the numeric columns of ``df`` and project them with PCA.

    The ``GENDER`` column is dropped when present, non-numeric columns are
    ignored, and the remaining columns are scaled with ``StandardScaler``
    before the PCA is fitted. A preview of the projected data and the
    explained-variance ratio of each component are shown.

    Args:
        df: Input DataFrame; it is not modified.
        n_componentes: Forwarded unchanged to ``PCA(n_components=...)``.
            Per the scikit-learn documentation this may be an int, a float
            in (0, 1), ``'mle'`` or ``None``; the output columns are named
            from the number of components actually produced.

    Returns:
        Tuple ``(df_pca, pca)``: the projected data as a DataFrame with
        columns ``PC1..PCk`` and the fitted ``PCA`` object.
    """
    features = df.drop(columns=['GENDER'], errors='ignore')
    numericas = features.select_dtypes(include=[np.number])
    features_normalizadas = StandardScaler().fit_transform(numericas)
    pca = PCA(n_components=n_componentes, whiten=False)
    componentes_principais = pca.fit_transform(features_normalizadas)

    # Derive the column count from the fitted output rather than from the
    # argument: range(n_componentes) fails when the argument is not an int.
    n_efetivo = componentes_principais.shape[1]
    df_pca = pd.DataFrame(componentes_principais, columns=[f'PC{i+1}' for i in range(n_efetivo)])
    print(f"\n=== Dados após PCA com {n_efetivo} componentes ===")
    display(df_pca.head(3))
    # The values printed below are explained-variance *ratios*, so the label
    # says so (and matches the output recorded in the notebook).
    print("Variância explicada por cada componente:")
    print(pca.explained_variance_ratio_)
    return df_pca, pca

# Questão 10: load the lung cancer data and keep the first two principal components.
df = carregar_dataset()
df_pca, pca_model = aplicar_pca(df=df, n_componentes=2)



=== Amostra Original ===


Unnamed: 0,AGE,GENDER,SMOKING,FINGER_DISCOLORATION,MENTAL_STRESS,EXPOSURE_TO_POLLUTION,LONG_TERM_ILLNESS,ENERGY_LEVEL,IMMUNE_WEAKNESS,BREATHING_ISSUE,ALCOHOL_CONSUMPTION,THROAT_DISCOMFORT,OXYGEN_SATURATION,CHEST_TIGHTNESS,FAMILY_HISTORY,SMOKING_FAMILY_HISTORY,STRESS_IMMUNE,PULMONARY_DISEASE
0,68,1,1,1,1,1,0,57.831178,0,0,1,1,95.977287,1,0,0,0,NO
1,81,1,1,0,0,1,1,47.694835,1,1,0,1,97.184483,0,0,0,0,YES
2,58,1,1,0,0,0,0,59.577435,0,1,1,0,94.974939,0,0,0,0,NO



=== Dados após PCA com 2 componentes ===


Unnamed: 0,PC1,PC2
0,-0.610103,-0.397012
1,-0.463975,-0.024817
2,-0.717411,-1.131849



Variância explicada por cada componente:
[0.1096558  0.10524464]
