# Preparação inicial

Vamos investigar os datasets fornecidos para verificar quais são os que melhor se adequam aos objetivos e restrições do exercício. Com base nessa investigação, vamos selecionar alguns datasets para as etapas seguintes.

## Primeiro carregamos os datasets disponiveis

In [30]:
### Carregar as bibliotecas
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import numpy as np
import pandas as pd

# Relative paths of every CSV file to load; each entry is appended to
# base_url when the file is read.
dataset_list = [
    # Files at the root of the data directory
    "Abalone.csv",
    "Adult.csv",
    "Arrhythmia.csv",
    "Breast_cancer.csv",
    "Car.csv",
    "Credit.csv",
    "Dermatology.csv",
    "Diabetes.csv",
    "Ecoli.csv",
    "Glass.csv",
    "Image.csv",
    "Iris.csv",
    "Madelon.csv",
    "Movie_Dataset.csv",
    "Nursery.csv",
    "Semeion.csv",
    "SolarFlare.csv",
    "Waveform.csv",
    "Wine.csv",
    "Yeast.csv",
    # Files under the DELL/ subdirectory
    "DELL/Abalone.csv",
    "DELL/base-1.csv",
    "DELL/base-10.csv",
    "DELL/base-2.csv",
    "DELL/base-3.csv",
    "DELL/base-4.csv",
    "DELL/base-5.csv",
    "DELL/base-6.csv",
    "DELL/base-7.csv",
    "DELL/base-8.csv",
    "DELL/base-9.csv",
    # Files under the NoClass/ subdirectory
    "NoClass/Abalone_NoClass.csv",
    "NoClass/Adult_noClass.csv",
    "NoClass/Adult.csv",
    "NoClass/Arrhythmia_PCA.csv",
    "NoClass/Breast_cancer_2.csv",
    "NoClass/Breast_cancer2.csv",
    "NoClass/Breast-cancer3.csv",
    "NoClass/Diabetes_NoClass.csv",
    "NoClass/Diabetes_NoClassNormOfNumber.csv",
    "NoClass/Diabetes_Reduced.csv",
    "NoClass/Diabetes-labels.csv",
    "NoClass/Diabetes.csv",
    "NoClass/Image.csv",
    "NoClass/Nursery.csv",
    "NoClass/NurseryNominalToBinary_NoClass.csv",
    "NoClass/NurseryNominalToBinary.csv",
    "NoClass/Pessoa_transformada.csv",
    "NoClass/Pessoa.csv",
    "NoClass/Pessoa02.csv",
    "NoClass/PessoaNormBinary.csv"
]
# Common URL prefix shared by every entry of dataset_list
base_url = "https://raw.githubusercontent.com/gilcesarf/imd3002-202502/main/Aula08/data/"

### Read every dataset in dataset_list, keeping its name next to the loaded frame
dslist = []
for datasetName in dataset_list:
    print("Lendo ", datasetName)
    dslist.append(
        dict(
            name=datasetName,
            dataset=pd.read_csv(base_url + datasetName, encoding='utf-8'),
        )
    )


Lendo  Abalone.csv
Lendo  Adult.csv
Lendo  Arrhythmia.csv
Lendo  Breast_cancer.csv
Lendo  Car.csv
Lendo  Credit.csv
Lendo  Dermatology.csv
Lendo  Diabetes.csv
Lendo  Ecoli.csv
Lendo  Glass.csv
Lendo  Image.csv
Lendo  Iris.csv
Lendo  Madelon.csv
Lendo  Movie_Dataset.csv
Lendo  Nursery.csv
Lendo  Semeion.csv
Lendo  SolarFlare.csv
Lendo  Waveform.csv
Lendo  Wine.csv
Lendo  Yeast.csv
Lendo  DELL/Abalone.csv
Lendo  DELL/base-1.csv
Lendo  DELL/base-10.csv
Lendo  DELL/base-2.csv
Lendo  DELL/base-3.csv
Lendo  DELL/base-4.csv
Lendo  DELL/base-5.csv
Lendo  DELL/base-6.csv
Lendo  DELL/base-7.csv
Lendo  DELL/base-8.csv
Lendo  DELL/base-9.csv
Lendo  NoClass/Abalone_NoClass.csv
Lendo  NoClass/Adult_noClass.csv
Lendo  NoClass/Adult.csv
Lendo  NoClass/Arrhythmia_PCA.csv
Lendo  NoClass/Breast_cancer_2.csv
Lendo  NoClass/Breast_cancer2.csv
Lendo  NoClass/Breast-cancer3.csv
Lendo  NoClass/Diabetes_NoClass.csv
Lendo  NoClass/Diabetes_NoClassNormOfNumber.csv
Lendo  NoClass/Diabetes_Reduced.csv
Lendo  NoCla

## Depois descobrimos o conjunto de todos os tipos de todas as colunas dos datasets

In [31]:
# Collect the dtype name of every column of every loaded dataset; the set
# removes duplicates and sorting keeps the order stable between runs.
all_types = sorted(
    {str(tipo) for d in dslist for tipo in d["dataset"].dtypes}
)

all_types

['float64', 'int64', 'object']

## E agora fazemos a contagem de colunas para cada tipo em cada um dos datasets

In [32]:
# One summary row per dataset: the frame itself, its name, its shape and the
# number of columns of each dtype listed in all_types.
allInfo = pd.DataFrame(
    [
        {
            "Dataset": d["dataset"],
            "Nome": d["name"],
            "Linhas": len(d["dataset"].index),
            "Colunas": len(d["dataset"].columns),
            **{
                f"Tipos_{t}": (d["dataset"].dtypes.map(str) == t).sum()
                for t in all_types
            },
        }
        for d in dslist
    ]
)

# Show the summary without the column that holds the frames themselves
allInfo.drop(columns=["Dataset"])

Unnamed: 0,Nome,Linhas,Colunas,Tipos_float64,Tipos_int64,Tipos_object
0,Abalone.csv,4177,9,7,1,1
1,Adult.csv,16281,15,0,6,9
2,Arrhythmia.csv,452,261,194,67,0
3,Breast_cancer.csv,699,10,0,8,2
4,Car.csv,1728,7,0,0,7
5,Credit.csv,1000,21,0,7,14
6,Dermatology.csv,366,35,0,34,1
7,Diabetes.csv,768,9,2,6,1
8,Ecoli.csv,336,8,7,0,1
9,Glass.csv,214,10,9,0,1


## Marcamos os datasets que respeitam as restrições impostas pelo exercicio

Ou seja:
- com mais de 20 colunas
- com ao menos uma coluna de tipo object
- ao menos uma coluna numerica (int ou float)

Mas também adicionamos, por conta própria, qualquer dataset que possua mais de 200 colunas, independentemente de qualquer outro critério. A ideia é ter datasets com maiores quantidades de colunas para ver o desempenho dos algoritmos.

In [33]:
# Flag the datasets that satisfy the selection rules, using the count columns
# of allInfo:
#   - more than 20 columns, at least one object column and at least one
#     numeric (int or float) column, or
#   - more than 200 columns, whatever the column types are.
tem_object = allInfo.get("Tipos_object", 0) > 0
tem_numerica = (allInfo.get("Tipos_float64", 0) > 0) | (allInfo.get("Tipos_int64", 0) > 0)
atende_restricoes = (allInfo["Colunas"] > 20) & tem_object & tem_numerica

allInfo["candidato"] = atende_restricoes | (allInfo["Colunas"] > 200)

allInfo.loc[:, allInfo.columns != "Dataset"]


Unnamed: 0,Nome,Linhas,Colunas,Tipos_float64,Tipos_int64,Tipos_object,candidato
0,Abalone.csv,4177,9,7,1,1,False
1,Adult.csv,16281,15,0,6,9,False
2,Arrhythmia.csv,452,261,194,67,0,True
3,Breast_cancer.csv,699,10,0,8,2,False
4,Car.csv,1728,7,0,0,7,False
5,Credit.csv,1000,21,0,7,14,True
6,Dermatology.csv,366,35,0,34,1,True
7,Diabetes.csv,768,9,2,6,1,False
8,Ecoli.csv,336,8,7,0,1,False
9,Glass.csv,214,10,9,0,1,False


## Para finalizar a analise inicial, descartamos os datasets que não foram selecionados.

Deixamos agora apenas os datasets que serão efetivamente utilizados no exercicio.

In [34]:
# Keep only the rows flagged as candidates, renumber them from 0 and drop the
# helper flag, which is no longer needed afterwards.
allInfo = (
    allInfo.loc[allInfo["candidato"]]
    .reset_index(drop=True)
    .drop(columns=["candidato"])
)

allInfo.loc[:, allInfo.columns != "Dataset"]

Unnamed: 0,Nome,Linhas,Colunas,Tipos_float64,Tipos_int64,Tipos_object
0,Arrhythmia.csv,452,261,194,67,0
1,Credit.csv,1000,21,0,7,14
2,Dermatology.csv,366,35,0,34,1
3,Madelon.csv,2600,501,0,501,0
4,Semeion.csv,1593,257,0,257,0
5,NoClass/NurseryNominalToBinary.csv,12960,27,0,26,1


# Pré-processamento

Primeiramente precisamos aplicar o pré-processamento sobre as bases selecionadas. Vamos investigar os tipos de dados de cada dataset.

## Missing Values

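A verificação abaixo (baseada em `isnull`) não encontra missing values explícitos (NaN) nas bases selecionadas. Note, porém, que valores ausentes codificados como texto — como o `?` na coluna ATT34 de Dermatology.csv — não são detectados por ela e são tratados mais adiante: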

In [35]:
def mostrar_colunas_tipo_objeto(df: pd.DataFrame) -> "pd.DataFrame | None":
    """Report the object-dtype columns of *df* and show a sample of their rows.

    Prints how many object columns exist and their names, then shows up to 10
    randomly chosen rows (fixed seed, so the sample is reproducible).

    Args:
        df: Frame to inspect.

    Returns:
        A frame holding only the object-dtype columns of *df*, or ``None``
        when *df* has no such column.
    """
    object_cols = df.select_dtypes(include=["object"])

    if object_cols.shape[1] == 0:
        print("Nenhuma coluna do tipo object encontrada.")
        return None

    print(f"Colunas do tipo object ({object_cols.shape[1]} encontradas):")
    print(list(object_cols.columns))

    # `display` only exists inside IPython/Jupyter; fall back to `print` so the
    # function also works in a plain Python session.
    try:
        show = display
    except NameError:
        show = print

    # Cap the sample size: asking for more rows than the frame has raises
    # ValueError when sampling without replacement.
    sample_size = min(10, len(object_cols))
    show(object_cols.sample(sample_size, random_state=42))

    return object_cols

# For each selected dataset: report the columns that contain NaN values and
# show its object-dtype columns.
# NOTE: only real NaN/None entries are counted here; textual placeholders
# such as "?" are ordinary strings and are not reported as missing.
for row in allInfo.itertuples():
    dataset = row.Dataset
    print(row.Nome)

    # Per-column NaN counts, restricted to the columns that have at least one
    missing_values = dataset.isna().sum()
    missing_values = missing_values[missing_values > 0]

    if missing_values.empty:
        print("Sem Missing Values.")
    else:
        print("Missing Values:")
        print(missing_values)

    mostrar_colunas_tipo_objeto(dataset)

    # Visual separator between datasets
    print("\n\n#########\n")


Arrhythmia.csv
Sem Missing Values.
Nenhuma coluna do tipo object encontrada.


#########

Credit.csv
Sem Missing Values.
Colunas do tipo object (14 encontradas):
['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker', 'class']


Unnamed: 0,checking_status,credit_history,purpose,savings_status,employment,personal_status,other_parties,property_magnitude,other_payment_plans,housing,job,own_telephone,foreign_worker,class
521,<0,'existing paid',radio/tv,<100,1<=X<4,'female div/dep/mar',none,'real estate',none,own,skilled,none,yes,bad
737,<0,'existing paid','new car',100<=X<500,1<=X<4,'male single',none,car,none,own,'unskilled resident',yes,yes,good
740,<0,'all paid','new car',100<=X<500,4<=X<7,'male single',none,car,bank,own,skilled,none,yes,good
660,>=200,'existing paid',radio/tv,<100,1<=X<4,'male mar/wid',none,'real estate',none,rent,skilled,none,yes,good
411,'no checking','critical/other existing credit','used car',<100,4<=X<7,'male single',none,car,none,own,'high qualif/self emp/mgmt',yes,yes,good
678,<0,'existing paid',radio/tv,<100,>=7,'male single',none,'real estate',bank,rent,'unskilled resident',none,yes,good
626,>=200,'existing paid',furniture/equipment,<100,1<=X<4,'male single',none,'real estate',none,own,skilled,yes,yes,good
513,0<=X<200,'delayed previously',radio/tv,<100,1<=X<4,'male mar/wid','co applicant','real estate',none,rent,skilled,none,yes,good
859,'no checking','existing paid','new car',100<=X<500,1<=X<4,'male single',guarantor,'real estate',none,rent,skilled,none,no,good
136,'no checking','delayed previously','used car',>=1000,1<=X<4,'male single',none,car,none,own,skilled,none,yes,good




#########

Dermatology.csv
Sem Missing Values.
Colunas do tipo object (1 encontradas):
['ATT34']


Unnamed: 0,ATT34
193,18
33,?
15,40
310,50
57,29
183,20
76,33
119,0
152,46
126,8




#########

Madelon.csv
Sem Missing Values.
Nenhuma coluna do tipo object encontrada.


#########

Semeion.csv
Sem Missing Values.
Nenhuma coluna do tipo object encontrada.


#########

NoClass/NurseryNominalToBinary.csv
Sem Missing Values.
Colunas do tipo object (1 encontradas):
['class']


Unnamed: 0,class
6407,not_recom
6301,spec_prior
304,priority
12520,spec_prior
2417,not_recom
6312,priority
8246,not_recom
2655,priority
6249,priority
5206,priority




#########



## Limpando Dermatology.csv

Podemos observar que Dermatology.csv possui a coluna ["ATT34"] do tipo object, mas é possível ver que alguns valores possuem uma interrogação.

In [36]:
def replace_non_numeric_with_nan(df: pd.DataFrame, cols):
    """Return a copy of *df* with the given columns coerced to numbers.

    Values that cannot be parsed as numbers become NaN. The input frame is
    left untouched.

    Args:
        df: Source frame.
        cols: A single column name or a list of column names to convert.

    Returns:
        A new DataFrame with the requested columns converted.
    """
    # Accept a lone column name as shorthand for a one-element list
    targets = [cols] if isinstance(cols, str) else cols

    result = df.copy()
    for name in targets:
        result[name] = pd.to_numeric(result[name], errors="coerce")
    return result

# Look the row up by dataset name instead of a hard-coded index label, so this
# cell keeps targeting Dermatology.csv even if the selection of datasets
# changes. Raises IndexError if the dataset is not among the selected ones.
Dermatology = allInfo.loc[allInfo["Nome"] == "Dermatology.csv"].iloc[0]

# Turn the non-numeric entries of ATT34 (the "?" placeholders) into NaN
d_linha = replace_non_numeric_with_nan(Dermatology.Dataset, ["ATT34"])

print(d_linha.ATT34.value_counts(dropna=False))
print()
# The total is the number of rows in the column (non-null entries plus NaN)
print(f"[ATT34]: {d_linha.ATT34.isna().sum()} NaN de {len(d_linha.ATT34)}")

ATT34
40.0    17
50.0    17
27.0    16
36.0    16
22.0    15
        ..
58.0     1
49.0     1
63.0     1
68.0     1
75.0     1
Name: count, Length: 61, dtype: int64

[ATT34]: 8 NaN de 366


Os valores não numéricos já foram substituídos por NaN (8 de 366). Agora avaliamos a moda e a mediana da coluna e preenchemos os NaN com a mediana.

In [37]:
print("Moda (d_linha.ATT34.mode()): ")
print(d_linha.ATT34.mode())

print("Mediana (d_linha.ATT34.median()): ")
print(d_linha.ATT34.median())

# Fill the NaN entries of ATT34 with the column median
d_linha["ATT34"] = d_linha["ATT34"].fillna(d_linha["ATT34"].median())

print(f"[ATT34]: {d_linha.ATT34.isna().sum()} NaN de {len(d_linha.ATT34)}")
mostrar_colunas_tipo_objeto(d_linha)

# Store the cleaned dataset back into the summary table.
# `.at` writes the frame object itself into the cell; `.loc[row, col] = frame`
# would try to align the frame's columns against "Dataset" and, finding no
# such column in the frame, store NaN instead of the dataset.
allInfo.at[Dermatology.name, "Dataset"] = d_linha

Moda (d_linha.ATT34.mode()): 
0    40.0
1    50.0
Name: ATT34, dtype: float64
Mediana (d_linha.ATT34.median()): 
35.0
[ATT34]: 0 NaN de 366
Nenhuma coluna do tipo object encontrada.


## Limpando NoClass/NurseryNominalToBinary.csv

O dataset NoClass/NurseryNominalToBinary.csv possui a classe como object.

In [38]:
# Look the row up by dataset name rather than by a hard-coded index label, so
# the cell keeps targeting this dataset if the selection changes.
# Raises IndexError if the dataset is not among the selected ones.
NurseryNominalToBinary = allInfo.loc[
    allInfo["Nome"] == "NoClass/NurseryNominalToBinary.csv"
].iloc[0]
NurseryNominalToBinary.Dataset


Unnamed: 0,parents=usual,parents=pretentious,parents=great_pret,has_nurs=proper,has_nurs=less_proper,has_nurs=improper,has_nurs=critical,has_nurs=very_crit,form=complete,form=completed,...,housing=less_conv,housing=critical,finance=inconv,social=nonprob,social=slightly_prob,social=problematic,health=recommended,health=priority,health=not_recom,class
0,1,0,0,1,0,0,0,0,1,0,...,0,0,0,1,0,0,1,0,0,recommend
1,1,0,0,1,0,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,priority
2,1,0,0,1,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,not_recom
3,1,0,0,1,0,0,0,0,1,0,...,0,0,0,0,1,0,1,0,0,recommend
4,1,0,0,1,0,0,0,0,1,0,...,0,0,0,0,1,0,0,1,0,priority
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12955,0,0,1,0,0,0,0,1,0,0,...,0,1,1,0,1,0,0,1,0,spec_prior
12956,0,0,1,0,0,0,0,1,0,0,...,0,1,1,0,1,0,0,0,1,not_recom
12957,0,0,1,0,0,0,0,1,0,0,...,0,1,1,0,0,1,1,0,0,spec_prior
12958,0,0,1,0,0,0,0,1,0,0,...,0,1,1,0,0,1,0,1,0,spec_prior


Vamos remover a ultima coluna para manter apenas dados numericos.

In [40]:
# Drop the last column (the textual "class" label) so only numeric data remains
df = NurseryNominalToBinary.Dataset.drop(columns=NurseryNominalToBinary.Dataset.columns[-1])

# `.at` stores the frame object itself in the cell; `.loc[row, col] = frame`
# would align the frame's columns against "Dataset" and write NaN instead.
allInfo.at[NurseryNominalToBinary.name, "Dataset"] = df
df

Unnamed: 0,parents=usual,parents=pretentious,parents=great_pret,has_nurs=proper,has_nurs=less_proper,has_nurs=improper,has_nurs=critical,has_nurs=very_crit,form=complete,form=completed,...,housing=convenient,housing=less_conv,housing=critical,finance=inconv,social=nonprob,social=slightly_prob,social=problematic,health=recommended,health=priority,health=not_recom
0,1,0,0,1,0,0,0,0,1,0,...,1,0,0,0,1,0,0,1,0,0
1,1,0,0,1,0,0,0,0,1,0,...,1,0,0,0,1,0,0,0,1,0
2,1,0,0,1,0,0,0,0,1,0,...,1,0,0,0,1,0,0,0,0,1
3,1,0,0,1,0,0,0,0,1,0,...,1,0,0,0,0,1,0,1,0,0
4,1,0,0,1,0,0,0,0,1,0,...,1,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12955,0,0,1,0,0,0,0,1,0,0,...,0,0,1,1,0,1,0,0,1,0
12956,0,0,1,0,0,0,0,1,0,0,...,0,0,1,1,0,1,0,0,0,1
12957,0,0,1,0,0,0,0,1,0,0,...,0,0,1,1,0,0,1,1,0,0
12958,0,0,1,0,0,0,0,1,0,0,...,0,0,1,1,0,0,1,0,1,0


## Limpando Credit.csv



In [41]:
# Look the row up by dataset name rather than by a hard-coded index label, so
# the cell keeps targeting Credit.csv if the selection changes.
# Raises IndexError if the dataset is not among the selected ones.
Credit = allInfo.loc[allInfo["Nome"] == "Credit.csv"].iloc[0]
Credit.Dataset.info()
Credit.Dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   checking_status         1000 non-null   object
 1   duration                1000 non-null   int64 
 2   credit_history          1000 non-null   object
 3   purpose                 1000 non-null   object
 4   credit_amount           1000 non-null   int64 
 5   savings_status          1000 non-null   object
 6   employment              1000 non-null   object
 7   installment_commitment  1000 non-null   int64 
 8   personal_status         1000 non-null   object
 9   other_parties           1000 non-null   object
 10  residence_since         1000 non-null   int64 
 11  property_magnitude      1000 non-null   object
 12  age                     1000 non-null   int64 
 13  other_payment_plans     1000 non-null   object
 14  housing                 1000 non-null   object
 15  exist

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6,'critical/other existing credit',radio/tv,1169,'no known savings',>=7,4,'male single',none,...,'real estate',67,none,own,2,skilled,1,yes,yes,good
1,0<=X<200,48,'existing paid',radio/tv,5951,<100,1<=X<4,2,'female div/dep/mar',none,...,'real estate',22,none,own,1,skilled,1,none,yes,bad
2,'no checking',12,'critical/other existing credit',education,2096,<100,4<=X<7,2,'male single',none,...,'real estate',49,none,own,1,'unskilled resident',2,none,yes,good
3,<0,42,'existing paid',furniture/equipment,7882,<100,4<=X<7,2,'male single',guarantor,...,'life insurance',45,none,'for free',1,skilled,2,none,yes,good
4,<0,24,'delayed previously','new car',4870,<100,1<=X<4,3,'male single',none,...,'no known property',53,none,'for free',2,skilled,2,none,yes,bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,'no checking',12,'existing paid',furniture/equipment,1736,<100,4<=X<7,3,'female div/dep/mar',none,...,'real estate',31,none,own,1,'unskilled resident',1,none,yes,good
996,<0,30,'existing paid','used car',3857,<100,1<=X<4,4,'male div/sep',none,...,'life insurance',40,none,own,1,'high qualif/self emp/mgmt',1,yes,yes,good
997,'no checking',12,'existing paid',radio/tv,804,<100,>=7,4,'male single',none,...,car,38,none,own,1,skilled,1,none,yes,good
998,<0,45,'existing paid',radio/tv,1845,<100,1<=X<4,4,'male single',none,...,'no known property',23,none,'for free',1,skilled,1,yes,yes,bad


# Calculando Pearson

# Calculando PCA

In [None]:
### Standardize the features
# NOTE(review): X is not assigned anywhere above this cell in the notebook as
# shown (it is first set in the Madelon cells further down) — confirm which
# dataset is meant to be analysed here.
from sklearn.preprocessing import StandardScaler

X_std = StandardScaler().fit_transform(X)

# Show the scaled values followed by their shape
print(X_std, X_std.shape, sep="\n")

In [None]:
### PCA with a fixed number of components
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

## Request 4 components
pca = PCA(n_components=4)

## Fit on the standardized data and project it
pca_result = pca.fit_transform(X_std)

### Show the projected data followed by its shape
print(pca_result, pca_result.shape, sep="\n")

In [None]:
## A fractional n_components asks for as many components as needed to reach
## that share of explained variance (75% here); the output is also whitened
pca = PCA(n_components=0.75, whiten=True)

## Fit on the standardized data and project it
pca_result = pca.fit_transform(X_std)

### Show the projected data followed by its shape
print(pca_result, pca_result.shape, sep="\n")

In [None]:
## A fractional n_components asks for as many components as needed to reach
## that share of explained variance (90% here); the output is also whitened
pca = PCA(n_components=0.90, whiten=True)

## Fit on the standardized data and project it
pca_result = pca.fit_transform(X_std)

### Show the projected data followed by its shape
print(pca_result, pca_result.shape, sep="\n")

In [None]:
### Name the PCA columns pca_1 ... pca_k, one per column of pca_result
atributos = [f"pca_{i}" for i in range(1, pca_result.shape[1] + 1)]

pca_dataset = pd.DataFrame(data=pca_result, columns=atributos)

In [None]:
## Join the class attribute (y) to the PCA columns
final_data = pca_dataset.join(y)

# Preview of the first rows
# NOTE(review): this is not the last expression of the cell, so a notebook
# will not render it — confirm whether it should be displayed.
final_data.head()

# Save the transformed data to Semeion_PCA-90.csv, without the index column
# NOTE(review): the previous comment here mentioned Pessoa.csv, which does not
# match the file name written below.
df = pd.DataFrame(final_data)
df.to_csv('Semeion_PCA-90.csv', index=False)

***Nova Análise***

In [None]:
### Load the libraries
import pandas as pd

### Load the dataset
# NOTE(review): this reads 'Madelon.csv' from the working directory, whereas
# the first cell reads the datasets from base_url — confirm the file is
# available locally.
dataset = pd.read_csv('Madelon.csv',encoding='utf-8')

### Show the first rows of the dataset
dataset.head()

In [None]:
# Split the frame: every column but the last is a feature, the last is the target
y = dataset.iloc[:, -1]                 # Target
X = dataset_X = dataset.iloc[:, :-1]    # Features

print(X)

In [None]:
### Standardize the features
from sklearn.preprocessing import StandardScaler

X_std = StandardScaler().fit_transform(X)

# Show the scaled values followed by their shape
print(X_std, X_std.shape, sep="\n")

In [None]:
### PCA with a fixed number of components
from sklearn.decomposition import PCA

## Request 10 components
pca = PCA(n_components=10)

## Fit on the standardized data and project it
pca_result = pca.fit_transform(X_std)

### Show the projected data followed by its shape
print(pca_result, pca_result.shape, sep="\n")

In [None]:
## A fractional n_components asks for as many components as needed to reach
## that share of explained variance (90% here); the output is also whitened
pca = PCA(n_components=0.90, whiten=True)

## Fit on the standardized data and project it
pca_result = pca.fit_transform(X_std)

### Show the projected data followed by its shape
print(pca_result, pca_result.shape, sep="\n")