# Titanic Dataset

https://www.kaggle.com/c/titanic/data

In [10]:
import pandas as pd

# Passenger-level CSV served from the Stanford CS109 course archive.
url_titanic = "https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv"
titanic_data = pd.read_csv(url_titanic)

# Explanatory variables: every column except the response ("Survived") and
# the "Name" column, with "Sex" recoded as male -> 0, female -> 1.
# Values outside this mapping become NaN under Series.map.
sex_codes = {'male': 0, 'female': 1}
titanic_features = (
    titanic_data
    .drop(columns=["Survived", "Name"])
    .assign(Sex=lambda frame: frame['Sex'].map(sex_codes))
)

# Response variable.
titanic_target = titanic_data["Survived"]

# Preview the first rows of both pieces.
print("Titanic Dataset Features:", titanic_features.head(), sep="\n")
print("\nTitanic Dataset Target:", titanic_target.head(), sep="\n")

# Generic aliases for the feature matrix and the label vector.
X, y = titanic_features, titanic_target

# Show the distinct label values present in the target.
print(set(y))

Titanic Dataset Features:
   Pclass  Sex   Age  Siblings/Spouses Aboard  Parents/Children Aboard  \
0       3    0  22.0                        1                        0   
1       1    1  38.0                        1                        0   
2       3    1  26.0                        0                        0   
3       1    1  35.0                        1                        0   
4       3    0  35.0                        0                        0   

      Fare  
0   7.2500  
1  71.2833  
2   7.9250  
3  53.1000  
4   8.0500  

Titanic Dataset Target:
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64
{0, 1}


# Breast Cancer Dataset
https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic

In [11]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# WDBC data file from the UCI repository. It is read with header=None, so the
# column names are supplied here: an id, the diagnosis label, and 30 numbered
# feature columns (feature_1 .. feature_30).
breast_cancer_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
breast_cancer_columns = ["id", "diagnosis", *(f"feature_{idx}" for idx in range(1, 31))]

breast_cancer_data = pd.read_csv(
    breast_cancer_url,
    names=breast_cancer_columns,
    header=None,
)

# Explanatory variables: everything except the label and the record id.
breast_cancer_features = breast_cancer_data.drop(columns=["diagnosis", "id"])

# Response variable (still in its raw, un-encoded form here).
breast_cancer_target = breast_cancer_data["diagnosis"]

# Preview the first rows of both pieces.
print("Breast Cancer Dataset Features:", breast_cancer_features.head(), sep="\n")
print("\nBreast Cancer Dataset Target:", breast_cancer_target.head(), sep="\n")

# Generic aliases; the label column is converted to integer codes.
# NOTE(review): the code-to-label mapping is available as le.classes_ —
# check it before interpreting which class is 0 and which is 1.
X = breast_cancer_features
le = LabelEncoder()
y = le.fit_transform(breast_cancer_target)

# Show the distinct encoded label values.
print(set(y))

Breast Cancer Dataset Features:
   feature_1  feature_2  feature_3  feature_4  feature_5  feature_6  \
0      17.99      10.38     122.80     1001.0    0.11840    0.27760   
1      20.57      17.77     132.90     1326.0    0.08474    0.07864   
2      19.69      21.25     130.00     1203.0    0.10960    0.15990   
3      11.42      20.38      77.58      386.1    0.14250    0.28390   
4      20.29      14.34     135.10     1297.0    0.10030    0.13280   

   feature_7  feature_8  feature_9  feature_10  ...  feature_21  feature_22  \
0     0.3001    0.14710     0.2419     0.07871  ...       25.38       17.33   
1     0.0869    0.07017     0.1812     0.05667  ...       24.99       23.41   
2     0.1974    0.12790     0.2069     0.05999  ...       23.57       25.53   
3     0.2414    0.10520     0.2597     0.09744  ...       14.91       26.50   
4     0.1980    0.10430     0.1809     0.05883  ...       22.54       16.67   

   feature_23  feature_24  feature_25  feature_26  feature_27  fea

# Heart Disease Dataset

https://archive.ics.uci.edu/dataset/45/heart+disease

In [12]:
import pandas as pd

# Heart Disease Dataset (the "processed.cleveland" file from the UCI repository)
heart_disease_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
heart_disease_columns = [
    "Age", "Sex", "CP", "Trestbps", "Chol", "Fbs", "Restecg",
    "Thalach", "Exang", "Oldpeak", "Slope", "Ca", "Thal", "Num",
]

# Read with header=None and explicit column names; "?" entries are parsed as NaN.
# NOTE(review): those NaNs are left in the feature frame. If the file contains
# any "?", estimators that reject missing values will need imputation or
# row-dropping downstream — confirm with heart_disease_features.isna().sum().
heart_disease_data = pd.read_csv(heart_disease_url, header=None, names=heart_disease_columns, na_values="?")

# Explanatory variables: every column except the response.
heart_disease_features = heart_disease_data.drop(columns="Num")

# Response variable, binarised: 1 where Num > 0, otherwise 0.
# A vectorized comparison replaces the per-row Python lambda; the resulting
# values, int64 dtype and series name ("Num") are the same.
heart_disease_target = heart_disease_data["Num"].gt(0).astype("int64")

# Preview the first rows of both pieces.
print("Heart Disease Dataset Features:")
print(heart_disease_features.head())
print("\nHeart Disease Dataset Target:")
print(heart_disease_target.head())

# Generic aliases for the feature matrix and the label vector.
X = heart_disease_features
y = heart_disease_target

# Show the distinct label values present in the target.
print(set(y))

Heart Disease Dataset Features:
    Age  Sex   CP  Trestbps   Chol  Fbs  Restecg  Thalach  Exang  Oldpeak  \
0  63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1  67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2  67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3  37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4  41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4   

   Slope   Ca  Thal  
0    3.0  0.0   6.0  
1    2.0  3.0   3.0  
2    2.0  2.0   7.0  
3    3.0  0.0   3.0  
4    1.0  0.0   3.0  

Heart Disease Dataset Target:
0    0
1    1
2    1
3    0
4    0
Name: Num, dtype: int64
{0, 1}


# Diabetes Dataset

https://www.kaggle.com/datasets/mathchi/diabetes-data-set

In [13]:
import pandas as pd

# Diabetes Dataset (Pima Indians Diabetes Database)
# The CSV is read with header=None, so column names are supplied explicitly.
diabetes_url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
diabetes_columns = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
    "Outcome",
]

diabetes_data = pd.read_csv(diabetes_url, names=diabetes_columns, header=None)

# Explanatory variables: every column except the response.
diabetes_features = diabetes_data.drop(columns="Outcome")

# Response variable.
diabetes_target = diabetes_data["Outcome"]

# Preview the first rows of both pieces.
print("Diabetes Dataset Features:", diabetes_features.head(), sep="\n")
print("\nDiabetes Dataset Target:", diabetes_target.head(), sep="\n")

# Generic aliases for the feature matrix and the label vector.
X, y = diabetes_features, diabetes_target

# Show the distinct label values present in the target.
print(set(y))


Diabetes Dataset Features:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  
0                     0.627   50  
1                     0.351   31  
2                     0.672   32  
3                     0.167   21  
4                     2.288   33  

Diabetes Dataset Target:
0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64
{0, 1}


# Wine Dataset

https://scikit-learn.org/stable/datasets/toy_dataset.html#wine-dataset

In [14]:
from sklearn.datasets import load_wine
import pandas as pd

# Dataset reference:
# https://scikit-learn.org/stable/datasets/toy_dataset.html#wine-dataset

# Load the Wine dataset through scikit-learn's loader.
wine = load_wine()

# Build a DataFrame from the feature matrix and append the class label
# as a trailing "target" column.
df = pd.DataFrame(wine.data, columns=wine.feature_names).assign(target=wine.target)

# Split into the response (y) and the explanatory variables (X).
y = df['target']
X = df.drop(columns='target')

# Preview the first rows of the combined DataFrame.
print(df.head())


   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0    14.23        1.71  2.43               15.6      127.0           2.80   
1    13.20        1.78  2.14               11.2      100.0           2.65   
2    13.16        2.36  2.67               18.6      101.0           2.80   
3    14.37        1.95  2.50               16.8      113.0           3.85   
4    13.24        2.59  2.87               21.0      118.0           2.80   

   flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
0        3.06                  0.28             2.29             5.64  1.04   
1        2.76                  0.26             1.28             4.38  1.05   
2        3.24                  0.30             2.81             5.68  1.03   
3        3.49                  0.24             2.18             7.80  0.86   
4        2.69                  0.39             1.82             4.32  1.04   

   od280/od315_of_diluted_wines  proline  target  
0          