In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer

# Carregar os dados
data = pd.read_csv('World Wildlife Species.csv')

print(data.head())

   Unnamed: 0               Scientifc Name               Common Name  \
0           0           Loxodonta africana          African Elephant   
1           1                          NaN   African forest elephant   
2           2  Loxodonta africana africana  African savanna elephant   
3           3                Lycaon pictus          African Wild Dog   
4           4             Thunnus alalunga             Albacore Tuna   

     Conservation Status  
0                    NaN  
1  Critically Endangered  
2             Endangered  
3             Endangered  
4        Near Threatened  


In [10]:
# Verificar os nomes das colunas
print(data.columns)

# Corrigir a digitação e remover espaços extras nos nomes das colunas
data.columns = data.columns.str.strip()  # Remove espaços extras
data.rename(columns={'Scientifc Name': 'Scientific Name'}, inplace=True)

# Verificar novamente os nomes das colunas
print(data.columns)


Index(['Unnamed: 0', 'Scientifc Name', 'Common Name', 'Conservation Status'], dtype='object')
Index(['Unnamed: 0', 'Scientific Name', 'Common Name', 'Conservation Status'], dtype='object')


In [11]:
# Verificar valores nulos nas colunas
print(data.isnull().sum())

# Mostrar as primeiras linhas para entender os dados
data.head()


Unnamed: 0              0
Scientific Name        10
Common Name             0
Conservation Status    15
dtype: int64


Unnamed: 0.1,Unnamed: 0,Scientific Name,Common Name,Conservation Status
0,0,Loxodonta africana,African Elephant,
1,1,,African forest elephant,Critically Endangered
2,2,Loxodonta africana africana,African savanna elephant,Endangered
3,3,Lycaon pictus,African Wild Dog,Endangered
4,4,Thunnus alalunga,Albacore Tuna,Near Threatened


In [12]:
# Preencher valores nulos na coluna 'Scientific Name' com 'Unknown'
data['Scientific Name'].fillna('Unknown', inplace=True)

# Verificar se ainda há valores nulos
print(data['Scientific Name'].isnull().sum())


0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Scientific Name'].fillna('Unknown', inplace=True)


In [13]:
# Remover as linhas com valores nulos na coluna 'Scientific Name'
data.dropna(subset=['Scientific Name'], inplace=True)

# Verificar se ainda há valores nulos
print(data['Scientific Name'].isnull().sum())


0


In [14]:
# Contar as ocorrências de cada categoria na coluna 'Conservation Status'
print(data['Conservation Status'].value_counts())


Conservation Status
Endangered               29
Critically Endangered    17
Vulnerable               16
Least Concern            12
Near Threatened           8
Name: count, dtype: int64


In [15]:
# Preencher valores nulos nas outras colunas com um valor padrão (como 'Unknown')
data.fillna({'Common Name': 'Unknown', 'Conservation Status': 'Unknown'}, inplace=True)

# Verificar se ainda há valores nulos
print(data.isnull().sum())


Unnamed: 0             0
Scientific Name        0
Common Name            0
Conservation Status    0
dtype: int64


In [16]:
from sklearn.preprocessing import LabelEncoder

# Criar um LabelEncoder
le = LabelEncoder()

# Aplicar o LabelEncoder na coluna 'Conservation Status'
data['Conservation Status'] = le.fit_transform(data['Conservation Status'])

# Verificar as alterações
data.head()


Unnamed: 0.1,Unnamed: 0,Scientific Name,Common Name,Conservation Status
0,0,Loxodonta africana,African Elephant,4
1,1,Unknown,African forest elephant,0
2,2,Loxodonta africana africana,African savanna elephant,1
3,3,Lycaon pictus,African Wild Dog,1
4,4,Thunnus alalunga,Albacore Tuna,3


In [17]:
# Usando OneHotEncoder para converter variáveis categóricas
data_encoded = pd.get_dummies(data, columns=['Conservation Status'])

# Verificar os dados codificados
data_encoded.head()


Unnamed: 0.1,Unnamed: 0,Scientific Name,Common Name,Conservation Status_0,Conservation Status_1,Conservation Status_2,Conservation Status_3,Conservation Status_4,Conservation Status_5
0,0,Loxodonta africana,African Elephant,False,False,False,False,True,False
1,1,Unknown,African forest elephant,True,False,False,False,False,False
2,2,Loxodonta africana africana,African savanna elephant,False,True,False,False,False,False
3,3,Lycaon pictus,African Wild Dog,False,True,False,False,False,False
4,4,Thunnus alalunga,Albacore Tuna,False,False,False,True,False,False


In [18]:
from sklearn.model_selection import train_test_split

# Dividir os dados em variáveis independentes (X) e dependente (y)
X = data.drop(columns=['Conservation Status'])  # Variáveis independentes
y = data['Conservation Status']  # Variável dependente

# Dividir os dados em treino (80%) e teste (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Verificar a divisão
print(X_train.shape, X_test.shape)


(77, 3) (20, 3)


In [21]:
# Excluir as colunas não numéricas (exemplo 'Scientific Name' e 'Common Name')
X = data.drop(columns=['Conservation Status', 'Scientific Name', 'Common Name'])

# Recriar a divisão treino/teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Treinar o modelo novamente
model.fit(X_train, y_train)


In [22]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Criar o modelo
model = DecisionTreeClassifier(random_state=42)

# Treinar o modelo
model.fit(X_train, y_train)

# Fazer previsões no conjunto de teste
y_pred = model.predict(X_test)

# Avaliar a acurácia
accuracy = accuracy_score(y_test, y_pred)
print(f'Acurácia do modelo: {accuracy:.4f}')



Acurácia do modelo: 0.3500
