# Machine Learning - Arboles de Decisión

## Introducción a la Inteligencia Artificial

**Por:** Jhon Carlos Valencia, Matteo Zuluaga, Kevin Trujillo

**Para:** Oscar Bedoya

In [100]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score

In [101]:
data = pd.read_csv(r"C:\Users\carlo\OneDrive\Documentos\Univalle\2022-2\Inteligencia Artificial\Entregas\Informe\Entrega\titanic.csv", sep = ';')
data.head(n=11)

Unnamed: 0,survived,Name,pclass,sex,age,sibsp,parch,fare,who,embark_town,alone
0,0,"Braund, Mr. Owen Harris",3,male,22.0,1,0,7.25,man,Southampton,no
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,71.2833,woman,Cherbourg,no
2,1,"Heikkinen, Miss. Laina",3,female,26.0,0,0,7.925,woman,Southampton,yes
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,53.1,woman,Southampton,no
4,0,"Allen, Mr. William Henry",3,male,35.0,0,0,8.05,man,Southampton,yes
5,0,"Moran, Mr. James",3,male,,0,0,8.4583,man,Queenstown,yes
6,0,"McCarthy, Mr. Timothy J",1,male,54.0,0,0,51.8625,man,Southampton,yes
7,0,"Palsson, Master. Gosta Leonard",3,male,2.0,3,1,21.075,child,Southampton,no
8,1,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",3,female,27.0,0,2,11.1333,woman,Southampton,no
9,1,"Nasser, Mrs. Nicholas (Adele Achem)",2,female,14.0,1,0,30.0708,child,Cherbourg,no


In [102]:
data.drop(["Name"], axis = 1, inplace = True)

In [103]:
## Normalizar datos
d = {'female': 0, 'male': 1}
data['sex'] = data['sex'].map(d)

d = {'woman': 0, 'child': 1, 'man': 2}
data['who'] = data['who'].map(d)

d = {'Cherbourg': 0, 'Queenstown': 1, 'Southampton': 2}
data['embark_town'] = data['embark_town'].map(d)

d = {'no': 0, 'yes': 1}
data['alone'] = data['alone'].map(d)

# Después de tener los valores vacíos como NaN, se normalizan a cero
data.fillna(0)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,who,embark_town,alone
0,0,3,1,22.0,1,0,7.2500,2,2.0,0
1,1,1,0,38.0,1,0,71.2833,0,0.0,0
2,1,3,0,26.0,0,0,7.9250,0,2.0,1
3,1,1,0,35.0,1,0,53.1000,0,2.0,0
4,0,3,1,35.0,0,0,8.0500,2,2.0,1
...,...,...,...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000,2,2.0,1
887,1,1,0,19.0,0,0,30.0000,0,2.0,1
888,0,3,0,0.0,1,2,23.4500,0,2.0,0
889,1,1,1,26.0,0,0,30.0000,2,0.0,1


In [104]:
## Setear criterios
feature_cols = ['pclass', 'age', 'sibsp', 'parch', 'who', 'embark_town', 'alone']
X = data[feature_cols]
Y = data["survived"]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 1)
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

## Normalizar criterios
X_train = X_train.fillna(X_train.mean())
Y_train = Y_train.fillna(Y_train.mean())
##data = data[~data.isin([np.nan, np.inf, -np.inf]).any(1)]

(712, 7) (179, 7) (712,) (179,)


In [105]:
## Métricas
def tree_statistics(tree):
    print('Accuracy del clasificador: {0:.2f}'.format(accuracy_score(Y_test, tree.predict(X_test))))
    print('Matriz de confusión del clasificador: \n {0}'.format(confusion_matrix(Y_test, tree.predict(X_test))))
    print('Precision del clasificador: {0:.2f}'.format(precision_score(Y_test, tree.predict(X_test))))
    print('Recall del clasificador: {0:.2f}'.format(recall_score(Y_test, tree.predict(X_test))))
    print('F1 del clasificador: {0:.2f}'.format(f1_score(Y_test, tree.predict(X_test))))

In [106]:
## Arbol de decisión 1
data_tree1 = DecisionTreeClassifier(criterion = 'gini', splitter = 'best', random_state = 123, max_depth = 10)
data_tree1.fit(X_train, Y_train)

## Arbol de decisión 2
data_tree2 = DecisionTreeClassifier(criterion = 'gini', splitter = 'best', random_state = 123, max_depth = 20)
data_tree2.fit(X_train, Y_train)

## Arbol de decisión 3
data_tree3 = DecisionTreeClassifier(criterion = 'gini', splitter = 'best', random_state = 123, max_depth = 30)
data_tree3.fit(X_train, Y_train)

## Arbol de decisión 4
data_tree4 = DecisionTreeClassifier(criterion = 'gini', splitter = 'best', random_state = 123, max_depth = 40)
data_tree4.fit(X_train, Y_train)

## Arbol de decisión 5
data_tree5 = DecisionTreeClassifier(criterion = 'gini', splitter = 'best', random_state = 123, max_depth = 50)
data_tree5.fit(X_train, Y_train)

## Arbol de decisión 6
data_tree6 = DecisionTreeClassifier(criterion = 'gini', splitter = 'best', random_state = 123, max_depth = 60)
data_tree6.fit(X_train, Y_train)

## Arbol de decisión 7
data_tree7 = DecisionTreeClassifier(criterion = 'gini', splitter = 'best', random_state = 123, max_depth = 70)
data_tree7.fit(X_train, Y_train)

## Arbol de decisión 8
data_tree8 = DecisionTreeClassifier(criterion = 'gini', splitter = 'best', random_state = 123, max_depth = 80)
data_tree8.fit(X_train, Y_train)

## Arbol de decisión 9
data_tree9 = DecisionTreeClassifier(criterion = 'gini', splitter = 'best', random_state = 123, max_depth = 90)
data_tree9.fit(X_train, Y_train)

## Arbol de decisión 10
data_tree10 = DecisionTreeClassifier(criterion = 'gini', splitter = 'best', random_state = 123, max_depth = 100)
data_tree10.fit(X_train, Y_train)

DecisionTreeClassifier(max_depth=100, random_state=123)

In [108]:
## Arbol de decisión 2
tree_statistics(data_tree10)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [92]:
dtc_trees = [data_tree1, data_tree2, data_tree3, data_tree4, data_tree5, data_tree6, data_tree7, data_tree8, data_tree9, data_tree10]

# Generar métricas
i = 0
dtc_statistics = []
dtc_statistics.append([''.format(accuracy_score(Y_test, tree.predict(X_test)))])
"""
for tree in dtc_trees:
    print(i)
    dtc_statistics.append([
        ''.format(accuracy_score(Y_test, tree.predict(X_test))),
        ''.format(confusion_matrix(Y_test, tree.predict(X_test))),
        ''.format(precision_score(Y_test, tree.predict(X_test))),
        ''.format(recall_score(Y_test, tree.predict(X_test))),
        ''.format(f1_score(Y_test, tree.predict(X_test)))
    ])
    print(dtc_statistics[i])
    i = i + 1
"""   


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').