In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's default styling to all subsequent matplotlib plots.
sns.set()

In [None]:
# Load the Titanic train and test CSV files from the sibling db/ folder.
train_df = pd.read_csv("../db/titanic-train.csv")
test_df = pd.read_csv("../db/titanic-test.csv")

# Preview the first five rows of the training frame.
train_df.head()

In [None]:
# Summarize the training frame: columns, dtypes and non-null counts.
train_df.info()

In [None]:
# Bar chart of how many rows there are per value of the "Sex" column.
sex_counts = train_df["Sex"].value_counts()
ax = sex_counts.plot.bar(color=['b', 'r'])
ax.set_title('Distribucion de Personas por generos')
plt.show()

In [None]:
# Decision-tree preparation: label-encode the "Sex" column.
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()

# fit_transform returns an array of integer codes; train_df itself is left
# untouched, so later cells still see the original string values.
encoder_sex = label_encoder.fit_transform(train_df["Sex"])

# Show the original values next to their codes. Previously this cell displayed
# train_df.head(5), which is unchanged by the encoding and therefore hid the
# result of the step it was meant to illustrate.
pd.DataFrame({"Sex": train_df["Sex"], "Sex_encoded": encoder_sex}).head(5)

In [None]:
# Fill the missing values of Age and Embarked with predefined values.
# For numeric columns the median (or the mean) is the recommended choice.
age_median = train_df["Age"].median()
train_df = train_df.assign(
    Age=train_df["Age"].fillna(age_median),
    Embarked=train_df["Embarked"].fillna('S'),
)

In [None]:
# Keep only the columns that will be used for prediction by dropping the
# identifier, target and free-text columns.
train_predictors = train_df.drop(
    columns=['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin']
)

# Categorical predictors: object-dtype columns with fewer than 10 distinct values.
categorical_cols = [
    column
    for column, column_dtype in train_predictors.dtypes.items()
    if column_dtype == 'object' and train_predictors[column].nunique() < 10
]

train_predictors.columns


In [None]:
# Numerical predictors: every column stored as int64 or float64.
numerical_cols = [
    column
    for column, column_dtype in train_predictors.dtypes.items()
    if column_dtype in ('int64', 'float64')
]

In [None]:
# Combine the categorical and numerical column names (categorical first).
my_cols = [*categorical_cols, *numerical_cols]
my_cols

In [25]:
# Restrict train_predictors to the selected columns, in my_cols order.
train_predictors = train_predictors.loc[:, my_cols]
train_predictors

Unnamed: 0,Sex,Embarked,Pclass,Age,SibSp,Parch,Fare
0,male,S,3,22.0,1,0,7.2500
1,female,C,1,38.0,1,0,71.2833
2,female,S,3,26.0,0,0,7.9250
3,female,S,1,35.0,1,0,53.1000
4,male,S,3,35.0,0,0,8.0500
...,...,...,...,...,...,...,...
886,male,S,2,27.0,0,0,13.0000
887,female,S,1,19.0,0,0,30.0000
888,female,S,3,28.0,1,2,23.4500
889,male,C,1,26.0,0,0,30.0000


In [26]:
# One-hot encode the categorical columns of the predictors frame.
# dtype=int pins the indicator columns to 0/1 integers on every pandas version.
# pandas >= 2.0 defaults to bool indicators, which turns the mixed
# int/float/bool frame into an object-dtype array when `.values` is taken.
dummy_encoded_train_predictors = pd.get_dummies(train_predictors, dtype=int)
dummy_encoded_train_predictors.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.25,0,1,0,0,1
1,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,26.0,0,0,7.925,1,0,0,0,1
3,1,35.0,1,0,53.1,1,0,0,0,1
4,3,35.0,0,0,8.05,0,1,0,0,1


In [24]:
# Count how many rows fall into each distinct Pclass value.
train_df['Pclass'].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [28]:
# Split the data into its two roles:
#   x_features_one - the dummy-encoded predictors used to make the prediction
#   y_target       - the 'Survived' values we want to predict
x_features_one = dummy_encoded_train_predictors.to_numpy()
y_target = train_df['Survived'].to_numpy()

In [29]:
# Hold out 25% of the rows for validation; random_state makes the split repeatable.
split_parts = train_test_split(
    x_features_one,
    y_target,
    test_size=0.25,
    random_state=1,
)
x_train, x_validation, y_train, y_validation = split_parts

In [33]:
# Train the decision tree on the training split.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split; without a seed the fitted tree, and so the accuracy reported
# further down, can differ between runs. Re-run the evaluation cell after this
# change, since a previously saved score came from an unseeded fit.
tree_one = tree.DecisionTreeClassifier(random_state=1).fit(x_train, y_train)

In [37]:
# Check the accuracy on the validation split, rounded to four decimals
# (only accuracy is computed here; no ROC AUC score is calculated).
validation_score = tree_one.score(x_validation, y_validation)
tree_one_accuracy = round(validation_score, 4)
print('Accuracy: %0.4f' % tree_one_accuracy)

Accuracy: 0.7578


In [38]:
from io import StringIO
from IPython.display import Image, display
import pydotplus

# Write the fitted tree in Graphviz DOT format into an in-memory buffer.
# feature_names labels each split with its real column name instead of the
# default positional `X[i]`. The names come from the frame whose `.values`
# were used as the feature matrix, so their order matches the training columns.
out = StringIO()
tree.export_graphviz(
    tree_one,
    out_file=out,
    feature_names=list(dummy_encoded_train_predictors.columns),
)

# Render the DOT text to a PNG file in the current working directory.
graph = pydotplus.graph_from_dot_data(out.getvalue())
graph.write_png('titanic.png')

True