## Los supervivientes del Titanic con Python

In [6]:
import seaborn as sns
import numpy as np
import pandas as pd

In [3]:
# Load the "titanic" example dataset through seaborn into a DataFrame.
# NOTE(review): load_dataset presumably needs network access (or a local cache) — confirm.
titanic = sns.load_dataset("titanic")

In [4]:
# Preview the first five rows to check the column names and value formats.
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [5]:
# Size of the frame as (rows, columns).
titanic.shape

(891, 15)

In [9]:
# Turn the grouping variables into pandas categoricals ("factors").
# pd.Categorical accepts any values, not only numeric ones: "sex" holds strings.
for col in ("survived", "pclass", "sex", "deck"):
    titanic[col] = pd.Categorical(titanic[col])

# Check the result: the dtype is now "category".
titanic["survived"].head()

0    0
1    1
2    1
3    1
4    0
Name: survived, dtype: category
Categories (2, int64): [0, 1]

## Tablas de frecuencias de una variable

In [23]:
# One-way frequency table: crosstab counts the rows for each value of the index
# variable; the constant columns="count" only names the single count column.
# Here it counts the passengers per value of "survived" (0 or 1).
sobrevivientes = pd.crosstab(
    index=titanic["survived"],
    columns="count",
)
sobrevivientes

col_0,count
survived,Unnamed: 1_level_1
0,549
1,342


In [14]:
type(sobrevivientes)  # shows that crosstab returned a pandas DataFrame

pandas.core.frame.DataFrame

In [24]:
# Frequency table: number of passengers in each passenger class (pclass).
pd.crosstab(
    index=titanic["pclass"],
    columns="count",
)


col_0,count
pclass,Unnamed: 1_level_1
1,216
2,184
3,491


In [25]:
# Frequency table: number of passengers of each sex.
pd.crosstab(
    index=titanic["sex"],
    columns="count",
)

col_0,count
sex,Unnamed: 1_level_1
female,314
male,577


In [26]:
# Frequency table of the "deck" variable, kept in tab_c for the cells below.
tab_c = pd.crosstab(
    index=titanic["deck"],
    columns="count",
)
tab_c

col_0,count
deck,Unnamed: 1_level_1
A,15
B,47
C,59
D,33
E,32
F,13
G,4


In [27]:
# Sum of the deck counts. If it is smaller than the number of rows in the
# dataset, the difference is the rows whose "deck" is missing (NaN), which the
# frequency table does not count.
tab_c.sum()

col_0
count    203
dtype: int64

In [28]:
# (rows, columns) of the frequency table: one row per deck level, one count column.
tab_c.shape

(7, 1)

In [29]:
tab_c.iloc[1:5]  # positional ROW slice: positions 1 through 4 (the end position 5 is excluded)

col_0,count
deck,Unnamed: 1_level_1
B,47
C,59
D,33
E,32


In [30]:
# Relative frequencies: each count divided by the table total. The total only
# includes rows present in tab_c, so missing decks are not part of the 100%.
tab_c/tab_c.sum()

col_0,count
deck,Unnamed: 1_level_1
A,0.073892
B,0.231527
C,0.29064
D,0.162562
E,0.157635
F,0.064039
G,0.019704


In [34]:
# Percentage of women and men among the passengers:
# counts divided by their total, then scaled to 0-100.
frecuencia_generos = pd.crosstab(index = titanic['sex'],columns = "count")
frecuencia_generos/frecuencia_generos.sum()*100

col_0,count
sex,Unnamed: 1_level_1
female,35.241302
male,64.758698


In [35]:
# Percentage of passengers for each value of "survived":
# counts divided by their total, then scaled to 0-100.
frecuencia_survived = pd.crosstab(index = titanic['survived'],columns = "count")
frecuencia_survived/frecuencia_survived.sum()*100

col_0,count
survived,Unnamed: 1_level_1
0,61.616162
1,38.383838


## Tablas de contingencia de dos variables (tablas bidimensionales)

In [45]:
# Two-way contingency table: survival outcome (rows) by sex (columns).
survived_sex = pd.crosstab(
    index=titanic["survived"],
    columns=titanic["sex"],
)
# Assigning to .index replaces the 0/1 row labels with readable names.
survived_sex.index = ["died", "survived"]
survived_sex

sex,female,male
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,81,468
1,233,109


In [52]:
# Two-way contingency table: survival outcome (rows) by passenger class (columns).
survived_class = pd.crosstab(
    index=titanic["survived"],
    columns=titanic["pclass"],
    margins=True,  # also append the row totals and the column totals
)

# Rename the row labels and the column labels; the last entry of each list
# names the totals added by margins=True.
survived_class.index = ["murio", "sobrevivio", "total_por_clase"]
survived_class.columns = ["primera", "segunda", "tercera", "total_por_superv"]
survived_class

Unnamed: 0,primera,segunda,tercera,total_por_superv
murio,80,97,372,549
sobrevivio,136,87,119,342
total_por_clase,216,184,491,891


### Frecuencias relativas globales

In [60]:
# Global relative frequencies: every cell of survived_class divided by the grand total.
# .loc[row_label, column_label] picks a single cell by name; here it is the cell
# at row "total_por_clase", column "total_por_superv" (the bottom-right corner),
# i.e. the total number of passengers.
survived_class/survived_class.loc["total_por_clase", "total_por_superv"]

Unnamed: 0,primera,segunda,tercera,total_por_superv
murio,0.089787,0.108866,0.417508,0.616162
sobrevivio,0.152637,0.097643,0.133558,0.383838
total_por_clase,0.242424,0.20651,0.551066,1.0


### Frecuencias relativas marginales

In [68]:
# Marginal relative frequencies by class: each value is divided by the total of
# its own column (the "total_por_clase" row), so every column adds up to 1.
# This differs from a global relative frequency, which divides by the total
# number of passengers.
survived_class/survived_class.loc["total_por_clase"]

Unnamed: 0,primera,segunda,tercera,total_por_superv
murio,0.37037,0.527174,0.757637,0.616162
sobrevivio,0.62963,0.472826,0.242363,0.383838
total_por_clase,1.0,1.0,1.0,1.0


In [69]:
# Same computation written with DataFrame.div: axis = 1 matches the divisor's
# labels against the column labels of the table.
survived_class.div(survived_class.loc["total_por_clase"], axis = 1)

Unnamed: 0,primera,segunda,tercera,total_por_superv
murio,0.37037,0.527174,0.757637,0.616162
sobrevivio,0.62963,0.472826,0.242363,0.383838
total_por_clase,1.0,1.0,1.0,1.0


In [70]:
# Demonstration of two divisions that do NOT give the marginal frequencies by
# survival outcome.

# 1) .loc with a single label looks up a ROW. "total_por_superv" is a column
#    label, so the lookup raises KeyError. The error is caught and printed so
#    the notebook still runs from top to bottom.
try:
    survived_class / survived_class.loc["total_por_superv"]
except KeyError as err:
    print(f"KeyError: {err}")

# 2) Dividing by the column itself does not raise. `/` matches the index of the
#    Series (the row labels) against the COLUMN labels of the table; none of
#    them coincide, so the result is a table filled with NaN.
survived_class / survived_class["total_por_superv"]

In [73]:
# Working form: transpose first. After .T the original row labels become the
# column labels, so they line up with the index of the "total_por_superv"
# column and a plain `/` works without .loc. Each column of the (transposed)
# result adds up to 1.
survived_class.T/survived_class["total_por_superv"]

Unnamed: 0,murio,sobrevivio,total_por_clase
primera,0.145719,0.397661,0.242424
segunda,0.176685,0.254386,0.20651
tercera,0.677596,0.347953,0.551066
total_por_superv,1.0,1.0,1.0


In [74]:
# Alternative without transposing: DataFrame.div with axis = 0 matches the
# divisor's labels against the row labels, dividing each row by its own total.
survived_class.div(survived_class["total_por_superv"], axis = 0)

Unnamed: 0,primera,segunda,tercera,total_por_superv
murio,0.145719,0.176685,0.677596,1.0
sobrevivio,0.397661,0.254386,0.347953,1.0
total_por_clase,0.242424,0.20651,0.551066,1.0


## Tablas multidimensionales

In [80]:
# Built like the two-way tables, but `columns` receives a list of variables,
# which produces two-level column labels: sex on top, passenger class below.
surv_sex_class = pd.crosstab(
    index=titanic["survived"],
    columns=[titanic["sex"], titanic["pclass"]],
    margins=True,  # adds the "All" totals row and column
)

surv_sex_class

sex,female,female,female,male,male,male,All
pclass,1,2,3,1,2,3,Unnamed: 7_level_1
survived,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
0,3,6,72,77,91,300,549
1,91,70,72,45,17,47,342
All,94,76,144,122,108,347,891


In [76]:
surv_sex_class["female"] #Selecting the top-level label "female" returns the women's sub-table: class by survival

pclass,1,2,3
survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3,6,72
1,91,70,72
All,94,76,144


In [87]:
surv_sex_class["female"][1] #Women in passenger class 1 only, broken down by survival

survived
0       3
1      91
All    94
Name: 1, dtype: int64

### Frecuencias relativas marginales

In [88]:
# Marginal relative frequencies of survival: each column (one sex/class
# combination, plus the overall "All" column) is divided by its own total in the
# "All" row, so every column adds up to 1.
surv_sex_class/surv_sex_class.loc["All"]

sex,female,female,female,male,male,male,All
pclass,1,2,3,1,2,3,Unnamed: 7_level_1
survived,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
0,0.031915,0.078947,0.5,0.631148,0.842593,0.864553,0.616162
1,0.968085,0.921053,0.5,0.368852,0.157407,0.135447,0.383838
All,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [90]:
# Same counts with the column levels swapped: passenger class on top, sex below.
# NOTE: this rebinds surv_sex_class to a table with a different column layout,
# so code that selects the top-level label "female" must not be re-run after it.
surv_sex_class = pd.crosstab(
    index=titanic["survived"],
    columns=[titanic["pclass"], titanic["sex"]],
    margins=True,
)

# Marginal relative frequencies of survival: each class/sex column is divided by
# its own "All" total, so every column adds up to 1.
surv_sex_class / surv_sex_class.loc["All"]

pclass,1,1,2,2,3,3,All
sex,female,male,female,male,female,male,Unnamed: 7_level_1
survived,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
0,0.031915,0.631148,0.078947,0.842593,0.5,0.864553,0.616162
1,0.968085,0.368852,0.921053,0.157407,0.5,0.135447,0.383838
All,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [93]:
# Passenger class in the rows; sex and survival outcome as two-level columns.
# NOTE: this rebinds surv_sex_class once more, to yet another layout.
surv_sex_class = pd.crosstab(
    index=titanic["pclass"],
    columns=[titanic["sex"], titanic["survived"]],
    margins=True,
)

# Marginal relative frequencies of class: each sex/survival column is divided by
# its own "All" total, giving the class distribution within that group.
surv_sex_class / surv_sex_class.loc["All"]

sex,female,female,male,male,All
survived,0,1,0,1,Unnamed: 5_level_1
pclass,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1,0.037037,0.390558,0.16453,0.412844,0.242424
2,0.074074,0.300429,0.194444,0.155963,0.20651
3,0.888889,0.309013,0.641026,0.431193,0.551066
All,1.0,1.0,1.0,1.0,1.0
