#Carga y limpieza de datos

Este notebook trabajaremos en el datset Titanic, que incluye información de pasajeros del titanic y su sobrevivencia.

## Carga de archivos

In [1]:
# Importamos las librerías
import pandas as pd
import urllib.request

In [2]:
# Definimos el link a descargar

# Dataset titanic.csv
file_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"

In [3]:
# Guardamos en ambiente de colab
!wget  https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv

--2023-07-28 01:03:36--  https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 60302 (59K) [text/plain]
Saving to: ‘titanic.csv’


2023-07-28 01:03:36 (5.13 MB/s) - ‘titanic.csv’ saved [60302/60302]



In [7]:
# Cargamos en un dataframe de pandas
df_titanic = pd.read_csv('titanic.csv')

## Exploración del dataset

In [None]:
# Presentamos el dataset
display(df_titanic.head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [9]:
df_titanic.shape

(891, 12)

In [8]:
# Describimos el dataset
df_titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [10]:
# Presentamos sus estadisticas principales
display(df_titanic.describe())

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [11]:
# Describe de las variables categoricas
df_titanic.describe(include='O')

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644


In [15]:
# Ordenar entre variables
df_titanic.sort_values(['PassengerId', 'Sex'], ascending=False)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.7500,,Q
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


## Operaciones de filtro y selección

Por defecto una columna sola es almacenada como 'series', para que sea almacenado como un data frame usamos la función to_frame()

In [26]:
type(df_titanic[['Survived','Sex']])

pandas.core.frame.DataFrame

In [27]:
# Filtrar columnas
# Seleccionar columnas de interés
cols_interes = ['Survived', 'Sex']
df_sobrevivencia = df_titanic[cols_interes] #['Survived'].to_frame()
display(df_sobrevivencia.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Survived  891 non-null    int64 
 1   Sex       891 non-null    object
dtypes: int64(1), object(1)
memory usage: 14.0+ KB


None

In [28]:
#Calculamos estadisticas
Pasajeros = df_sobrevivencia['Survived'].count()
Sobrevivientes = df_sobrevivencia['Survived'].sum()
TasaSobrevivencia = df_sobrevivencia['Survived'].mean()

print ("Pasajeros: {},\nSobrevivientes: {}\nTasa de sobrevivencia: {}".format(Pasajeros,Sobrevivientes,TasaSobrevivencia))

Pasajeros: 891,
Sobrevivientes: 342
Tasa de sobrevivencia: 0.3838383838383838


### Operar sobre columnas

In [None]:
# Crear una nueva columna
import numpy as np
# Una columna categórica a partir de una columna numérica
df_titanic["SurvivedY/n"] = np.where(df_titanic["Survived"]==1, "Y","N")

# Una columna numerica a partir de una columna categórica

df_titanic["SexMale"] = np.where(df_titanic["Sex"]=="male",1,0)
display(df_titanic.head()) # validamos la nueva columna

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,SurvivedY/n,SexMale
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,N,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Y,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Y,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Y,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,N,1


Para filtrar sobre una columna y calcular una estadistica sobre su resultado, para cada columna del data frame.

In [None]:
#Características de los sobrevivientes
df_titanic[df_titanic["SurvivedY/n"]=="Y"].mean()

  df_titanic[df_titanic["SurvivedY/n"]=="Y"].mean()


PassengerId    444.368421
Survived         1.000000
Pclass           1.950292
Age             28.343690
SibSp            0.473684
Parch            0.464912
Fare            48.395408
SexMale          0.318713
dtype: float64

In [None]:
### Caracteristicas de los no sobrevivientes
df_titanic[df_titanic["SurvivedY/n"]!="Y"].mean()

  df_titanic[df_titanic["SurvivedY/n"]!="Y"].mean()


PassengerId    447.016393
Survived         0.000000
Pclass           2.531876
Age             30.626179
SibSp            0.553734
Parch            0.329690
Fare            22.117887
SexMale          0.852459
dtype: float64

Pausa para interpretar:
¿Qué características tienen los sobrevivientes diferentes a los no sobrevivientes?. Escribamos un breve resumen caracterizando las diferencias

---- Análisis aquí -----

## Datos faltantes

In [None]:
### Datos faltantes por edad
df_titanic_faltantes = df_titanic[df_titanic['Age'].isnull()]
# df_titanic_sinfaltantes = df_titanic[df_titanic['Age'].isnull()]

In [None]:
display(df_titanic_faltantes.describe())

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,SexMale
count,177.0,177.0,177.0,0.0,177.0,177.0,177.0,177.0
mean,435.581921,0.293785,2.59887,,0.564972,0.180791,22.158567,0.700565
std,250.552901,0.456787,0.763216,,1.626316,0.534145,31.874608,0.45931
min,6.0,0.0,1.0,,0.0,0.0,0.0,0.0
25%,230.0,0.0,3.0,,0.0,0.0,7.75,0.0
50%,452.0,0.0,3.0,,0.0,0.0,8.05,1.0
75%,634.0,1.0,3.0,,0.0,0.0,24.15,1.0
max,889.0,1.0,3.0,,8.0,2.0,227.525,1.0


## Eliminar registros

Vamos a crear una versión del dataset, eliminando las observaciones con edad faltante

In [None]:
filtro = df_titanic['Age'].isnull()

# recomendable guardar en un data frame diferente
df_titanic_conedad = df_titanic.drop(df_titanic[filtro].index)

# Presentar resultados
display(df_titanic_conedad.describe())

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,SexMale
count,714.0,714.0,714.0,714.0,714.0,714.0,714.0,714.0
mean,448.582633,0.406162,2.236695,29.699118,0.512605,0.431373,34.694514,0.634454
std,259.119524,0.49146,0.83825,14.526497,0.929783,0.853289,52.91893,0.481921
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0,0.0
25%,222.25,0.0,1.0,20.125,0.0,0.0,8.05,0.0
50%,445.0,0.0,2.0,28.0,0.0,0.0,15.7417,1.0
75%,677.75,1.0,3.0,38.0,1.0,1.0,33.375,1.0
max,891.0,1.0,3.0,80.0,5.0,6.0,512.3292,1.0


#### Crear SQL queries en pandas dataframe

In [30]:
import pandas as pd

data = {
    'nombre': ['Juan', 'María', 'Pedro'],
    'edad': [30, 25, 28]
}
df = pd.DataFrame(data)

df.head()

Unnamed: 0,nombre,edad
0,Juan,30
1,María,25
2,Pedro,28


En este ejemplo, utilizaré la biblioteca "sqlite3" que ya está incluida en Python y permite trabajar con bases de datos SQLite, una base de datos relacional de código abierto y ligera

In [31]:
import sqlite3

# Crear una conexión a la base de datos (si no existe, se creará un nuevo archivo de base de datos)
conn = sqlite3.connect("ejemplo_db.sqlite")

# Guardar el DataFrame en la base de datos en una tabla llamada "personas"
df.to_sql('personas', conn, if_exists='replace', index=False)

# Realizar una consulta SQL utilizando Pandas
query = """
    SELECT *
    FROM personas
    WHERE edad > 26
    """
# Aqui estoy leyendo mi tabla de la bd y la cnvierto a dataframe
resultado = pd.read_sql_query(query, conn)

# Mostrar los resultados
print("Lista de personas mayores de 26 años:")
print(resultado)

# Cerrar la conexión
conn.close()


Lista de personas mayores de 26 años:
  nombre  edad
0   Juan    30
1  Pedro    28


In [40]:
import pandas as pd
from sqlalchemy import create_engine

# Create the tables as DataFrames
employees_data = {
    'employee_id': [1, 2, 3, 4],
    'employee_name': ['Alice', 'Bob', 'Charlie', 'David'],
    'department_id': [101, 102, 101, 103]
}

departments_data = {
    'department_id': [101, 102, 103],
    'department_name': ['HR', 'Finance', 'Marketing']
}

salaries_data = {
    'employee_id': [1, 2, 3, 4],
    'salary': [50000, 60000, 55000, 70000]
}

employees_df = pd.DataFrame(employees_data)
departments_df = pd.DataFrame(departments_data)
salaries_df = pd.DataFrame(salaries_data)

# Create an SQLite in-memory database
engine = sqlite3.connect("ejemplo_db.sqlite")

# Store the DataFrames into the database
employees_df.to_sql('employees', engine, if_exists='replace', index=False)
departments_df.to_sql('departments', engine,if_exists='replace', index=False)
salaries_df.to_sql('salaries', engine, if_exists='replace',index=False)

# Perform a complex SQL query using Pandas to join the tables
query = """
    SELECT e.employee_id, e.employee_name, d.department_name, s.salary
    FROM employees e
    INNER JOIN departments d ON e.department_id = d.department_id
    INNER JOIN salaries s ON e.employee_id = s.employee_id
    WHERE s.salary > 55000
"""

# Execute the query using the engine object and read the results into a DataFrame

result = pd.read_sql_query(query, engine)

print(result)




   employee_id employee_name department_name  salary
0            2           Bob         Finance   60000
1            4         David       Marketing   70000
