In [2]:
import numpy as np
import pandas as pd

In [3]:
# Report the installed pandas version so readers know which release produced the outputs.
pd.__version__

'1.4.1'

In [4]:
# Source dictionary for a DataFrame: every key maps to a list of three values.
data = {
    "CH": [100, 800, 200],
    "CO": [100, 200, 300],
    "MX": [300, 500, 400],
}
data

{'CH': [100, 800, 200], 'CO': [100, 200, 300], 'MX': [300, 500, 400]}

In [6]:
# Build a DataFrame from the dictionary: each key becomes a column and each
# list supplies that column's values; rows get the default integer index.
df = pd.DataFrame(data)
df

Unnamed: 0,CH,CO,MX
0,100,100,300
1,800,200,500
2,200,300,400


Con datos más complejos

In [7]:
# Student records with mixed column types: integer, string, and a numeric
# column (Q1) that contains a missing value (np.nan).
# All lists have the same length: one entry per student.
dict_data = {
    "edad": [10, 9, 13, 14, 12, 11, 12],
    "cm": [115, 110, 130, 155, 125, 120, 125],
    "pais": ["co", "mx", "co", "mx", "mx", "ch", "ch"],
    "genero": ["M", "F", "F", "M", "M", "M", "F"],
    "Q1": [5, 10, 8, np.nan, 7, 8, 3],  # fourth entry is missing
    "Q2": [7, 9, 9, 8, 8, 8, 9],
}

In [8]:
# Build a DataFrame from the richer dictionary; with no index given, the rows
# get the default integer labels.
# NOTE: this rebinds `df`, replacing the DataFrame previously built from `data`.
df = pd.DataFrame(dict_data)
df

Unnamed: 0,edad,cm,pais,genero,Q1,Q2
0,10,115,co,M,5.0,7
1,9,110,mx,F,10.0,9
2,13,130,co,F,8.0,9
3,14,155,mx,M,,8
4,12,125,mx,M,7.0,8
5,11,120,ch,M,8.0,8
6,12,125,ch,F,3.0,9


In [9]:
# Label the rows with the students' names instead of the default integer
# index: the i-th name becomes the label of the i-th record.
student_names = [
    "Ana",
    "Benito",
    "Camilo",
    "Daniel",
    "Erika",
    "Fabian",
    "Gabriela",
]
df_idx = pd.DataFrame(dict_data, index=student_names)
df_idx

Unnamed: 0,edad,cm,pais,genero,Q1,Q2
Ana,10,115,co,M,5.0,7
Benito,9,110,mx,F,10.0,9
Camilo,13,130,co,F,8.0,9
Daniel,14,155,mx,M,,8
Erika,12,125,mx,M,7.0,8
Fabian,11,120,ch,M,8.0,8
Gabriela,12,125,ch,F,3.0,9


In [10]:
# Row labels of the DataFrame (the student names passed as `index`).
df_idx.index

Index(['Ana', 'Benito', 'Camilo', 'Daniel', 'Erika', 'Fabian', 'Gabriela'], dtype='object')

In [11]:
# Column labels, in the same order as the keys of the source dictionary.
df_idx.columns

Index(['edad', 'cm', 'pais', 'genero', 'Q1', 'Q2'], dtype='object')

In [12]:
# Underlying data as a NumPy array. to_numpy() is the accessor the pandas
# documentation recommends over the .values attribute. Because the columns mix
# numbers and strings, the resulting array has dtype=object.
df_idx.to_numpy()

array([[10, 115, 'co', 'M', 5.0, 7],
       [9, 110, 'mx', 'F', 10.0, 9],
       [13, 130, 'co', 'F', 8.0, 9],
       [14, 155, 'mx', 'M', nan, 8],
       [12, 125, 'mx', 'M', 7.0, 8],
       [11, 120, 'ch', 'M', 8.0, 8],
       [12, 125, 'ch', 'F', 3.0, 9]], dtype=object)

In [15]:
# Selecting a single column by label returns a Series indexed by the row labels.
df_idx["edad"]

Ana         10
Benito       9
Camilo      13
Daniel      14
Erika       12
Fabian      11
Gabriela    12
Name: edad, dtype: int64

In [16]:
# Selecting several columns at once: pass a list of labels to get a DataFrame
# restricted to those columns. `columns` is reused by the .loc examples.
columns = ["edad", "pais", "Q1"]
subset_ = df_idx[columns]
subset_

Unnamed: 0,edad,pais,Q1
Ana,10,co,5.0
Benito,9,mx,10.0
Camilo,13,co,8.0
Daniel,14,mx,
Erika,12,mx,7.0
Fabian,11,ch,8.0
Gabriela,12,ch,3.0


**Columnas específicas de un cierto registro**

In [19]:
# One row label plus a list of column labels -> returns a Series.
# The Series has dtype object because the selected values mix numbers and text.
example = df_idx.loc["Daniel", columns]
print(type(example))
example

<class 'pandas.core.series.Series'>


edad     14
pais     mx
Q1      NaN
Name: Daniel, dtype: object

In [22]:
# A list of row labels plus a list of column labels -> returns a DataFrame.
# NOTE: this rebinds `example`, which previously held a Series.
example = df_idx.loc[["Ana", "Daniel"], columns]
print(type(example))
example

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,edad,pais,Q1
Ana,10,co,5.0
Daniel,14,mx,


In [23]:
# A single row label and a single column label -> one scalar value
# (NaN here, because Daniel's Q1 entry is missing).
df_idx.loc["Daniel", "Q1"]

nan

**Usando iloc**

In [24]:
# Display the DataFrame again as a reference for the positional examples.
df_idx

Unnamed: 0,edad,cm,pais,genero,Q1,Q2
Ana,10,115,co,M,5.0,7
Benito,9,110,mx,F,10.0,9
Camilo,13,130,co,F,8.0,9
Daniel,14,155,mx,M,,8
Erika,12,125,mx,M,7.0,8
Fabian,11,120,ch,M,8.0,8
Gabriela,12,125,ch,F,3.0,9


In [25]:
# iloc selects by integer position (0-based): row 2 (Camilo), column 1 (cm).
df_idx.iloc[2, 1]

130

In [26]:
# Row at position 2, columns at positions 0, 1 and 2 -> returns a Series.
df_idx.iloc[2, [0, 1, 2]]

edad     13
cm      130
pais     co
Name: Camilo, dtype: object

In [29]:
# Rows at positions 2 and 3, columns at positions 0, 1 and 2 -> returns a DataFrame.
df_idx.iloc[[2, 3], [0, 1, 2]]

Unnamed: 0,edad,cm,pais
Camilo,13,130,co
Daniel,14,155,mx


In [30]:
# Select every row (":") for the columns at positions 0, 1 and 2.
df_idx.iloc[:, [0, 1, 2]]

Unnamed: 0,edad,cm,pais
Ana,10,115,co
Benito,9,110,mx
Camilo,13,130,co
Daniel,14,155,mx
Erika,12,125,mx
Fabian,11,120,ch
Gabriela,12,125,ch


In [31]:
# A single integer selects the whole row at that position, returned as a Series
# whose index is the column labels.
df_idx.iloc[2]

edad       13
cm        130
pais       co
genero      F
Q1        8.0
Q2          9
Name: Camilo, dtype: object

**Condiciones**

In [32]:
# Comparing a column with a scalar yields a boolean Series: one True/False per row.
df_idx["edad"] >= 12

Ana         False
Benito      False
Camilo       True
Daniel       True
Erika        True
Fabian      False
Gabriela     True
Name: edad, dtype: bool

In [34]:
# Subset of the data filtered by age: indexing with the boolean Series keeps
# only the rows where the condition is True.
df_idx[df_idx["edad"] >= 12]

Unnamed: 0,edad,cm,pais,genero,Q1,Q2
Camilo,13,130,co,F,8.0,9
Daniel,14,155,mx,M,,8
Erika,12,125,mx,M,7.0,8
Gabriela,12,125,ch,F,3.0,9


In [35]:
# Several conditions: combine the boolean Series with & (element-wise AND).
# Each comparison needs its own parentheses because & binds more tightly
# than >= and == in Python.
df_idx[(df_idx["edad"] >= 12) & (df_idx["pais"] == "mx")]

Unnamed: 0,edad,cm,pais,genero,Q1,Q2
Daniel,14,155,mx,M,,8
Erika,12,125,mx,M,7.0,8


In [40]:
# Using the query method: the condition is written as a string that refers to
# column names directly.
# NOTE: this uses a strict "> 12", so the 12-year-olds kept by the ">= 12"
# filters in the neighbouring cells are excluded here.
df_idx.query("edad > 12")

Unnamed: 0,edad,cm,pais,genero,Q1,Q2
Camilo,13,130,co,F,8.0,9
Daniel,14,155,mx,M,,8
