In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Series

In [2]:
# Build a labelled Series from five draws of the standard normal distribution.
# No seed is set in this cell, so the values differ from run to run.
s = pd.Series(
    data=np.random.randn(5),
    index=list('abcde'),
)
s

a    0.242592
b   -0.117140
c   -0.159138
d    1.123509
e   -1.158131
dtype: float64

In [3]:
# Index labels of s.
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [4]:
# No index is passed; the output below shows default integer labels 0 to 4.
pd.Series(np.random.randn(5))

0   -1.494889
1   -0.832292
2   -0.742797
3    0.249853
4    0.732546
dtype: float64

In [5]:
# Series built from a dictionary: the output below uses the keys as labels.
d = dict(a=0.0, b=1.0, c=2.0)

pd.Series(d)



a    0.0
b    1.0
c    2.0
dtype: float64

In [6]:
# Same keys with integer values; the resulting Series is int64 (see output).
d = dict(zip('abc', range(3)))

pd.Series(d)

a    0
b    1
c    2
dtype: int64

In [7]:
# Series created from a scalar: the value is repeated for every index label.

pd.Series(5, index=['a','b','c','d','e'])

a    5
b    5
c    5
d    5
e    5
dtype: int64

In [8]:
# Slicing a Series
# Reminder of how s was built:
# s = pd.Series(np.random.randn(5), index=['a','b','c','d','e'])

# A single element could be read with s[0]; here the first three are taken.

s[:3]

a    0.242592
b   -0.117140
c   -0.159138
dtype: float64

In [9]:
# Display the full Series s.
s

a    0.242592
b   -0.117140
c   -0.159138
d    1.123509
e   -1.158131
dtype: float64

In [10]:
# Median of the values in s.
s.median()

-0.11714014976440427

In [11]:
# Boolean indexing: keep the elements of s greater than its median.
s[s > s.median()]

a    0.242592
d    1.123509
dtype: float64

In [12]:
# Keep the elements greater than the third element of s.
# The position is read with .iloc: on a label-indexed Series, s[2] relies on
# the deprecated fallback that treats an integer key as a position.
s[s > s.iloc[2]]

a    0.242592
b   -0.117140
d    1.123509
dtype: float64

# DataFrame

In [13]:
# A DataFrame is a 2-dimensional labeled data structure with columns of potentially different types.

### From dict of Series or dicts

In [14]:


# Two Series of different lengths: the DataFrame index is the union of their
# labels, and 'uno' has no value for label 'd' (empty in the output below).
d = dict(
    uno=pd.Series([1, 2, 3], index=list('abc')),
    dos=pd.Series([1, 2, 3, 4], index=list('abcd')),
)

df = pd.DataFrame(data=d)

df
# No columns are passed; the output below lists the dict keys in sorted order.
# NOTE(review): recent pandas releases are documented to keep the dict's
# insertion order instead — confirm against the installed version.

Unnamed: 0,dos,uno
a,1,1.0
b,2,2.0
c,3,3.0
d,4,


In [15]:
# Build the frame from d again, keeping only rows b, d and a, in that order.
pd.DataFrame(d, index = ['b', 'd', 'a'])

Unnamed: 0,dos,uno
b,2,2.0
d,4,
a,1,1.0


In [16]:
# Selecting columns explicitly

pd.DataFrame(d, index = ['b','d','a'], columns = [ 'dos', 'tres'])

# When a particular set of columns is passed along with a dict of data,
# the passed columns override the keys in the dict: 'uno' is dropped and
# 'tres', which is not a key of d, comes out empty in the output below.

Unnamed: 0,dos,tres
b,2,
d,4,
a,1,


In [17]:
# Row labels of df.
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [18]:
# Column labels of df.
df.columns

Index(['dos', 'uno'], dtype='object')

### From dict of ndarrays / lists

The ndarrays must all be the same length. If an index is passed, it must clearly also be the same length as the arrays. If no index is passed, the result will be range(n), where n is the array length.

In [19]:
# Dict of equal-length lists: each key becomes a column.
d = {
    'uno': list(range(1, 5)),
    'dos': list(range(4, 0, -1)),
}

pd.DataFrame(d)


Unnamed: 0,dos,uno
0,4,1
1,3,2
2,2,3
3,1,4


In [20]:
# Same data with explicit row labels and an explicit column order.
pd.DataFrame(d, index = ['a','b','c','d'], columns = ['uno','dos'])

Unnamed: 0,uno,dos
a,1,4
b,2,3
c,3,2
d,4,1


### Anexo: np.zeros

In [21]:
# Shape given as a 1-tuple (note the trailing comma; `(6)` would be the plain
# int 6). A dedicated name is used so that the Series `s` from the earlier
# cells is not overwritten.
forma = (6,)
ceros = np.zeros(forma)
ceros

array([ 0.,  0.,  0.,  0.,  0.,  0.])

In [22]:
# A two-element shape gives a 2-D array: 5 rows by 2 columns of zeros.
np.zeros((5,2))

array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])

### Desde una lista de diccionarios


In [23]:
# Each dict is one row; a key missing from a row ('c' in the first one)
# comes out empty in the output below.
datos = [
    dict(a=1, b=2),
    dict(a=5, b=10, c=20),
]

pd.DataFrame(datos)
# Reminder: when columns are not passed explicitly, the output below lists the keys alphabetically.
# NOTE(review): recent pandas releases may keep the keys' insertion order instead — confirm.

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [24]:
# Same rows with custom labels for the index.
pd.DataFrame(datos, index=['primero', 'segundo'])

Unnamed: 0,a,b,c
primero,1,2,
segundo,5,10,20.0


In [25]:
# Explicit column order: b, a, c.
pd.DataFrame(datos,columns=['b','a','c'])

Unnamed: 0,b,a,c
0,2,1,
1,10,5,20.0


### Creando con datetime

In [26]:
# Six consecutive days starting on 2013-01-01.
fechas = pd.date_range(start='2013-01-01', periods=6, freq='D')
fechas

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [27]:
# 6x4 table of standard-normal draws (unseeded), indexed by the dates in
# fechas and with columns named A to D.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=fechas,
    columns=['A', 'B', 'C', 'D'],
)

df

Unnamed: 0,A,B,C,D
2013-01-01,0.155808,0.72154,0.913868,1.381206
2013-01-02,1.054537,-0.849267,-0.392365,-0.861322
2013-01-03,-0.241447,-0.671533,-0.564314,-1.784999
2013-01-04,-0.957377,1.264513,-0.859521,-1.057047
2013-01-05,1.18115,-0.563432,1.097595,0.542655
2013-01-06,-1.37159,-2.381579,-0.505298,1.314011


In [28]:
# Scalar 1 repeated over the integer labels 0 to 3.
pd.Series(1,index=list(range(4)))

0    1
1    1
2    1
3    1
dtype: int64

### Mostrando datos

In [29]:
# Reminder of how df was built:
# df = pd.DataFrame(np.random.randn(6,4),index = fechas, columns=list('ABCD'))

df.head()

# By default head() shows only the first 5 rows; pass a number to see a different amount, e.g. df.head(7).

Unnamed: 0,A,B,C,D
2013-01-01,0.155808,0.72154,0.913868,1.381206
2013-01-02,1.054537,-0.849267,-0.392365,-0.861322
2013-01-03,-0.241447,-0.671533,-0.564314,-1.784999
2013-01-04,-0.957377,1.264513,-0.859521,-1.057047
2013-01-05,1.18115,-0.563432,1.097595,0.542655


In [30]:
df.tail(2)

# tail() takes the number of trailing rows to show; with no argument it shows the last 5.

Unnamed: 0,A,B,C,D
2013-01-05,1.18115,-0.563432,1.097595,0.542655
2013-01-06,-1.37159,-2.381579,-0.505298,1.314011


In [31]:
# Row labels of df (a DatetimeIndex, see output).
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [32]:
# Column labels of df.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [33]:
# describe() shows a quick statistical summary of the data, one column per
# column of df (count, mean, std, min, quartiles and max in the output below).

df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.02982,-0.413293,-0.051672,-0.077583
std,1.037519,1.285016,0.835506,1.337023
min,-1.37159,-2.381579,-0.859521,-1.784999
25%,-0.778394,-0.804833,-0.54956,-1.008116
50%,-0.04282,-0.617482,-0.448832,-0.159334
75%,0.829854,0.400297,0.58731,1.121172
max,1.18115,1.264513,1.097595,1.381206


In [34]:
# Transpose: rows and columns are swapped.

df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,0.155808,1.054537,-0.241447,-0.957377,1.18115,-1.37159
B,0.72154,-0.849267,-0.671533,1.264513,-0.563432,-2.381579
C,0.913868,-0.392365,-0.564314,-0.859521,1.097595,-0.505298
D,1.381206,-0.861322,-1.784999,-1.057047,0.542655,1.314011


In [35]:
# sort_index(): with axis=1 the column labels are sorted, here in descending order (D, C, B, A).

df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,1.381206,0.913868,0.72154,0.155808
2013-01-02,-0.861322,-0.392365,-0.849267,1.054537
2013-01-03,-1.784999,-0.564314,-0.671533,-0.241447
2013-01-04,-1.057047,-0.859521,1.264513,-0.957377
2013-01-05,0.542655,1.097595,-0.563432,1.18115
2013-01-06,1.314011,-0.505298,-2.381579,-1.37159


In [56]:
df.sort_index(ascending=False) # axis=1 means columns; if axis is omitted or set to 0, the rows are sorted

Unnamed: 0,A,B,C,D
2013-01-06,-1.37159,-2.381579,-0.505298,1.314011
2013-01-05,1.18115,-0.563432,1.097595,0.542655
2013-01-04,-0.957377,1.264513,-0.859521,-1.057047
2013-01-03,-0.241447,-0.671533,-0.564314,-1.784999
2013-01-02,1.054537,-0.849267,-0.392365,-0.861322
2013-01-01,0.155808,0.72154,0.913868,1.381206


In [59]:
# Sorting the rows by the values of column C

df.sort_values(by='C')

Unnamed: 0,A,B,C,D
2013-01-04,-0.957377,1.264513,-0.859521,-1.057047
2013-01-03,-0.241447,-0.671533,-0.564314,-1.784999
2013-01-06,-1.37159,-2.381579,-0.505298,1.314011
2013-01-02,1.054537,-0.849267,-0.392365,-0.861322
2013-01-01,0.155808,0.72154,0.913868,1.381206
2013-01-05,1.18115,-0.563432,1.097595,0.542655


In [37]:
# Getting a single column; the result is a Series named after the column (see output).

df['A']

2013-01-01    0.155808
2013-01-02    1.054537
2013-01-03   -0.241447
2013-01-04   -0.957377
2013-01-05    1.181150
2013-01-06   -1.371590
Freq: D, Name: A, dtype: float64

In [38]:
# Slicing with [] selects rows: here the first two.
df [0:2]

Unnamed: 0,A,B,C,D
2013-01-01,0.155808,0.72154,0.913868,1.381206
2013-01-02,1.054537,-0.849267,-0.392365,-0.861322


In [39]:
# Slicing with date strings selects rows by label; the end date is included in the output.
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,1.054537,-0.849267,-0.392365,-0.861322
2013-01-03,-0.241447,-0.671533,-0.564314,-1.784999
2013-01-04,-0.957377,1.264513,-0.859521,-1.057047


In [40]:
# loc: purely label-location based indexer for selection by label.
# Reminder: fechas = pd.date_range('20130101', periods=6)
# The row for the first date comes back as a Series indexed by column name.

df.loc[fechas[0]]

A    0.155808
B    0.721540
C    0.913868
D    1.381206
Name: 2013-01-01 00:00:00, dtype: float64

In [41]:
# Display the dates held in fechas.
fechas

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [42]:
# Selecting on multiple axes by label: all rows, columns A and B.

df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,0.155808,0.72154
2013-01-02,1.054537,-0.849267
2013-01-03,-0.241447,-0.671533
2013-01-04,-0.957377,1.264513
2013-01-05,1.18115,-0.563432
2013-01-06,-1.37159,-2.381579


In [65]:
# Rows from 2013-01-02 to 2013-01-04 (both included), columns A and B.
df.loc['20130102':'20130104',['A','B']]

Unnamed: 0,A,B
2013-01-02,1.054537,-0.849267
2013-01-03,-0.241447,-0.671533
2013-01-04,-0.957377,1.264513


In [66]:
# Same row range, all columns.
df.loc['20130102':'20130104',:]

Unnamed: 0,A,B,C,D
2013-01-02,1.054537,-0.849267,-0.392365,-0.861322
2013-01-03,-0.241447,-0.671533,-0.564314,-1.784999
2013-01-04,-0.957377,1.264513,-0.859521,-1.057047


In [43]:
# Label slicing with loc: both endpoints are included in the result.

df.loc['20130103':'20130105',['A','B']]

Unnamed: 0,A,B
2013-01-03,-0.241447,-0.671533
2013-01-04,-0.957377,1.264513
2013-01-05,1.18115,-0.563432


In [44]:
# Selecting a single row label reduces the dimensions of the result: a Series comes back.

df.loc['20130102',['A','B']]

A    1.054537
B   -0.849267
Name: 2013-01-02 00:00:00, dtype: float64

In [45]:
# Selecting by position

df.iloc[3]
# Returns the row at position 3 (counting from 0).

A   -0.957377
B    1.264513
C   -0.859521
D   -1.057047
Name: 2013-01-04 00:00:00, dtype: float64

In [46]:
# Rows at positions 3 and 4, columns at positions 0 and 1 (slice ends are excluded).
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-0.957377,1.264513
2013-01-05,1.18115,-0.563432


In [47]:
# All rows, columns at positions 1 and 2.
df.iloc[:,1:3]


Unnamed: 0,B,C
2013-01-01,0.72154,0.913868
2013-01-02,-0.849267,-0.392365
2013-01-03,-0.671533,-0.564314
2013-01-04,1.264513,-0.859521
2013-01-05,-0.563432,1.097595
2013-01-06,-2.381579,-0.505298


In [48]:
# Lists of positions: rows 1, 2 and 4, columns 0 and 2.
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,1.054537,-0.392365
2013-01-03,-0.241447,-0.564314
2013-01-05,1.18115,1.097595


In [49]:
# Rows at positions 1 and 2, all columns.
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2013-01-02,1.054537,-0.849267,-0.392365,-0.861322
2013-01-03,-0.241447,-0.671533,-0.564314,-1.784999


In [50]:
# Boolean indexing: keep the rows where column A is positive.

df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-01,0.155808,0.72154,0.913868,1.381206
2013-01-02,1.054537,-0.849267,-0.392365,-0.861322
2013-01-05,1.18115,-0.563432,1.097595,0.542655


In [67]:
# Selecting the values of a DataFrame where a boolean condition holds;
# the values that do not meet it appear empty in the output below.

df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,0.155808,0.72154,0.913868,1.381206
2013-01-02,1.054537,,,
2013-01-03,,,,
2013-01-04,,1.264513,,
2013-01-05,1.18115,,1.097595,0.542655
2013-01-06,,,,1.314011


In [86]:
# Spanish number words used as example labels.
milista = ['uno', 'dos', 'tres', 'cuatro', 'cinco', 'seis']

In [94]:
# Using the isin() method for filtering.
# Work on a copy so that df itself is left unchanged.
df2 = df.copy()

# Reuse the labels already built in milista instead of repeating the same
# six strings here.
df2['E'] = milista

df2



Unnamed: 0,A,B,C,D,E
2013-01-01,0.155808,0.72154,0.913868,1.381206,uno
2013-01-02,1.054537,-0.849267,-0.392365,-0.861322,dos
2013-01-03,-0.241447,-0.671533,-0.564314,-1.784999,tres
2013-01-04,-0.957377,1.264513,-0.859521,-1.057047,cuatro
2013-01-05,1.18115,-0.563432,1.097595,0.542655,cinco
2013-01-06,-1.37159,-2.381579,-0.505298,1.314011,seis


In [97]:
# Keep the rows whose E value is one of the listed strings.
df2[df2['E'].isin(['dos', 'tres'])]

Unnamed: 0,A,B,C,D,E
2013-01-02,1.054537,-0.849267,-0.392365,-0.861322,dos
2013-01-03,-0.241447,-0.671533,-0.564314,-1.784999,tres


In [99]:
# Fast access to a scalar

df.at[fechas[0], 'A']
# This is equivalent to:

#df.loc[fechas[0], 'A']

0.15580809615706578

In [101]:
# Setting/changing a value by label (this modifies df in place):

df.at[fechas[0], 'A'] = 0

In [102]:
# Display df to check the updated value.
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.72154,0.913868,1.381206
2013-01-02,1.054537,-0.849267,-0.392365,-0.861322
2013-01-03,-0.241447,-0.671533,-0.564314,-1.784999
2013-01-04,-0.957377,1.264513,-0.859521,-1.057047
2013-01-05,1.18115,-0.563432,1.097595,0.542655
2013-01-06,-1.37159,-2.381579,-0.505298,1.314011


In [105]:
# Setting a value by position (row 0, column 1); this modifies df in place:

df.iat[0,1] = 0
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,0.913868,1.381206
2013-01-02,1.054537,-0.849267,-0.392365,-0.861322
2013-01-03,-0.241447,-0.671533,-0.564314,-1.784999
2013-01-04,-0.957377,1.264513,-0.859521,-1.057047
2013-01-05,1.18115,-0.563432,1.097595,0.542655
2013-01-06,-1.37159,-2.381579,-0.505298,1.314011


In [108]:
# Setting a whole column by assigning a NumPy array (one 5 per row of df).
# NOTE(review): the output below shows D as integers; newer pandas releases
# may keep the column's existing float dtype for this kind of .loc
# assignment — confirm against the installed version.
df.loc[:, 'D'] = np.full(len(df), 5)

df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,0.913868,5
2013-01-02,1.054537,-0.849267,-0.392365,5
2013-01-03,-0.241447,-0.671533,-0.564314,5
2013-01-04,-0.957377,1.264513,-0.859521,5
2013-01-05,1.18115,-0.563432,1.097595,5
2013-01-06,-1.37159,-2.381579,-0.505298,5


In [109]:
# Setting a new column whose data is aligned automatically on the index:
# this Series is labelled with six consecutive days starting on 2013-01-02.
s1 = pd.Series(
    range(1, 7),
    index=pd.date_range(start='2013-01-02', periods=6),
)

s1


2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [110]:
# Add s1 as column F. Values are matched on the index, so a date of df that is
# missing from s1 is left empty (2013-01-01 in the output of the next cell).
df['F'] = s1


In [111]:
# Display df with the new column F.
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.913868,5,
2013-01-02,1.054537,-0.849267,-0.392365,5,1.0
2013-01-03,-0.241447,-0.671533,-0.564314,5,2.0
2013-01-04,-0.957377,1.264513,-0.859521,5,3.0
2013-01-05,1.18115,-0.563432,1.097595,5,4.0
2013-01-06,-1.37159,-2.381579,-0.505298,5,5.0


In [112]:
# Something similar to a WHERE clause: every positive value of the copy is
# replaced by its negative, while df itself is left unchanged.

df2 = df.copy()

df2[df2 > 0] = -df2

df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.913868,-5,
2013-01-02,-1.054537,-0.849267,-0.392365,-5,-1.0
2013-01-03,-0.241447,-0.671533,-0.564314,-5,-2.0
2013-01-04,-0.957377,-1.264513,-0.859521,-5,-3.0
2013-01-05,-1.18115,-0.563432,-1.097595,-5,-4.0
2013-01-06,-1.37159,-2.381579,-0.505298,-5,-5.0
