In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Series

In [2]:
#Series
s = pd.Series(np.random.randn(5), index=['a','b','c','d','e'])
s

a   -2.046886
b    0.852974
c   -0.187211
d   -0.360604
e    2.172107
dtype: float64

In [5]:
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [17]:
pd.Series(np.random.randn(5))

0    0.387878
1   -0.926806
2   -0.051762
3    1.384435
4    0.892812
dtype: float64

In [10]:
#Series creadas por diccionario
d = {'a' : 0., 'b' : 1., 'c' : 2.}

pd.Series(d)



a    0.0
b    1.0
c    2.0
dtype: float64

In [11]:
d = {'a' : 0, 'b' : 1, 'c' : 2}

pd.Series(d)

a    0
b    1
c    2
dtype: int64

In [12]:
#Series creadas por escalares

pd.Series(5, index=['a','b','c','d','e'])

a    5
b    5
c    5
d    5
e    5
dtype: int64

In [20]:
#slicing Series
# s = pd.Series(np.random.randn(5), index=['a','b','c','d','e'])

#  s[0]

s[:3]

a    0.257553
b    0.337755
c   -1.092920
dtype: float64

In [27]:
s

a    0.257553
b    0.337755
c   -1.092920
d   -2.080203
e   -0.701316
dtype: float64

In [25]:
s.median()

-0.7013160466229131

In [26]:
s[s > s.median()]

a    0.257553
b    0.337755
dtype: float64

In [3]:
s[ s > s[2]]

b    0.852974
e    2.172107
dtype: float64

# DataFrame

In [28]:
#DataFrame is a 2-dimensional labeled data structure with columns of potentially different types. 

### From dict of Series or dicts

In [6]:


d= {'uno' : pd.Series([1,2,3], index = ['a','b','c']), 
    'dos' : pd.Series([1,2,3,4], index = ['a','b','c','d'])}

df = pd.DataFrame(d)

df
#  If no columns are passed, the columns will be the sorted list of dict keys.

Unnamed: 0,dos,uno
a,1,1.0
b,2,2.0
c,3,3.0
d,4,


In [40]:
pd.DataFrame(d, index = ['b', 'd', 'a'])

Unnamed: 0,dos,uno
b,2,2.0
d,4,
a,1,1.0


In [48]:
#columns

pd.DataFrame(d, index = ['b','d','a'], columns = [ 'dos', 'tres'])

# When a particular set of columns is passed along with a dict of data,
#  the passed columns override the keys in the dict.

Unnamed: 0,dos,tres
b,2,
d,4,
a,1,


In [45]:
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [46]:
df.columns

Index(['dos', 'uno'], dtype='object')

### From dict of ndarrays / lists

The ndarrays must all be the same length. If an index is passed, it must clearly also be the same length as the arrays. If no index is passed, the result will be range(n), where n is the array length.

In [3]:
d = {'uno': [1,2,3,4], 'dos': [4,3,2,1]}

pd.DataFrame(d)


Unnamed: 0,dos,uno
0,4,1
1,3,2
2,2,3
3,1,4


In [5]:
pd.DataFrame(d, index = ['a','b','c','d'], columns = ['uno','dos'])

Unnamed: 0,uno,dos
a,1,4
b,2,3
c,3,2
d,4,1


### Anexo: np.zeros

In [21]:
s = (6)
np.zeros(s)

array([ 0.,  0.,  0.,  0.,  0.,  0.])

In [20]:
np.zeros((5,2))

array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])

### Desde una lista de diccionarios


In [29]:
datos = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]

pd.DataFrame(datos)
#recordar que si no se le pasa columnas explícitamente, éstas serán las keys ordenadas alfabéticamente

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [32]:
pd.DataFrame(datos, index=['primero', 'segundo'])

Unnamed: 0,a,b,c
primero,1,2,
segundo,5,10,20.0


In [36]:
pd.DataFrame(datos,columns=['b','a','c'])

Unnamed: 0,b,a,c
0,2,1,
1,10,5,20.0


### creando con datetime

In [41]:
fechas = pd.date_range('20130101', periods=6)
fechas

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [52]:
df = pd.DataFrame(np.random.randn(6,4),index = fechas, columns=list('ABCD'))

df

Unnamed: 0,A,B,C,D
2013-01-01,-0.139857,-0.038673,0.313229,0.319289
2013-01-02,0.710438,0.104254,-0.941878,-0.382516
2013-01-03,-1.606338,-0.284036,0.132778,-0.858894
2013-01-04,-0.693793,-1.700663,0.175149,0.423152
2013-01-05,0.420211,-0.275943,-0.774692,0.030419
2013-01-06,-0.326979,0.501658,1.220925,-0.628359


In [48]:
pd.Series(1,index=list(range(4)))

0    1
1    1
2    1
3    1
dtype: int64

###  Mostrando datos

In [53]:
# df = pd.DataFrame(np.random.randn(6,4),index = fechas, columns=list('ABCD'))

df.head()

#por defecto head muestra sólo los 5 primeros resultados, podemos indicar el número que queramos ver: df.head(7)

Unnamed: 0,A,B,C,D
2013-01-01,-0.139857,-0.038673,0.313229,0.319289
2013-01-02,0.710438,0.104254,-0.941878,-0.382516
2013-01-03,-1.606338,-0.284036,0.132778,-0.858894
2013-01-04,-0.693793,-1.700663,0.175149,0.423152
2013-01-05,0.420211,-0.275943,-0.774692,0.030419


In [56]:
df.tail(2)

#pasamos a tail() el número de últimos resultados que queremos ver, si no indicamos nada, muestra los 5 últimos

Unnamed: 0,A,B,C,D
2013-01-05,0.420211,-0.275943,-0.774692,0.030419
2013-01-06,-0.326979,0.501658,1.220925,-0.628359


In [57]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [58]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [59]:
# Describe shows a quick statistic summary of your data

df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.27272,-0.282234,0.020919,-0.182818
std,0.827429,0.753016,0.790156,0.521583
min,-1.606338,-1.700663,-0.941878,-0.858894
25%,-0.60209,-0.282013,-0.547824,-0.566898
50%,-0.233418,-0.157308,0.153964,-0.176049
75%,0.280194,0.068522,0.278709,0.247071
max,0.710438,0.501658,1.220925,0.423152


In [60]:
# Traspuesta

df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,-0.139857,0.710438,-1.606338,-0.693793,0.420211,-0.326979
B,-0.038673,0.104254,-0.284036,-1.700663,-0.275943,0.501658
C,0.313229,-0.941878,0.132778,0.175149,-0.774692,1.220925
D,0.319289,-0.382516,-0.858894,0.423152,0.030419,-0.628359


In [66]:
# sort_index()

df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.319289,0.313229,-0.038673,-0.139857
2013-01-02,-0.382516,-0.941878,0.104254,0.710438
2013-01-03,-0.858894,0.132778,-0.284036,-1.606338
2013-01-04,0.423152,0.175149,-1.700663,-0.693793
2013-01-05,0.030419,-0.774692,-0.275943,0.420211
2013-01-06,-0.628359,1.220925,0.501658,-0.326979


In [67]:
# ordenando por valores

df.sort_values(by='C')

Unnamed: 0,A,B,C,D
2013-01-02,0.710438,0.104254,-0.941878,-0.382516
2013-01-05,0.420211,-0.275943,-0.774692,0.030419
2013-01-03,-1.606338,-0.284036,0.132778,-0.858894
2013-01-04,-0.693793,-1.700663,0.175149,0.423152
2013-01-01,-0.139857,-0.038673,0.313229,0.319289
2013-01-06,-0.326979,0.501658,1.220925,-0.628359


In [68]:
# getting

df['A']

2013-01-01   -0.139857
2013-01-02    0.710438
2013-01-03   -1.606338
2013-01-04   -0.693793
2013-01-05    0.420211
2013-01-06   -0.326979
Freq: D, Name: A, dtype: float64

In [69]:
df [0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.139857,-0.038673,0.313229,0.319289
2013-01-02,0.710438,0.104254,-0.941878,-0.382516
2013-01-03,-1.606338,-0.284036,0.132778,-0.858894


In [71]:
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,0.710438,0.104254,-0.941878,-0.382516
2013-01-03,-1.606338,-0.284036,0.132778,-0.858894
2013-01-04,-0.693793,-1.700663,0.175149,0.423152


In [72]:
# loc Purely label-location based indexer for selection by label.
df.loc[fechas[0]]

A   -0.139857
B   -0.038673
C    0.313229
D    0.319289
Name: 2013-01-01 00:00:00, dtype: float64

In [73]:
fechas

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [74]:
#Selecting on a multi-axis by label

df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,-0.139857,-0.038673
2013-01-02,0.710438,0.104254
2013-01-03,-1.606338,-0.284036
2013-01-04,-0.693793,-1.700663
2013-01-05,0.420211,-0.275943
2013-01-06,-0.326979,0.501658


In [75]:
# Showing label slicing, both endpoints are included

df.loc['20130103':'20130105',['A','B']]

Unnamed: 0,A,B
2013-01-03,-1.606338,-0.284036
2013-01-04,-0.693793,-1.700663
2013-01-05,0.420211,-0.275943


In [76]:
#Reduction in the dimensions of the returned object

df.loc['20130102',['A','B']]

A    0.710438
B    0.104254
Name: 2013-01-02 00:00:00, dtype: float64

In [77]:
# Seleccionando por posición

df.iloc[3]
# accede al index 3 (contando desde 0)

A   -0.693793
B   -1.700663
C    0.175149
D    0.423152
Name: 2013-01-04 00:00:00, dtype: float64

In [78]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-0.693793,-1.700663
2013-01-05,0.420211,-0.275943


In [79]:
df.iloc[:,1:3]


Unnamed: 0,B,C
2013-01-01,-0.038673,0.313229
2013-01-02,0.104254,-0.941878
2013-01-03,-0.284036,0.132778
2013-01-04,-1.700663,0.175149
2013-01-05,-0.275943,-0.774692
2013-01-06,0.501658,1.220925


In [80]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,0.710438,-0.941878
2013-01-03,-1.606338,0.132778
2013-01-05,0.420211,-0.774692


In [81]:
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2013-01-02,0.710438,0.104254,-0.941878,-0.382516
2013-01-03,-1.606338,-0.284036,0.132778,-0.858894


In [82]:
#boolean indexing

df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-02,0.710438,0.104254,-0.941878,-0.382516
2013-01-05,0.420211,-0.275943,-0.774692,0.030419
