# Pandas

In [1]:
import numpy as np
import pandas as pd
from numpy.random import randn
# Seed NumPy's global random generator so the randn-based frames in this notebook are reproducible.
np.random.seed(20200409)

## Series

In [3]:
# The same three values in four shapes: index labels, a plain list,
# a NumPy array, and a label -> value mapping.
labels = list('abc')
mylist = [10, 20, 30]
myarray = np.array(mylist)
mydict = dict(zip(labels, mylist))

In [6]:
# Series from a Python list, labelled with the custom index.
pd.Series(mylist, index=labels)

a    10
b    20
c    30
dtype: int64

In [7]:
# Series from a NumPy array, labelled with the custom index.
pd.Series(myarray, index=labels)

a    10
b    20
c    30
dtype: int64

In [8]:
# Series from a dict: the keys become the index labels.
pd.Series(mydict)

a    10
b    20
c    30
dtype: int64

In [12]:
# Integer Series indexed by country name.
myserie = pd.Series(
    data=[1, 2, 3, 4],
    index=['Chile', 'Bolivia', 'Argentina', 'Brasil'],
)

In [13]:
# Label-based lookup of a single element.
myserie.loc['Chile']

1

## DataFrame

In [17]:
# 5x4 frame of standard-normal draws: rows A-E, columns W-Z.
df = pd.DataFrame(
    randn(5, 4),
    index=list('ABCDE'),
    columns=list('WXYZ'),
)

In [19]:
# Display the full 5x4 frame of random draws.
df

Unnamed: 0,W,X,Y,Z
A,-2.35417,-0.223082,1.12346,-2.695251
B,-0.662797,-1.32884,-0.811654,1.636465
C,-1.042307,2.282787,0.468138,0.704339
D,-0.033946,0.993245,-2.563723,1.029603
E,-0.686164,0.014241,-0.719187,-1.035712


In [20]:
# Select a single column by label; the result is a Series.
df.loc[:, 'W']

A   -2.354170
B   -0.662797
C   -1.042307
D   -0.033946
E   -0.686164
Name: W, dtype: float64

In [21]:
# Select several columns with a list of labels; the result is a DataFrame.
df.loc[:, ['W', 'Y']]

Unnamed: 0,W,Y
A,-2.35417,1.12346
B,-0.662797,-0.811654
C,-1.042307,0.468138
D,-0.033946,-2.563723
E,-0.686164,-0.719187


In [24]:
# Same column selection written with bracket syntax instead of attribute access.
df['X']

A   -0.223082
B   -1.328840
C    2.282787
D    0.993245
E    0.014241
Name: X, dtype: float64

In [31]:
# Create a derived column as the element-wise sum of X and Y, then show the frame.
df['NewCol'] = df['X'].add(df['Y'])
df

Unnamed: 0,W,X,Y,Z,NewCol
A,-2.35417,-0.223082,1.12346,-2.695251,0.900378
B,-0.662797,-1.32884,-0.811654,1.636465,-2.140494
C,-1.042307,2.282787,0.468138,0.704339,2.750925
D,-0.033946,0.993245,-2.563723,1.029603,-1.570478
E,-0.686164,0.014241,-0.719187,-1.035712,-0.704946


In [32]:
# Remove the derived column again by rebinding df to the frame returned by drop.
df = df.drop(columns='NewCol')
df

Unnamed: 0,W,X,Y,Z
A,-2.35417,-0.223082,1.12346,-2.695251
B,-0.662797,-1.32884,-0.811654,1.636465
C,-1.042307,2.282787,0.468138,0.704339
D,-0.033946,0.993245,-2.563723,1.029603
E,-0.686164,0.014241,-0.719187,-1.035712


In [33]:
# Remove row 'C' by rebinding df to the frame returned by drop.
df = df.drop(index='C')
df

Unnamed: 0,W,X,Y,Z
A,-2.35417,-0.223082,1.12346,-2.695251
B,-0.662797,-1.32884,-0.811654,1.636465
D,-0.033946,0.993245,-2.563723,1.029603
E,-0.686164,0.014241,-0.719187,-1.035712


In [36]:
# Label-based row selection: row 'A' across all columns.
df.loc['A', :]

W   -2.354170
X   -0.223082
Y    1.123460
Z   -2.695251
Name: A, dtype: float64

In [38]:
# Position-based row selection: first row across all columns.
df.iloc[0, :]

W   -2.354170
X   -0.223082
Y    1.123460
Z   -2.695251
Name: A, dtype: float64

In [39]:
# Single cell at row 'A', column 'W', using the scalar accessor.
df.at['A', 'W']

-2.354170083220232

In [40]:
# Label-based sub-frame: rows A and B restricted to columns W and Y.
df.loc[['A','B'],['W','Y']]

Unnamed: 0,W,Y
A,-2.35417,1.12346
B,-0.662797,-0.811654


In [41]:
# Element-wise comparison; yields a boolean frame with the same shape as df.
df.gt(0)

Unnamed: 0,W,X,Y,Z
A,False,False,True,False
B,False,False,False,True
D,False,True,False,True
E,False,True,False,False


In [42]:
# Keep the positive entries; every other position becomes NaN.
df.where(df > 0)

Unnamed: 0,W,X,Y,Z
A,,,1.12346,
B,,,,1.636465
D,,0.993245,,1.029603
E,,0.014241,,


In [46]:
# Move the row labels into a regular 'index' column and number the rows from 0.
df = df.reset_index()

In [47]:
# Show the frame after reset_index: the former row labels now sit in the 'index' column.
df

Unnamed: 0,index,W,X,Y,Z
0,A,-2.35417,-0.223082,1.12346,-2.695251
1,B,-0.662797,-1.32884,-0.811654,1.636465
2,D,-0.033946,0.993245,-2.563723,1.029603
3,E,-0.686164,0.014241,-0.719187,-1.035712


In [49]:
# Promote the 'index' column back to being the row index.
df = df.set_index('index')

In [51]:
# Two-level index: outer group label (G1/G2) paired with an inner number (1-3).
outside = ['G1'] * 3 + ['G2'] * 3
inside = [1, 2, 3] * 2
hier_index = pd.MultiIndex.from_arrays([outside, inside])

In [53]:
# 6x2 frame of standard-normal draws on the hierarchical index, then display it.
df = pd.DataFrame(
    randn(6, 2),
    index=hier_index,
    columns=list('AB'),
)
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,-0.224418,-1.19532
G1,2,0.01092,-2.498004
G1,3,-0.873533,-0.06211
G2,1,1.633851,1.875256
G2,2,-0.045723,0.678543
G2,3,-0.621329,-0.882155


In [54]:
# Select every row of outer group 'G1'; the result is indexed by the inner level only.
df.loc['G1']

Unnamed: 0,A,B
1,-0.224418,-1.19532
2,0.01092,-2.498004
3,-0.873533,-0.06211


In [55]:
# Name the two index levels so they can be referred to by name (e.g. level='Num').
df.index.names = ['Group','Num']

In [56]:
# Display the frame again; the index levels are now labelled Group and Num.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,-0.224418,-1.19532
G1,2,0.01092,-2.498004
G1,3,-0.873533,-0.06211
G2,1,1.633851,1.875256
G2,2,-0.045723,0.678543
G2,3,-0.621329,-0.882155


In [57]:
# Cross-section for the single row (Group 'G1', Num 1).
# The key is a tuple: list keys to xs are deprecated and rejected by newer pandas releases.
df.xs(('G1', 1))

A   -0.224418
B   -1.195320
Name: (G1, 1), dtype: float64

In [58]:
# Cross-section on the inner level: the rows with Num == 1 from every group.
df.xs(key=1, level='Num')

Unnamed: 0_level_0,A,B
Group,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,-0.224418,-1.19532
G2,1.633851,1.875256


## Datos perdidos

In [59]:
# Small frame with missing values: A has one NaN, B has two, C is complete.
df = pd.DataFrame(
    dict(
        A=[1, 2, np.nan],
        B=[5, np.nan, np.nan],
        C=[1, 2, 3],
    )
)

In [60]:
# Display the frame; the missing entries in columns A and B show up as empty/NaN.
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [61]:
# Drop every row that contains at least one missing value (the defaults, spelled out).
df.dropna(axis=0, how='any')

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [62]:
# Drop every column that contains at least one missing value.
df.dropna(axis='columns')

Unnamed: 0,C
0,1
1,2
2,3


In [63]:
# Keep only the rows that have at least two non-missing values.
df.dropna(axis=0, thresh=2)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


In [64]:
# Replace every missing entry with a constant placeholder.
df.fillna('MY VALUE')

Unnamed: 0,A,B,C
0,1,5,1
1,2,MY VALUE,2
2,MY VALUE,MY VALUE,3


In [65]:
# Impute column A's missing entries with the mean of its observed values.
df['A'].fillna(df['A'].mean())

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64

## GroupBy

In [67]:
# Toy sales table: two people per company, one sales figure each.
data = dict(
    Company=['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'],
    Person=['John', 'George', 'Robert', 'Sarah', 'Maisse', 'Michael'],
    Sales=[200, 120, 340, 124, 243, 350],
)
df = pd.DataFrame(data)

In [68]:
# Display the sales table built from the dict above.
df

Unnamed: 0,Company,Person,Sales
0,GOOG,John,200
1,GOOG,George,120
2,MSFT,Robert,340
3,MSFT,Sarah,124
4,FB,Maisse,243
5,FB,Michael,350


In [69]:
# Grouping alone is lazy: it returns a GroupBy object, not a computed frame.
df.groupby(by='Company')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fcd017116a0>

In [72]:
# Keep the GroupBy object so several aggregations can reuse it.
df_grouped = df.groupby(by='Company')

In [73]:
# Average Sales per company. numeric_only=True skips the string column 'Person';
# recent pandas releases no longer drop non-numeric columns silently and raise instead.
df_grouped.mean(numeric_only=True)

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
FB,296.5
GOOG,160.0
MSFT,232.0


In [74]:
# Standard deviation of Sales per company. numeric_only=True skips the string column
# 'Person', which recent pandas releases would otherwise fail on instead of dropping.
df_grouped.std(numeric_only=True)

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
FB,75.660426
GOOG,56.568542
MSFT,152.735065


In [76]:
# Per-company minimum of every column; string columns are included and compared alphabetically.
df_grouped.min(numeric_only=False)

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,Maisse,243
GOOG,George,120
MSFT,Robert,124


In [77]:
# Per-company maximum of every column; string columns are included and compared alphabetically.
df_grouped.max(numeric_only=False)

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,Michael,350
GOOG,John,200
MSFT,Sarah,340


In [78]:
# Number of non-missing entries per company, for each remaining column.
df_grouped.count()

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,2,2
GOOG,2,2
MSFT,2,2


In [82]:
# Summary statistics per company, transposed so that companies become the columns.
df_grouped.describe().T

Unnamed: 0,Company,FB,GOOG,MSFT
Sales,count,2.0,2.0,2.0
Sales,mean,296.5,160.0,232.0
Sales,std,75.660426,56.568542,152.735065
Sales,min,243.0,120.0,124.0
Sales,25%,269.75,140.0,178.0
Sales,50%,296.5,160.0,232.0
Sales,75%,323.25,180.0,286.0
Sales,max,350.0,200.0,340.0
