## Introduction to Pandas - DataFrame

In [1]:
import pandas as pd
import numpy as np

In [2]:
# A Series can carry a label, exposed through its `name` attribute
s = pd.Series(data=np.random.randn(5), name='My Attr')
print(s)

0    0.060830
1    0.797448
2   -0.895407
3    0.149045
4   -1.090624
Name: My Attr, dtype: float64


In [3]:
s.name

'My Attr'

### DataFrame

In [4]:
# Build a DataFrame from a dict of Series. The two Series have different
# indexes, so the result uses the union of the labels and fills the
# missing cell (row 'd', column 'one') with NaN.
d = {
    'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
    'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd']),
}
df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [5]:
df['one']

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [6]:
# Attribute-style column access; gives the same Series as df['one']
df.one

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [11]:
# `in` on a DataFrame tests the column labels ('one', 'two'), not the cell
# values, so this is False even though 4.0 appears in column 'two'
4.0 in df

False

In [12]:
# Passing `index` keeps only the listed row labels, in the order given
pd.DataFrame(d, index=['d', 'b', 'a'])

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [13]:
# Intentional error: .iloc is position-based and rejects the label 'd' with a
# TypeError. Use df.loc['d'] for label lookup, or an integer position with .iloc.
df.iloc['d']

TypeError: Cannot index by location index with a non-integer key

In [14]:
# Intentional error: `index` must be a collection of labels; a bare string
# raises TypeError. Wrap it in a list (index=['d']) to request a single row.
pd.DataFrame(d, index="d")

TypeError: Index(...) must be called with a collection of some kind, 'd' was passed

### How to get a row by its index label in this DataFrame?

In [17]:
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [16]:
df.loc['d']

one    NaN
two    4.0
Name: d, dtype: float64

In [18]:
# A Series whose integer labels do not match the row positions, so that
# label-based (.loc) and position-based (.iloc) slicing give different results
s = pd.Series(np.nan, index=[49,48,47,46,45, 1, 2, 3, 4, 5])
s

49   NaN
48   NaN
47   NaN
46   NaN
45   NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
dtype: float64

In [19]:
s.iloc[:3]

49   NaN
48   NaN
47   NaN
dtype: float64

In [22]:
# Difference between .loc and .iloc:
#   .loc[:3]  slices by LABEL and includes the end label, so it returns
#             every row up to and including the one labelled 3 (8 rows here).
#   .iloc[:3] slices by POSITION and excludes the end, so it returns only
#             the first three rows.
s.loc[:3]

49   NaN
48   NaN
47   NaN
46   NaN
45   NaN
1    NaN
2    NaN
3    NaN
dtype: float64

In [23]:
# A single row selected with .iloc is a Series; its `name` is that row's index label
df.iloc[1].name

'b'

In [25]:
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [26]:
# row names
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [27]:
# column names
df.columns

Index(['one', 'two'], dtype='object')

In [30]:
# Build a DataFrame from a dict of equal-length lists. The inputs are floats,
# but the explicit dtype stores them as int64; the row index defaults to 0..3.
d = {
    'one': [1., 2., 3., 4.],
    'two': [4., 3., 2., 1.],
}
df = pd.DataFrame(data=d, dtype='int64')
df

Unnamed: 0,one,two
0,1,4
1,2,3
2,3,2
3,4,1


In [34]:
# Build a DataFrame from a NumPy structured array.
# Field codes: 'i4' = 32-bit int, 'f4' = 32-bit float, 'S10' = byte string of
# up to 10 bytes. 'S10' replaces the older 'a10' spelling, which NumPy 2.0
# deprecates; the resulting field type is the same.
data = np.zeros((2,), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'S10')])
print("data is " + str(data))  # both records start out zero-filled / empty
# Assign both records at once; the str values are stored as bytes in field 'C'
data[:] = [(1, 2., 'Hello'), (2, 3., "World")]
pd.DataFrame(data)

data is [(0, 0., b'') (0, 0., b'')]


Unnamed: 0,A,B,C
0,1,2.0,b'Hello'
1,2,3.0,b'World'


In [35]:
df

Unnamed: 0,one,two
0,1,4
1,2,3
2,3,2
3,4,1


In [38]:
df['three'] = df['one'] * df['two'] # element-wise multiplication
print(df) 

   one  two  three
0    1    4      4
1    2    3      6
2    3    2      6
3    4    1      4


In [39]:
df['flag'] = df['one'] > 2
df

Unnamed: 0,one,two,three,flag
0,1,4,4,False
1,2,3,6,False
2,3,2,6,True
3,4,1,4,True


In [40]:
del df['two']
df

Unnamed: 0,one,three,flag
0,1,4,False
1,2,6,False
2,3,6,True
3,4,4,True


In [41]:
# pop removes column 'three' from df and returns it as a Series
three = df.pop('three')
df

Unnamed: 0,one,flag
0,1,False
1,2,False
2,3,True
3,4,True


In [42]:
three

0    4
1    6
2    6
3    4
Name: three, dtype: int64

In [43]:
# Assigning a scalar fills the new column with that value in every row
df['foo'] = 'bar'
df

Unnamed: 0,one,flag,foo
0,1,False,bar
1,2,False,bar
2,3,True,bar
3,4,True,bar


In [44]:
# insert a copy of column 'one' as a new column named 'bar' at position 1
# (directly after 'one'); insert modifies df in place
df.insert(1, 'bar', df['one'])
df

Unnamed: 0,one,bar,flag,foo
0,1,1,False,bar
1,2,2,False,bar
2,3,3,True,bar
3,4,4,True,bar


In [45]:
# select row at index 2
df.loc[2]

one        3
bar        3
flag    True
foo      bar
Name: 2, dtype: object

Data Alignment and Arithmetic

In [46]:
df = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,-0.133504,0.252096,-0.170884,-0.622795
1,0.98015,-1.677947,-1.18976,0.248337
2,-0.607983,-0.719347,-0.000536,-0.528656
3,-1.564018,0.563505,1.557547,-0.826306
4,1.08741,1.992611,-2.542912,-0.352474
5,-0.702935,0.129518,1.31323,-0.523482
6,0.219891,-0.379236,0.873371,-0.677469
7,0.282576,1.588518,-1.353623,0.623556
8,-0.972584,-2.786954,1.02399,-2.546894
9,1.450001,1.265896,0.673572,0.575456


In [47]:
df2 = pd.DataFrame(np.random.randn(7, 3), columns=['A', 'B', 'C'])
df2

Unnamed: 0,A,B,C
0,1.312765,1.210509,0.432165
1,0.947667,1.378796,-0.869231
2,-1.444504,-0.25555,-1.986846
3,-0.678943,0.099087,2.387064
4,0.98439,-0.944786,0.398525
5,1.64378,0.154014,-0.01962
6,-0.782221,-1.185442,0.137586


In [48]:
# Addition aligns on both row and column labels. Labels present in only one
# frame (rows 7-9 and column 'D' exist only in df) produce NaN in the result.
df + df2

Unnamed: 0,A,B,C,D
0,1.179261,1.462606,0.261281,
1,1.927816,-0.299151,-2.058991,
2,-2.052487,-0.974897,-1.987383,
3,-2.24296,0.662592,3.944611,
4,2.071801,1.047825,-2.144387,
5,0.940846,0.283532,1.29361,
6,-0.562329,-1.564679,1.010957,
7,,,,
8,,,,
9,,,,


In [50]:
df

Unnamed: 0,A,B,C,D
0,-0.133504,0.252096,-0.170884,-0.622795
1,0.98015,-1.677947,-1.18976,0.248337
2,-0.607983,-0.719347,-0.000536,-0.528656
3,-1.564018,0.563505,1.557547,-0.826306
4,1.08741,1.992611,-2.542912,-0.352474
5,-0.702935,0.129518,1.31323,-0.523482
6,0.219891,-0.379236,0.873371,-0.677469
7,0.282576,1.588518,-1.353623,0.623556
8,-0.972584,-2.786954,1.02399,-2.546894
9,1.450001,1.265896,0.673572,0.575456


In [49]:
# Subtracting a single row (a Series indexed by column name) broadcasts it
# over every row, matching on column labels; row 0 therefore becomes all zeros
df - df.iloc[0]

Unnamed: 0,A,B,C,D
0,0.0,0.0,0.0,0.0
1,1.113654,-1.930043,-1.018876,0.871133
2,-0.474479,-0.971443,0.170347,0.09414
3,-1.430514,0.311408,1.72843,-0.20351
4,1.220915,1.740515,-2.372029,0.270321
5,-0.569431,-0.122578,1.484113,0.099313
6,0.353395,-0.631333,1.044254,-0.054673
7,0.41608,1.336422,-1.182739,1.246352
8,-0.83908,-3.03905,1.194873,-1.924098
9,1.583505,1.0138,0.844455,1.198251


In [52]:
# Use a DatetimeIndex of 8 consecutive days as the row labels.
# The start date is written in ISO 8601 form (YYYY-MM-DD). A day-first string
# such as '21/10/2020' is ambiguous: pandas has to guess the field order and
# may emit a parsing warning, whereas the ISO form is parsed unambiguously.
index = pd.date_range('2020-10-21', periods=8)
df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=list('ABC'))
df

Unnamed: 0,A,B,C
2020-10-21,-0.60116,0.829648,-1.168094
2020-10-22,1.070526,-0.920863,2.055315
2020-10-23,1.078916,0.317161,-1.698528
2020-10-24,-1.648217,-2.512515,-0.858417
2020-10-25,-0.551385,-0.584629,0.851391
2020-10-26,0.367123,2.345758,0.800178
2020-10-27,-1.131586,0.980926,0.825412
2020-10-28,-0.921526,-0.299357,-0.728633


In [53]:
type(df['A'])

pandas.core.series.Series

In [54]:
# Subtract column 'A' from every column, matching on the row index (axis=0).
# This used to be written as df - df['A'], which is deprecated now.
df.sub(df['A'], axis=0)

Unnamed: 0,A,B,C
2020-10-21,0.0,1.430808,-0.566934
2020-10-22,0.0,-1.991389,0.984789
2020-10-23,0.0,-0.761755,-2.777444
2020-10-24,0.0,-0.864298,0.7898
2020-10-25,0.0,-0.033244,1.402776
2020-10-26,0.0,1.978635,0.433055
2020-10-27,0.0,2.112512,1.956998
2020-10-28,0.0,0.622168,0.192893


In [56]:
df * 5 + 2

Unnamed: 0,A,B,C
2020-10-21,-1.0058,6.148238,-3.840472
2020-10-22,7.352628,-2.604316,12.276573
2020-10-23,7.394581,3.585805,-6.492638
2020-10-24,-6.241085,-10.562574,-2.292086
2020-10-25,-0.756925,-0.923147,6.256956
2020-10-26,3.835615,13.728789,6.000891
2020-10-27,-3.657929,6.904628,6.127059
2020-10-28,-2.607629,0.503214,-1.643164


In [57]:
1 / df

Unnamed: 0,A,B,C
2020-10-21,-1.663451,1.205331,-0.856095
2020-10-22,0.934121,-1.085938,0.486544
2020-10-23,0.926856,3.152974,-0.588745
2020-10-24,-0.606716,-0.398008,-1.164935
2020-10-25,-1.813615,-1.710486,1.174548
2020-10-26,2.723883,0.426301,1.249722
2020-10-27,-0.883716,1.019445,1.211516
2020-10-28,-1.085157,-3.34049,-1.372433


Boolean Operators

In [58]:
df1 = pd.DataFrame({'a' : [1, 0, 1], 'b' : [0, 1, 1] }, dtype=bool)
df1

Unnamed: 0,a,b
0,True,False
1,False,True
2,True,True


In [59]:
df2 = pd.DataFrame({'a' : [0, 1, 1], 'b' : [1, 1, 0] }, dtype=bool)
df2

Unnamed: 0,a,b
0,False,True
1,True,True
2,True,False


In [60]:
df1 & df2

Unnamed: 0,a,b
0,False,False
1,False,True
2,True,False


In [61]:
df1 ^ df2

Unnamed: 0,a,b
0,True,True
1,True,False
2,False,True


In [62]:
# Logical NOT of a boolean frame: `~` is the element-wise inversion operator.
# Unary minus is arithmetic negation (NumPy rejects it for boolean arrays),
# so `~` is the idiomatic way to flip True/False.
~df1

Unnamed: 0,a,b
0,False,True
1,True,False
2,False,False


Transposing

In [63]:
df[:5]

Unnamed: 0,A,B,C
2020-10-21,-0.60116,0.829648,-1.168094
2020-10-22,1.070526,-0.920863,2.055315
2020-10-23,1.078916,0.317161,-1.698528
2020-10-24,-1.648217,-2.512515,-0.858417
2020-10-25,-0.551385,-0.584629,0.851391


In [64]:
# transpose: rows become columns and columns become rows
df[:5].T

Unnamed: 0,2020-10-21,2020-10-22,2020-10-23,2020-10-24,2020-10-25
A,-0.60116,1.070526,1.078916,-1.648217,-0.551385
B,0.829648,-0.920863,0.317161,-2.512515,-0.584629
C,-1.168094,2.055315,-1.698528,-0.858417,0.851391


Interoperability with Numpy Functions

In [66]:
np.asarray(df)

array([[-0.60115992,  0.82964759, -1.16809434],
       [ 1.07052566, -0.92086312,  2.05531463],
       [ 1.07891619,  0.31716092, -1.69852767],
       [-1.64821697, -2.51251476, -0.85841723],
       [-0.55138496, -0.5846293 ,  0.85139114],
       [ 0.36712303,  2.34575771,  0.80017813],
       [-1.13158589,  0.98092565,  0.82541184],
       [-0.92152572, -0.29935727, -0.72863282]])

Matrix Multiplication

In [67]:
df.T.dot(df)

Unnamed: 0,A,B,C
A,7.956599,3.348202,2.046516
B,3.348202,14.845826,1.163374
C,2.046516,1.163374,11.788001


Boolean Indexing (selecting rows with a boolean column)

In [69]:
df = pd.DataFrame({'BoolCol': [True, False, False, True, True]}, index=[10,20,30,40,50])
print(df)

    BoolCol
10     True
20    False
30    False
40     True
50     True


In [71]:
print(df[df['BoolCol']])

    BoolCol
10     True
40     True
50     True


In [72]:
# Index labels of the rows where 'BoolCol' is True, as a plain Python list
df.index[df['BoolCol']].tolist()

[10, 40, 50]