## Series in pandas

In [1]:
import numpy as np

In [2]:
import pandas as pd 

In [3]:
labels = ['a','b','c']
my_data = [10,20,30]
arr = np.array(my_data)
d = {'a':10, 'b':20,'c':30,'d':40}

In [4]:
pd.Series(arr,labels)

a    10
b    20
c    30
dtype: int32

In [5]:
pd.Series(data=labels)

0    a
1    b
2    c
dtype: object

In [6]:
ser1 = pd.Series([1,2,3,4],['USA','China','India','Itely'])

In [7]:
ser1

USA      1
China    2
India    3
Itely    4
dtype: int64

In [8]:
ser2 = pd.Series([1,2,5,6],['China','India','USA','brazil'])

In [9]:
ser2

China     1
India     2
USA       5
brazil    6
dtype: int64

In [10]:
ser1['India']

3

In [11]:
ser1 + ser2

China     3.0
India     5.0
Itely     NaN
USA       6.0
brazil    NaN
dtype: float64

## Pandas - DataFrames

In [15]:
from numpy.random import randn

In [16]:
np.random.seed(101)

In [23]:
df = pd.DataFrame(randn(5,4),['a','b','c','d','e'],['w','x','y','z'])

In [24]:
df

Unnamed: 0,w,x,y,z
a,0.38603,2.084019,-0.376519,0.230336
b,0.681209,1.035125,-0.03116,1.939932
c,-1.005187,-0.74179,0.187125,-0.732845
d,-1.38292,1.482495,0.961458,-2.141212
e,0.992573,1.192241,-1.04678,1.292765


In [29]:
df['x']

a    2.084019
b    1.035125
c   -0.741790
d    1.482495
e    1.192241
Name: x, dtype: float64

In [30]:
type(df['x'])

pandas.core.series.Series

In [33]:
type(df)

pandas.core.frame.DataFrame

In [35]:
df[['x','w']]

Unnamed: 0,x,w
a,2.084019,0.38603
b,1.035125,0.681209
c,-0.74179,-1.005187
d,1.482495,-1.38292
e,1.192241,0.992573


In [37]:
df['new'] = df['w'] + df['x']

In [38]:
df['new']

a    2.470049
b    1.716334
c   -1.746977
d    0.099575
e    2.184814
Name: new, dtype: float64

In [39]:
df

Unnamed: 0,w,x,y,z,new
a,0.38603,2.084019,-0.376519,0.230336,2.470049
b,0.681209,1.035125,-0.03116,1.939932,1.716334
c,-1.005187,-0.74179,0.187125,-0.732845,-1.746977
d,-1.38292,1.482495,0.961458,-2.141212,0.099575
e,0.992573,1.192241,-1.04678,1.292765,2.184814


In [44]:
df.drop('new',axis=1,inplace=True)

In [45]:
df

Unnamed: 0,w,x,y,z
a,0.38603,2.084019,-0.376519,0.230336
b,0.681209,1.035125,-0.03116,1.939932
c,-1.005187,-0.74179,0.187125,-0.732845
d,-1.38292,1.482495,0.961458,-2.141212
e,0.992573,1.192241,-1.04678,1.292765


In [49]:
df.drop('e',inplace=True)

In [50]:
df

Unnamed: 0,w,x,y,z
a,0.38603,2.084019,-0.376519,0.230336
b,0.681209,1.035125,-0.03116,1.939932
c,-1.005187,-0.74179,0.187125,-0.732845
d,-1.38292,1.482495,0.961458,-2.141212


In [51]:
df.shape

(4, 4)

In [52]:
df[['x','z']]

Unnamed: 0,x,z
a,2.084019,0.230336
b,1.035125,1.939932
c,-0.74179,-0.732845
d,1.482495,-2.141212


In [53]:
## Rows

In [54]:
df

Unnamed: 0,w,x,y,z
a,0.38603,2.084019,-0.376519,0.230336
b,0.681209,1.035125,-0.03116,1.939932
c,-1.005187,-0.74179,0.187125,-0.732845
d,-1.38292,1.482495,0.961458,-2.141212


In [57]:
df.loc['a']

w    0.386030
x    2.084019
y   -0.376519
z    0.230336
Name: a, dtype: float64

In [60]:
df.iloc[2]

w   -1.005187
x   -0.741790
y    0.187125
z   -0.732845
Name: c, dtype: float64

In [61]:
df.loc['b','y']

-0.031160481493099617

In [63]:
df.loc[['b','c'],['x','z']]

Unnamed: 0,x,z
b,1.035125,1.939932
c,-0.74179,-0.732845


In [62]:
df

Unnamed: 0,w,x,y,z
a,0.38603,2.084019,-0.376519,0.230336
b,0.681209,1.035125,-0.03116,1.939932
c,-1.005187,-0.74179,0.187125,-0.732845
d,-1.38292,1.482495,0.961458,-2.141212


## conditional selection - Pandas

In [65]:
booldf = df > 0

In [66]:
booldf

Unnamed: 0,w,x,y,z
a,True,True,False,True
b,True,True,False,True
c,False,False,True,False
d,False,True,True,False


In [67]:
df[booldf]

Unnamed: 0,w,x,y,z
a,0.38603,2.084019,,0.230336
b,0.681209,1.035125,,1.939932
c,,,0.187125,
d,,1.482495,0.961458,


In [69]:
df[df>0]

Unnamed: 0,w,x,y,z
a,0.38603,2.084019,,0.230336
b,0.681209,1.035125,,1.939932
c,,,0.187125,
d,,1.482495,0.961458,


In [70]:
df['w'] >0

a     True
b     True
c    False
d    False
Name: w, dtype: bool

In [71]:
df[df['w']>0]

Unnamed: 0,w,x,y,z
a,0.38603,2.084019,-0.376519,0.230336
b,0.681209,1.035125,-0.03116,1.939932


In [74]:
df[df['z']<0]

Unnamed: 0,w,x,y,z
c,-1.005187,-0.74179,0.187125,-0.732845
d,-1.38292,1.482495,0.961458,-2.141212


In [75]:
df[df['w']>0]['x']

a    2.084019
b    1.035125
Name: x, dtype: float64

In [78]:
df[(df['w']>0) | (df['y']>1)]

Unnamed: 0,w,x,y,z
a,0.38603,2.084019,-0.376519,0.230336
b,0.681209,1.035125,-0.03116,1.939932


In [79]:
df

Unnamed: 0,w,x,y,z
a,0.38603,2.084019,-0.376519,0.230336
b,0.681209,1.035125,-0.03116,1.939932
c,-1.005187,-0.74179,0.187125,-0.732845
d,-1.38292,1.482495,0.961458,-2.141212


In [81]:
## Reset index

In [85]:
df.reset_index(inplace=True)

In [86]:
df

Unnamed: 0,level_0,index,w,x,y,z
0,0,a,0.38603,2.084019,-0.376519,0.230336
1,1,b,0.681209,1.035125,-0.03116,1.939932
2,2,c,-1.005187,-0.74179,0.187125,-0.732845
3,3,d,-1.38292,1.482495,0.961458,-2.141212


In [89]:
newind = 'CA NY OR CO'.split()

In [90]:
newind

['CA', 'NY', 'OR', 'CO']

In [91]:
df['state'] = newind

In [92]:
df

Unnamed: 0,level_0,index,w,x,y,z,state
0,0,a,0.38603,2.084019,-0.376519,0.230336,CA
1,1,b,0.681209,1.035125,-0.03116,1.939932,NY
2,2,c,-1.005187,-0.74179,0.187125,-0.732845,OR
3,3,d,-1.38292,1.482495,0.961458,-2.141212,CO


In [93]:
df.set_index('state')

Unnamed: 0_level_0,level_0,index,w,x,y,z
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CA,0,a,0.38603,2.084019,-0.376519,0.230336
NY,1,b,0.681209,1.035125,-0.03116,1.939932
OR,2,c,-1.005187,-0.74179,0.187125,-0.732845
CO,3,d,-1.38292,1.482495,0.961458,-2.141212


## MultiIndex - Pandas

In [95]:
#index levels
outside = ['G1','G1','G1','G2','G2','G3']
inside = [1,2,3,1,2,3]
hier_index = list(zip(outside, inside))
hier_index = pd.MultiIndex.from_tuples(hier_index)

In [98]:
hier_index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G3', 3)],
           )

In [103]:
df = pd.DataFrame(randn(6,2),hier_index,['A','B'])

In [104]:
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.04146,-0.411055
G1,2,-0.771329,0.110477
G1,3,-0.804652,0.253548
G2,1,0.649148,0.358941
G2,2,-1.080471,0.902398
G3,3,0.161781,0.833029


In [111]:
df.index.names = ['Groups','Num']

In [112]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.04146,-0.411055
G1,2,-0.771329,0.110477
G1,3,-0.804652,0.253548
G2,1,0.649148,0.358941
G2,2,-1.080471,0.902398
G3,3,0.161781,0.833029


In [120]:
df.loc['G2'].loc[2]['B'] 

0.902397757200862

In [127]:
df.xs(1,level='Num')     ## xs- cross section- for finding indexes. Better than 'loc'

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,0.04146,-0.411055
G2,0.649148,0.358941
