# Pandas

## Series

### Part-1

In [1]:
import pandas as pd

In [2]:
import numpy as np

In [4]:
# Sample inputs for the Series constructor: the same three values held as a
# plain list, a NumPy array, and a dict keyed by label.
labels = list('abc')
my_data = [10, 20, 30]

arr = np.array(my_data)

d = dict(zip(labels, my_data))

In [5]:
labels

['a', 'b', 'c']

In [6]:
my_data

[10, 20, 30]

In [8]:
arr

array([10, 20, 30])

In [9]:
d

{'a': 10, 'b': 20, 'c': 30}

In [12]:
pd.Series(data = my_data)

0    10
1    20
2    30
dtype: int64

In [14]:
pd.Series(data=my_data,index=labels)

a    10
b    20
c    30
dtype: int64

In [15]:
pd.Series(my_data,labels)

a    10
b    20
c    30
dtype: int64

In [16]:
pd.Series(arr)

0    10
1    20
2    30
dtype: int64

In [17]:
pd.Series(arr,labels)

a    10
b    20
c    30
dtype: int64

In [18]:
pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [19]:
pd.Series(labels)

0    a
1    b
2    c
dtype: object

In [20]:
pd.Series(data=[sum,print,len])

0      <built-in function sum>
1    <built-in function print>
2      <built-in function len>
dtype: object

In [21]:
# Integer values labelled by country name.
ser1 = pd.Series(
    data=[1, 2, 3, 4, 5],
    index=['India', 'USA', 'Russia', 'UK', 'UAE'],
)

In [22]:
ser1

India     1
USA       2
Russia    3
UK        4
UAE       5
dtype: int64

In [23]:
# Second Series; only 'India' and 'USA' overlap with ser1's labels.
ser2 = pd.Series(
    data=[1, 3, 5, 4, 2],
    index=['India', 'USA', 'Italy', 'France', 'Germany'],
)

In [24]:
ser2

India      1
USA        3
Italy      5
France     4
Germany    2
dtype: int64

In [25]:
ser1['USA']

np.int64(2)

In [26]:
ser2['USA']

np.int64(3)

In [27]:
pd.Series(labels)

0    a
1    b
2    c
dtype: object

In [28]:
ser1 + ser2

France     NaN
Germany    NaN
India      2.0
Italy      NaN
Russia     NaN
UAE        NaN
UK         NaN
USA        5.0
dtype: float64

## DataFrames

In [29]:
import numpy as np
import pandas as pd
from numpy.random import randn

In [30]:
# Seed NumPy's global random state so the randn() draws used to build the
# frames below come out the same on every fresh run.
np.random.seed(101)

In [32]:
# 5x4 frame of randn() draws; rows are labelled A-E and columns W-Z.
data = pd.DataFrame(
    data=randn(5, 4),
    index=list("ABCDE"),
    columns=list("WXYZ"),
)

In [33]:
data

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [34]:
type(data)

pandas.core.frame.DataFrame

In [36]:
type(data["W"])

pandas.core.series.Series

In [37]:
data["W"]

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [38]:
data.W

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [39]:
data.X

A    0.628133
B   -0.319318
C    0.740122
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [40]:
data.Y

A    0.907969
B   -0.848077
C    0.528813
D   -0.933237
E    2.605967
Name: Y, dtype: float64

In [41]:
data.Z

A    0.503826
B    0.605965
C   -0.589001
D    0.955057
E    0.683509
Name: Z, dtype: float64

In [42]:
data[['X','Z']]

Unnamed: 0,X,Z
A,0.628133,0.503826
B,-0.319318,0.605965
C,0.740122,-0.589001
D,-0.758872,0.955057
E,1.978757,0.683509


In [43]:
# Derived column: row-wise sum of X and Z, stored on the frame as 'new'.
data['new'] = data.X + data.Z

In [44]:
data

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,1.131958
B,0.651118,-0.319318,-0.848077,0.605965,0.286647
C,-2.018168,0.740122,0.528813,-0.589001,0.151122
D,0.188695,-0.758872,-0.933237,0.955057,0.196184
E,0.190794,1.978757,2.605967,0.683509,2.662266


In [45]:
data.drop("new",axis=1)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [46]:
data

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,1.131958
B,0.651118,-0.319318,-0.848077,0.605965,0.286647
C,-2.018168,0.740122,0.528813,-0.589001,0.151122
D,0.188695,-0.758872,-0.933237,0.955057,0.196184
E,0.190794,1.978757,2.605967,0.683509,2.662266


In [47]:
# Remove the 'new' column for good. drop() returns a new frame and leaves the
# original untouched, so rebind the name rather than mutating with
# inplace=True (which blocks method chaining and is discouraged in pandas).
data = data.drop(columns="new")

In [48]:
data

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [49]:
data.drop('E',axis=0)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [50]:
data

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [51]:
data.shape

(5, 4)

In [54]:
data[['W','Z']]

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


In [55]:
data

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [56]:
data.loc['A']

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

In [57]:
data.loc['B']

W    0.651118
X   -0.319318
Y   -0.848077
Z    0.605965
Name: B, dtype: float64

In [59]:
data.iloc[2]

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [61]:
data.loc['B','Y']

np.float64(-0.8480769834036315)

In [62]:
data.loc[['A','B'],['W','X']]

Unnamed: 0,W,X
A,2.70685,0.628133
B,0.651118,-0.319318


### Part-2

In [63]:
data

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [64]:
# conditional selection

In [65]:
data > 0

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [66]:
# Element-wise mask with the same shape as data: True where the value is > 0.
bool_data = data.gt(0)

In [67]:
bool_data

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [68]:
data[bool_data]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [70]:
data[data>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [71]:
data

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [72]:
data['W'] > 0

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [74]:
data['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [76]:
data[data['W']>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [77]:
data

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [79]:
data['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [80]:
type(data['W'])

pandas.core.series.Series

In [83]:
data[data["W"]>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [84]:
data[data['W']>0][['Y','X']]

Unnamed: 0,Y,X
A,0.907969,0.628133
B,-0.848077,-0.319318
D,-0.933237,-0.758872
E,2.605967,1.978757


In [85]:
# Step-by-step selection: choose the columns of interest, build the row mask
# from column W, filter the rows, then display only the chosen columns.
my_cols = ['Y', 'X']
bool_series = data['W'] > 0
result = data.loc[bool_series]
result[my_cols]

Unnamed: 0,Y,X
A,0.907969,0.628133
B,-0.848077,-0.319318
D,-0.933237,-0.758872
E,2.605967,1.978757


In [88]:
data[(data['W']>0) & (data['Y']>1)]

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


In [89]:
data[(data['W']>0) | (data['Y']>1)]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [91]:
data[(data['W']>0) & (data['Y']>1)]

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


In [92]:
data

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [95]:
data.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [96]:
data

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [97]:
# One state abbreviation per row of data.
new_ind = ['CA', 'NY', 'WY', 'OR', 'CO']

In [98]:
new_ind

['CA', 'NY', 'WY', 'OR', 'CO']

In [99]:
# Store the state abbreviations as an ordinary column named "States"; the
# set_index() call further down uses this column as its key.
data["States"] = new_ind

In [100]:
data

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,NY
C,-2.018168,0.740122,0.528813,-0.589001,WY
D,0.188695,-0.758872,-0.933237,0.955057,OR
E,0.190794,1.978757,2.605967,0.683509,CO


In [103]:
data.set_index(keys='States')

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2.70685,0.628133,0.907969,0.503826
NY,0.651118,-0.319318,-0.848077,0.605965
WY,-2.018168,0.740122,0.528813,-0.589001
OR,0.188695,-0.758872,-0.933237,0.955057
CO,0.190794,1.978757,2.605967,0.683509


In [104]:
data

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,NY
C,-2.018168,0.740122,0.528813,-0.589001,WY
D,0.188695,-0.758872,-0.933237,0.955057,OR
E,0.190794,1.978757,2.605967,0.683509,CO


### Part-3

In [123]:
# Index levels
# Build a two-level row index: an outer group label paired with an inner
# number. The inner labels are strings ('1', '2', '3'), not integers.

outside = ['G1'] * 3 + ['G2'] * 3
inside = ['1', '2', '3'] * 2
hier_index = pd.MultiIndex.from_tuples(
    [(group, num) for group, num in zip(outside, inside)]
)

In [124]:
outside

['G1', 'G1', 'G1', 'G2', 'G2', 'G2']

In [125]:
inside

['1', '2', '3', '1', '2', '3']

In [126]:
hier_index

MultiIndex([('G1', '1'),
            ('G1', '2'),
            ('G1', '3'),
            ('G2', '1'),
            ('G2', '2'),
            ('G2', '3')],
           )

In [127]:
list(zip(outside,inside))

[('G1', '1'), ('G1', '2'), ('G1', '3'), ('G2', '1'), ('G2', '2'), ('G2', '3')]

In [128]:
# 6x2 frame of randn() draws, indexed by the two-level hier_index, with
# columns A and B.
df = pd.DataFrame(
    data=randn(6, 2),
    index=hier_index,
    columns=['A', 'B'],
)

In [129]:
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [132]:
df.loc['G1']

Unnamed: 0,A,B
1,0.302665,1.693723
2,-1.706086,-1.159119
3,-0.134841,0.390528


In [134]:
df.loc['G1'].iloc[1]

A   -1.706086
B   -1.159119
Name: 2, dtype: float64

In [135]:
# Name the two index levels (outer: 'Groups', inner: 'Num') so that later
# lookups can refer to a level by name, e.g. level="Num".
df.index.names = ['Groups','Num']

In [136]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [137]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [138]:
df.loc["G2"]

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.166905,0.184502
2,0.807706,0.07296
3,0.638787,0.329646


In [139]:
df.loc["G2"].iloc[2]

A    0.638787
B    0.329646
Name: 3, dtype: float64

In [140]:
# Cross-sections: use .xs() to pull rows out of the multi-level index by label

In [142]:
df.xs("G1")

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.302665,1.693723
2,-1.706086,-1.159119
3,-0.134841,0.390528


In [143]:
df.xs("G2")

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.166905,0.184502
2,0.807706,0.07296
3,0.638787,0.329646


In [148]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [149]:
# Cross-section on the inner level: take the Num == '1' row from every group.
# The inner labels were produced by str.split(), so they are the strings
# '1', '2', '3'; looking up the integer 1 raises KeyError: 1.
df.xs("1", level="Num")