In [1]:
import pandas as pd
import numpy as np 
from numpy.random import randn

In [2]:
# Seed NumPy's global RNG so the randn-based frames below repeat across runs.
np.random.seed(101)

In [3]:
# 5x4 frame of randn draws: rows labelled A-E, columns labelled W-Z.
df = pd.DataFrame(
    data=randn(5, 4),
    index=list('ABCDE'),
    columns=list('WXYZ'),
)

In [4]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


## Selecting Series from DF 

In [5]:
df['W']  # single brackets with one label return that column as a Series

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [6]:
type(df['W'])

pandas.core.series.Series

In [7]:
# Index the 'W' Series by row label 'B'; df.loc['B', 'W'] reaches the same cell in one step.
df['W']['B']

0.6511179479432686

In [8]:
df[['W','X']]

Unnamed: 0,W,X
A,2.70685,0.628133
B,0.651118,-0.319318
C,-2.018168,0.740122
D,0.188695,-0.758872
E,0.190794,1.978757


# Part 1 

### Creating new Column 

In [9]:
df['NEW'] = df['W'] + df['X']

In [10]:
df

Unnamed: 0,W,X,Y,Z,NEW
A,2.70685,0.628133,0.907969,0.503826,3.334983
B,0.651118,-0.319318,-0.848077,0.605965,0.3318
C,-2.018168,0.740122,0.528813,-0.589001,-1.278046
D,0.188695,-0.758872,-0.933237,0.955057,-0.570177
E,0.190794,1.978757,2.605967,0.683509,2.169552


### Dropping the columns and Rows (use drop())

In [11]:
df.drop('NEW', axis=1)  # axis=1 targets columns (axis=0 would target the row index); returns a new frame

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [12]:
df # 'NEW' is still present: the drop above was not done in place, so df was not modified

Unnamed: 0,W,X,Y,Z,NEW
A,2.70685,0.628133,0.907969,0.503826,3.334983
B,0.651118,-0.319318,-0.848077,0.605965,0.3318
C,-2.018168,0.740122,0.528813,-0.589001,-1.278046
D,0.188695,-0.758872,-0.933237,0.955057,-0.570177
E,0.190794,1.978757,2.605967,0.683509,2.169552


In [13]:
# Make the deletion stick by rebinding df to the frame returned by drop().
# This has the same visible result as inplace=True, but avoids mutating the
# object in place (pandas recommends reassignment over inplace=True).
df = df.drop(columns='NEW')
df  # 'NEW' is now gone from df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [14]:
# dropping rows: axis=0 targets the index labels; the result is a new frame and df keeps row 'E'
df.drop('E', axis=0)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [15]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [16]:
# Remove row 'E' for good by rebinding df to the result (preferred over inplace=True).
df = df.drop(index='E')

In [18]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [19]:
df.shape 

(4, 4)

### Accessing column, rows and cells  (loc, iloc) 

In [20]:
# Rows (the index) are referred to as axis=0 and columns are referred to as axis=1

In [21]:
# selecting a single row by its label; the row comes back as a Series indexed by column name
df.loc['A']

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

In [22]:
# selecting multiple rows: pass a list of labels to get a DataFrame back
df.loc[['A', 'C']]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
C,-2.018168,0.740122,0.528813,-0.589001


In [23]:
# iloc looks rows up by integer position (0-based) rather than by label:
# positions 2 and 3 are rows C and D here
df.iloc[[2,3]]

Unnamed: 0,W,X,Y,Z
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [24]:
# you can get a cell by loc [x,y] notation - loc[row,column]
df.loc['B','Y']

-0.8480769834036315

In [25]:
df.loc[['A','B'], ['W', 'Y']]

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077


# Part 2 
This section covers conditional selection, further indexing and selection, and setting a column as the index.

In [27]:
# Fresh 5x4 frame of randn draws for Part 2 (rebinds df): rows A-E, columns W-Z.
df = pd.DataFrame(
    data=randn(5, 4),
    index=list('ABCDE'),
    columns=list('WXYZ'),
)

In [28]:
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [29]:
df > 0  # this gives the entire dataframe and returns boolean values 

Unnamed: 0,W,X,Y,Z
A,True,True,False,False
B,False,True,True,True
C,True,True,True,True
D,False,False,False,True
E,False,True,True,True


In [30]:
df['W']>0  # boolean Series (a mask): True for the rows whose W value is positive

A     True
B    False
C     True
D    False
E    False
Name: W, dtype: bool

In [31]:
df[df['W']>0]    # this returns just the rows on the df that has `W` column values > 0  which is A and C in this case. 

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
C,0.807706,0.07296,0.638787,0.329646


In [37]:
# Two-step version: first keep the rows where W > 0, then pick columns X and Y from that result.
cols = ['X','Y']
result = df[df['W']>0]
result[cols]

Unnamed: 0,X,Y
A,1.693723,-1.706086
C,0.07296,0.638787


In [41]:
# Same result as above in one step: .loc takes the row mask and the column list together,
# which avoids chained indexing (df[mask][cols]).
df.loc[df['W'] > 0, ['X', 'Y']]

Unnamed: 0,X,Y
A,1.693723,-1.706086
C,0.07296,0.638787


In [45]:
# multiple conditions: combine boolean masks with & (and) / | (or), not the Python
# keywords `and` / `or`; wrap each comparison in its own parentheses
df[(df['W']<0) & (df['Z']>1)]

Unnamed: 0,W,X,Y,Z
E,-0.116773,1.901755,0.238127,1.996652


In [46]:
# Adding a new column in data frame 
newcol = "CA NY WY OR CO".split()

In [47]:
df['States'] = newcol
df

Unnamed: 0,W,X,Y,Z,States
A,0.302665,1.693723,-1.706086,-1.159119,CA
B,-0.134841,0.390528,0.166905,0.184502,NY
C,0.807706,0.07296,0.638787,0.329646,WY
D,-0.497104,-0.75407,-0.943406,0.484752,OR
E,-0.116773,1.901755,0.238127,1.996652,CO


In [49]:
# Use the new 'States' column as the row index. set_index returns a new frame by default,
# so df itself keeps its original index unless the result is assigned back.
df.set_index('States')

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,0.302665,1.693723,-1.706086,-1.159119
NY,-0.134841,0.390528,0.166905,0.184502
WY,0.807706,0.07296,0.638787,0.329646
OR,-0.497104,-0.75407,-0.943406,0.484752
CO,-0.116773,1.901755,0.238127,1.996652


# Part 3 
Multilevel hierarchy

In [62]:
# Build a two-level row index by pairing each outer label with an inner number.
# NOTE(review): this pairing yields only three distinct (outer, inner) tuples, each
# occurring twice, so the index is non-unique. If three inner entries per group were
# intended, the outer labels would need to be "G1 G1 G1 G2 G2 G2" — confirm intent.
outside = "G1 G2 G3 G1 G2 G3".split()
inside = [1,2,3, 1,2,3]
hier_index = pd.MultiIndex.from_arrays([outside, inside])

In [63]:
hier_index

MultiIndex([('G1', 1),
            ('G2', 2),
            ('G3', 3),
            ('G1', 1),
            ('G2', 2),
            ('G3', 3)],
           )

In [64]:
# 6x2 frame of randn draws, indexed by the two-level hier_index, with columns A and B.
mdf = pd.DataFrame(
    data=randn(6, 2),
    index=hier_index,
    columns=['A', 'B'],
)

In [65]:
mdf

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.147027,-0.479448
G2,2,0.558769,1.02481
G3,3,-0.925874,1.862864
G1,1,-1.133817,0.610478
G2,2,0.38603,2.084019
G3,3,-0.376519,0.230336


In [73]:
# 'G1' occurs twice in the outer level, so .loc['G1'] returns two rows (both with inner label 1);
# .iloc[1] then picks the second of them by position.
mdf.loc['G1'].iloc[1]

A   -1.133817
B    0.610478
Name: 1, dtype: float64