In [None]:
# Name: Chaithra Kopparam Cheluvaiah
# Nov 27,2021

# PANDAS DATAFRAME - PART 2

In [None]:
import numpy as np
import pandas as pd
from numpy.random import randn

In [None]:
# fix NumPy's global random seed so the randn() sample data below is the same on every run
np.random.seed(101)

In [None]:
# sample data: a 5x4 frame of random normal values, rows labelled A-E and columns W-Z
df = pd.DataFrame(
    randn(5, 4),
    index=list('ABCDE'),
    columns=list('WXYZ'),
)
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


### CONDITIONAL SELECTION USING BRACKETS

In [None]:
# the comparison with the scalar 0 is applied to every element,
# giving a DataFrame of booleans with the same index and columns as df
booldf = df>0
booldf

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [None]:
# indexing with a boolean DataFrame keeps the values where the mask is True and shows NaN elsewhere
df[booldf]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [None]:
# comparing the whole DataFrame (df > 0) is the less common form;
# usually a condition on a single column is used to filter rows (see the next cells)
# the result keeps the original shape, with NaN wherever the condition is not satisfied
df[df>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [None]:
# comparing a single column returns a boolean Series: one True/False per row
df['W']>0

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [None]:
# filtering the data frame: a boolean Series inside the brackets keeps only the rows where it is True
# row 'C' is missing from the result because its W value is not > 0
df[df['W']>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [None]:
# keep only the rows whose Z value is negative (just row 'C' in this sample)
df[df['Z']<0]

Unnamed: 0,W,X,Y,Z
C,-2.018168,0.740122,0.528813,-0.589001


In [None]:
# conditional selection returns a DataFrame, so a column lookup can be stacked directly on the result
# (fine for reading values; for assignment prefer df.loc[mask, 'X'] to avoid chained indexing)
df[df['W']>0]['X']

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [None]:
# select the rows (W > 0) and the columns (X, Y) in a single .loc step
# chaining df[mask][cols] gives the same result but is not cheaper on memory:
# the filtered DataFrame is still built first, it is just never given a name
df.loc[df['W']>0, ['X','Y']]

Unnamed: 0,X,Y
A,0.628133,0.907969
B,-0.319318,-0.848077
D,-0.758872,-0.933237
E,1.978757,2.605967


### MULTIPLE CONDITIONS

In [None]:
# python's 'and' needs a single True/False from each operand, but each comparison
# here produces a whole boolean Series, so pandas raises ValueError
# the error is caught and printed so that "Restart & Run All" is not stopped by this cell
try:
    df[(df['W']>0) and (df['Y']>1)]
except ValueError as err:
    print("ValueError:", err)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
# python's 'and' operator cannot compare one series of boolean values against another series element by element
# it can only evaluate a single boolean on each side at a time
# so 'and' fails with "truth value of a Series is ambiguous" when it is used with pandas series

In [None]:
# with two plain booleans, python's 'and' works as expected
True and False

False

In [None]:
# 'and' evaluates to True only when both operands are True
True and True

True

In [None]:
# use the element-wise operators & (and) and | (or) to combine boolean Series
# each condition is wrapped in parentheses because & and | bind more tightly than > and <
df[(df['W']>0) & (df['Y']>1)] 

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


In [None]:
# | keeps a row when at least one of the two conditions is True
df[(df['W']>0) | (df['Y']>1)] 

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


### RESETTING INDEX - Reset the index of the DataFrame, and use the default one instead

In [None]:
# returns a new DataFrame with a default 0..n-1 index; the old labels (A-E) are kept as an 'index' column
# inplace defaults to False, so df itself is not modified
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [None]:
# state abbreviations that will be used as alternative row labels
newind = ['CA', 'NY', 'WY', 'OR', 'CO']
newind

['CA', 'NY', 'WY', 'OR', 'CO']

In [None]:
# add a new column from a python list (one value per row, in row order)
# note: this assignment modifies df itself
df['States'] = newind
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,NY
C,-2.018168,0.740122,0.528813,-0.589001,WY
D,0.188695,-0.758872,-0.933237,0.955057,OR
E,0.190794,1.978757,2.605967,0.683509,CO


In [None]:
# returns a new DataFrame that uses the 'States' column as its index, replacing the A-E labels
# inplace defaults to False, so df itself is not modified
df.set_index('States')

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2.70685,0.628133,0.907969,0.503826
NY,0.651118,-0.319318,-0.848077,0.605965
WY,-2.018168,0.740122,0.528813,-0.589001
OR,0.188695,-0.758872,-0.933237,0.955057
CO,0.190794,1.978757,2.605967,0.683509


In [None]:
# df is unchanged by set_index(): still indexed by A-E, with 'States' as an ordinary column
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,NY
C,-2.018168,0.740122,0.528813,-0.589001,WY
D,0.188695,-0.758872,-0.933237,0.955057,OR
E,0.190794,1.978757,2.605967,0.683509,CO


### QUICK REVIEW:
1. conditional selection (similar to numpy) 
2. Python's `and` and `or` operators cannot be used on Series; use `&` and `|` instead (with each condition in parentheses)
3. `reset_index()`
4. `set_index()`- already existing column can be set to index