### Pandas DataFrames

In [1]:
import numpy as np 
import pandas as pd
from numpy.random import randn

In [2]:
# Build the demo frame: 5 rows (labelled 1-5) by 7 columns (labelled T-Z),
# filled with draws from randn. No seed is set, so values differ on every run.
df = pd.DataFrame(
    data=randn(5, 7),
    index=[1, 2, 3, 4, 5],
    columns=list('TUVWXYZ'),
)
df

Unnamed: 0,T,U,V,W,X,Y,Z
1,0.89095,0.29023,0.392372,-0.521326,1.180511,-0.507833,1.000442
2,1.735856,-0.372495,-0.332359,0.052514,1.53956,-1.473029,0.512366
3,1.104707,0.45178,0.328972,-0.813794,-0.910293,1.48837,1.435349
4,1.834926,-1.930662,0.184121,-0.131765,-0.816331,-0.948155,0.933952
5,-1.290103,1.462288,0.065532,-1.375244,-1.58523,0.047519,-0.027028


In [3]:
# Ex 1:- Selecting a single column by its label returns a pandas Series

df['Z']

1    1.000442
2    0.512366
3    1.435349
4    0.933952
5   -0.027028
Name: Z, dtype: float64

In [4]:
# Ex 2:- Passing a list of labels selects several columns at once (the result is a DataFrame)

df[['T', 'V', 'Z']]

Unnamed: 0,T,V,Z
1,0.89095,0.392372,1.000442
2,1.735856,-0.332359,0.512366
3,1.104707,0.328972,1.435349
4,1.834926,0.184121,0.933952
5,-1.290103,0.065532,-0.027028


In [5]:
# Ex 2ii:- Getting columns V, Y, Z via a list of labels stored in a variable
cols = ['V', 'Y', 'Z']
df[cols]

Unnamed: 0,V,Y,Z
1,0.392372,-0.507833,1.000442
2,-0.332359,-1.473029,0.512366
3,0.328972,1.48837,1.435349
4,0.184121,-0.948155,0.933952
5,0.065532,0.047519,-0.027028


In [6]:
# Ex 3:- Adding another column to the DataFrame...
# Assigning to a label that does not exist yet creates the column;
# here 'ext' is the row-wise sum of columns V and X.

df['ext'] = df['V'] + df['X']
df['ext']

1    1.572884
2    1.207201
3   -0.581320
4   -0.632210
5   -1.519698
Name: ext, dtype: float64

In [7]:
# Display the whole frame: 'ext' now appears as the last column
df

Unnamed: 0,T,U,V,W,X,Y,Z,ext
1,0.89095,0.29023,0.392372,-0.521326,1.180511,-0.507833,1.000442,1.572884
2,1.735856,-0.372495,-0.332359,0.052514,1.53956,-1.473029,0.512366,1.207201
3,1.104707,0.45178,0.328972,-0.813794,-0.910293,1.48837,1.435349,-0.58132
4,1.834926,-1.930662,0.184121,-0.131765,-0.816331,-0.948155,0.933952,-0.63221
5,-1.290103,1.462288,0.065532,-1.375244,-1.58523,0.047519,-0.027028,-1.519698


In [8]:
# To delete a certain column, we use the drop() method (columns=... is the same as axis=1)...
# drop() returns a NEW DataFrame, so the result is assigned back to df to keep the deletion.
# errors='ignore' keeps this cell safe to re-run: once 'W' is gone, dropping it again
# is a no-op instead of raising a KeyError.

# Ex 4:-
df = df.drop(columns='W', errors='ignore')
df

Unnamed: 0,T,U,V,X,Y,Z,ext
1,0.89095,0.29023,0.392372,1.180511,-0.507833,1.000442,1.572884
2,1.735856,-0.372495,-0.332359,1.53956,-1.473029,0.512366,1.207201
3,1.104707,0.45178,0.328972,-0.910293,1.48837,1.435349,-0.58132
4,1.834926,-1.930662,0.184121,-0.816331,-0.948155,0.933952,-0.63221
5,-1.290103,1.462288,0.065532,-1.58523,0.047519,-0.027028,-1.519698


In [9]:
# To get the rows in a DataFrame, you use loc and iloc...
# "loc" selects by index LABEL (this frame's labels happen to be 1..5), WHILE, "iloc" selects by integer POSITION, which always starts at 0...

# Ex 5i:- the row whose index label is 3
df.loc[3]

T      1.104707
U      0.451780
V      0.328972
X     -0.910293
Y      1.488370
Z      1.435349
ext   -0.581320
Name: 3, dtype: float64

In [10]:
# Ex 5ii:- position 2 is the third row, which in this frame is the row labelled 3
df.iloc[2]

T      1.104707
U      0.451780
V      0.328972
X     -0.910293
Y      1.488370
Z      1.435349
ext   -0.581320
Name: 3, dtype: float64

In [11]:
# Ex 6:- Getting subset of rows and columns....

# Both "df.loc" and "df.iloc" accept slices and lists for rows & cols.
# "df.loc" takes labels, and a label slice INCLUDES its end point.
# "df.iloc" takes integer positions (i.e 0,1 etc) even if your rows/cols are labelled with strings, and a position slice EXCLUDES its end point.

df

Unnamed: 0,T,U,V,X,Y,Z,ext
1,0.89095,0.29023,0.392372,1.180511,-0.507833,1.000442,1.572884
2,1.735856,-0.372495,-0.332359,1.53956,-1.473029,0.512366,1.207201
3,1.104707,0.45178,0.328972,-0.910293,1.48837,1.435349,-0.58132
4,1.834926,-1.930662,0.184121,-0.816331,-0.948155,0.933952,-0.63221
5,-1.290103,1.462288,0.065532,-1.58523,0.047519,-0.027028,-1.519698


In [12]:
# Label-based slice: rows labelled 3 through 5 (end label included), columns from 'X' to the last one
df.loc[3:5,'X':]

Unnamed: 0,X,Y,Z,ext
3,-0.910293,1.48837,1.435349,-0.58132
4,-0.816331,-0.948155,0.933952,-0.63221
5,-1.58523,0.047519,-0.027028,-1.519698


In [13]:
# One row label plus one column label returns a single scalar value
df.loc[2,'V']

-0.33235873520852705

In [14]:
# Position-based selection: rows at positions 3 and 4, columns at positions 0, 3 and 5
df.iloc[[3,4],[0,3,5]]

Unnamed: 0,T,X,Z
4,1.834926,-0.816331,0.933952
5,-1.290103,-1.58523,-0.027028


### Conditional Selection in Pandas

In [15]:
# Current state of the frame, shown for reference
df

Unnamed: 0,T,U,V,X,Y,Z,ext
1,0.89095,0.29023,0.392372,1.180511,-0.507833,1.000442,1.572884
2,1.735856,-0.372495,-0.332359,1.53956,-1.473029,0.512366,1.207201
3,1.104707,0.45178,0.328972,-0.910293,1.48837,1.435349,-0.58132
4,1.834926,-1.930662,0.184121,-0.816331,-0.948155,0.933952,-0.63221
5,-1.290103,1.462288,0.065532,-1.58523,0.047519,-0.027028,-1.519698


In [16]:
# Ex 7:- Comparing the frame with a scalar yields a same-shaped frame of True/False values
bool_df = df > 0
bool_df

Unnamed: 0,T,U,V,X,Y,Z,ext
1,True,True,True,True,False,True,True
2,True,False,False,True,False,True,True
3,True,True,True,False,True,True,False
4,True,False,True,False,False,True,False
5,False,True,True,False,True,False,False


In [17]:
# Ex 8:- Indexing with a boolean DataFrame keeps the values where the mask is True
# and shows NaN wherever the mask is False

df[bool_df]

Unnamed: 0,T,U,V,X,Y,Z,ext
1,0.89095,0.29023,0.392372,1.180511,,1.000442,1.572884
2,1.735856,,,1.53956,,0.512366,1.207201
3,1.104707,0.45178,0.328972,,1.48837,1.435349,
4,1.834926,,0.184121,,,0.933952,
5,,1.462288,0.065532,,0.047519,,


In [18]:
# Ex 9i:- Building a boolean mask: True for the rows where column U is positive...

df_bool = df['U'] > 0
df_bool

1     True
2    False
3     True
4    False
5     True
Name: U, dtype: bool

In [19]:
df[df_bool]          # Returns only the rows where the mask is True....

Unnamed: 0,T,U,V,X,Y,Z,ext
1,0.89095,0.29023,0.392372,1.180511,-0.507833,1.000442,1.572884
3,1.104707,0.45178,0.328972,-0.910293,1.48837,1.435349,-0.58132
5,-1.290103,1.462288,0.065532,-1.58523,0.047519,-0.027028,-1.519698


In [20]:
# Ex 9ii:- Boolean mask: True for the rows where column Z is negative

df_booll = df['Z'] < 0
df_booll

1    False
2    False
3    False
4    False
5     True
Name: Z, dtype: bool

In [21]:
df_result = df[df_booll]          # Keeps only the rows where the mask is True
df_result

Unnamed: 0,T,U,V,X,Y,Z,ext
5,-1.290103,1.462288,0.065532,-1.58523,0.047519,-0.027028,-1.519698


### Conditional selection with multiple conditions

In [22]:
# Ex 10:- Current frame, shown before filtering on several conditions

df

Unnamed: 0,T,U,V,X,Y,Z,ext
1,0.89095,0.29023,0.392372,1.180511,-0.507833,1.000442,1.572884
2,1.735856,-0.372495,-0.332359,1.53956,-1.473029,0.512366,1.207201
3,1.104707,0.45178,0.328972,-0.910293,1.48837,1.435349,-0.58132
4,1.834926,-1.930662,0.184121,-0.816331,-0.948155,0.933952,-0.63221
5,-1.290103,1.462288,0.065532,-1.58523,0.047519,-0.027028,-1.519698


In [23]:
# A single condition: boolean mask of the rows where column T is positive
df['T'] > 0

1     True
2     True
3     True
4     True
5    False
Name: T, dtype: bool

In [24]:
# With pandas, you can't use Python's "and"/"or" to combine boolean Series
# Use the element-wise operators instead:- "&" for and; "|" for or
# Wrap each condition in parentheses, because & and | bind more tightly than the comparisons

df[(df['T'] > 0) & (df['X'] < 1)]

Unnamed: 0,T,U,V,X,Y,Z,ext
3,1.104707,0.45178,0.328972,-0.910293,1.48837,1.435349,-0.58132
4,1.834926,-1.930662,0.184121,-0.816331,-0.948155,0.933952,-0.63221


In [25]:
# Rows where T is positive OR U is below 1 (either condition is enough)
df[(df['T'] > 0) | (df['U'] < 1)]

Unnamed: 0,T,U,V,X,Y,Z,ext
1,0.89095,0.29023,0.392372,1.180511,-0.507833,1.000442,1.572884
2,1.735856,-0.372495,-0.332359,1.53956,-1.473029,0.512366,1.207201
3,1.104707,0.45178,0.328972,-0.910293,1.48837,1.435349,-0.58132
4,1.834926,-1.930662,0.184121,-0.816331,-0.948155,0.933952,-0.63221


In [26]:
# Python's own "and"/"or" only work on single boolean values, not on a whole Series of them...

True and True

True

### Resetting Indexes

In [27]:
# Ex 11:- The frame before its index is reset (row labels are still 1..5)

df

Unnamed: 0,T,U,V,X,Y,Z,ext
1,0.89095,0.29023,0.392372,1.180511,-0.507833,1.000442,1.572884
2,1.735856,-0.372495,-0.332359,1.53956,-1.473029,0.512366,1.207201
3,1.104707,0.45178,0.328972,-0.910293,1.48837,1.435349,-0.58132
4,1.834926,-1.930662,0.184121,-0.816331,-0.948155,0.933952,-0.63221
5,-1.290103,1.462288,0.065532,-1.58523,0.047519,-0.027028,-1.519698


In [28]:
df.reset_index()             # Returns a new frame with a default integer index (from 0 to....); the old labels are kept in an 'index' column

Unnamed: 0,index,T,U,V,X,Y,Z,ext
0,1,0.89095,0.29023,0.392372,1.180511,-0.507833,1.000442,1.572884
1,2,1.735856,-0.372495,-0.332359,1.53956,-1.473029,0.512366,1.207201
2,3,1.104707,0.45178,0.328972,-0.910293,1.48837,1.435349,-0.58132
3,4,1.834926,-1.930662,0.184121,-0.816331,-0.948155,0.933952,-0.63221
4,5,-1.290103,1.462288,0.065532,-1.58523,0.047519,-0.027028,-1.519698


In [29]:
# drop=True discards the old labels instead of keeping them as an 'index' column;
# inplace=True modifies df itself rather than returning a new frame
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,T,U,V,X,Y,Z,ext
0,0.89095,0.29023,0.392372,1.180511,-0.507833,1.000442,1.572884
1,1.735856,-0.372495,-0.332359,1.53956,-1.473029,0.512366,1.207201
2,1.104707,0.45178,0.328972,-0.910293,1.48837,1.435349,-0.58132
3,1.834926,-1.930662,0.184121,-0.816331,-0.948155,0.933952,-0.63221
4,-1.290103,1.462288,0.065532,-1.58523,0.047519,-0.027028,-1.519698


In [30]:
# Ex 11ii:- Splitting one space-separated string into a list of five city names

new_index = 'Toronto Seattle SF NY Boston'.split()
new_index

['Toronto', 'Seattle', 'SF', 'NY', 'Boston']

In [31]:
# Adding a new column to the DataFrame: one city per row, in list order...
df['Cities'] = new_index
df

Unnamed: 0,T,U,V,X,Y,Z,ext,Cities
0,0.89095,0.29023,0.392372,1.180511,-0.507833,1.000442,1.572884,Toronto
1,1.735856,-0.372495,-0.332359,1.53956,-1.473029,0.512366,1.207201,Seattle
2,1.104707,0.45178,0.328972,-0.910293,1.48837,1.435349,-0.58132,SF
3,1.834926,-1.930662,0.184121,-0.816331,-0.948155,0.933952,-0.63221,NY
4,-1.290103,1.462288,0.065532,-1.58523,0.047519,-0.027028,-1.519698,Boston


In [32]:
# Making our column "Cities" in the DataFrame to be the index...
# Remember that WITHOUT "inplace=True", set_index() only returns a new frame; df itself is left unchanged.
# To keep the change, assign the result back (df = df.set_index("Cities")) or pass inplace=True.

# Ex 11iii:-
df.set_index("Cities")

Unnamed: 0_level_0,T,U,V,X,Y,Z,ext
Cities,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Toronto,0.89095,0.29023,0.392372,1.180511,-0.507833,1.000442,1.572884
Seattle,1.735856,-0.372495,-0.332359,1.53956,-1.473029,0.512366,1.207201
SF,1.104707,0.45178,0.328972,-0.910293,1.48837,1.435349,-0.58132
NY,1.834926,-1.930662,0.184121,-0.816331,-0.948155,0.933952,-0.63221
Boston,-1.290103,1.462288,0.065532,-1.58523,0.047519,-0.027028,-1.519698
