# DataFrame

In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
# import randn to generate random numbers
from numpy.random import randn

#### Create a DataFrame
Signature: `pd.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)`

In [4]:
# Seed NumPy's global generator first so that Restart & Run All reproduces
# the same "random" numbers (and therefore the same outputs) on every run.
np.random.seed(42)

# DataFrame of standard-normal random numbers: 5 rows (a-e) x 4 columns (fruits).
df = pd.DataFrame(
    data=randn(5, 4),
    index=['a', 'b', 'c', 'd', 'e'],
    columns=['apple', 'orange', 'banana', 'mango'],
)
df

Unnamed: 0,apple,orange,banana,mango
a,1.289639,0.19687,-0.191824,0.085701
b,1.226099,-0.704879,0.825065,1.396669
c,0.386236,0.788935,0.691029,1.539689
d,-0.774014,0.451101,0.154099,1.267538
e,-2.017397,-0.97405,-0.269219,-0.487416


#### How to grab a single column (it returns a Series)

In [5]:
# First way: bracket notation with the column label; the result is a Series.
df['apple']

a    1.289639
b    1.226099
c    0.386236
d   -0.774014
e   -2.017397
Name: apple, dtype: float64

In [6]:
# Second way: attribute (dot) notation; gives the same Series as df['apple'].
# NOTE: per the pandas indexing docs, dot access only works when the column
# name is a valid Python identifier that does not clash with an existing
# DataFrame attribute or method, so bracket notation is the safer default.
df.apple

a    1.289639
b    1.226099
c    0.386236
d   -0.774014
e   -2.017397
Name: apple, dtype: float64

#### Grab multiple columns (it returns a DataFrame)

In [7]:
# Pass a list of labels (note the double brackets) to select several columns.
df[['apple','mango']]

Unnamed: 0,apple,mango
a,1.289639,0.085701
b,1.226099,1.396669
c,0.386236,1.539689
d,-0.774014,1.267538
e,-2.017397,-0.487416


#### Insert a new column that holds the sum of two existing columns

In [8]:
# Assigning to a label that does not exist yet creates the column; here it
# holds the element-wise (row by row) sum of 'apple' and 'orange'.
df['cherry']=df['apple']+df['orange']
df

Unnamed: 0,apple,orange,banana,mango,cherry
a,1.945499,-0.790929,0.113086,0.971008,1.15457
b,0.111957,1.088596,-1.818121,1.530146,1.200553
c,-0.365011,-0.365316,0.615051,-1.17671,-0.730326
d,0.413228,0.828821,-0.192204,1.240294,1.242048
e,-1.957988,1.780609,0.149691,0.586205,-0.177379


#### Drop or delete any column
Note: By default, inplace=False. If you want the change to be permanent in the DataFrame, then set inplace=True (or assign the result back to a variable).

In [9]:
# Drop the 'mango' column. The result is a new DataFrame that is only
# displayed here; df itself is left untouched because nothing is assigned back.
df.drop(columns='mango')

Unnamed: 0,apple,orange,banana,cherry
a,1.945499,-0.790929,0.113086,1.15457
b,0.111957,1.088596,-1.818121,1.200553
c,-0.365011,-0.365316,0.615051,-0.730326
d,0.413228,0.828821,-0.192204,1.242048
e,-1.957988,1.780609,0.149691,-0.177379


#### Drop or delete any row

In [10]:
# Drop the row labelled 'a'. A new DataFrame is returned and displayed;
# df is not modified because the result is not assigned back.
df.drop(index='a')

Unnamed: 0,apple,orange,banana,mango,cherry
b,0.111957,1.088596,-1.818121,1.530146,1.200553
c,-0.365011,-0.365316,0.615051,-1.17671,-0.730326
d,0.413228,0.828821,-0.192204,1.240294,1.242048
e,-1.957988,1.780609,0.149691,0.586205,-0.177379


#### Selecting rows using the `loc[]` and `iloc[]` indexers

In [11]:
# loc selects by label: a single row label returns that row as a Series
# whose index is the column names.
df.loc['a']

apple     1.945499
orange   -0.790929
banana    0.113086
mango     0.971008
cherry    1.154570
Name: a, dtype: float64

In [12]:
# A list of row labels returns a DataFrame containing those rows.
df.loc[['a','b']]

Unnamed: 0,apple,orange,banana,mango,cherry
a,1.945499,-0.790929,0.113086,0.971008,1.15457
b,0.111957,1.088596,-1.818121,1.530146,1.200553


In [13]:
# loc[row_label, column_label] grabs a single value, e.g. row 'd', column 'mango'.
df.loc['d','mango']

1.2402936205751314

In [14]:
# Get a subset: rows 'a' and 'b', columns 'mango' and 'cherry'.
df.loc[['a','b'],['mango','cherry']]

Unnamed: 0,mango,cherry
a,0.971008,1.15457
b,1.530146,1.200553


In [15]:
# iloc selects by integer position (0-based) instead of by label;
# position 3 is the fourth row, i.e. the row labelled 'd'.
df.iloc[3]

apple     0.413228
orange    0.828821
banana   -0.192204
mango     1.240294
cherry    1.242048
Name: d, dtype: float64

In [16]:
# iloc[row_position, column_position] grabs a single value: row position 3
# ('d') and column position 2 ('banana').
df.iloc[3,2]

-0.19220352115993006

In [17]:
# Get a subset by position: rows 1 and 2 ('b', 'c'), columns 0 and 1
# ('apple', 'orange').
df.iloc[[1,2],[0,1]]

Unnamed: 0,apple,orange
b,0.111957,1.088596
c,-0.365011,-0.365316


# Conditional Operations

In [18]:
# Display the current DataFrame as a reference for the comparisons that follow.
df

Unnamed: 0,apple,orange,banana,mango,cherry
a,1.945499,-0.790929,0.113086,0.971008,1.15457
b,0.111957,1.088596,-1.818121,1.530146,1.200553
c,-0.365011,-0.365316,0.615051,-1.17671,-0.730326
d,0.413228,0.828821,-0.192204,1.240294,1.242048
e,-1.957988,1.780609,0.149691,0.586205,-0.177379


#### Check which values are greater than 0 (returns a DataFrame of booleans)

In [19]:
# Element-wise comparison: every cell becomes True if its value is > 0, else False.
df>0

Unnamed: 0,apple,orange,banana,mango,cherry
a,True,False,True,True,True
b,True,True,False,True,True
c,False,False,True,False,False
d,True,True,False,True,True
e,False,True,True,True,False


In [20]:
# The condition can be applied to a single column; the result is a boolean Series.
df['apple']>0

a     True
b     True
c    False
d     True
e    False
Name: apple, dtype: bool

In [21]:
# Keep only the rows where 'apple' is positive and, from those rows, show
# just the additional columns 'banana' and 'mango'. A single
# .loc[row_mask, column_labels] call does both selections in one step
# instead of chaining two separate [] lookups.
df.loc[df['apple'] > 0, ['banana', 'mango']]

Unnamed: 0,banana,mango
a,0.113086,0.971008
b,-1.818121,1.530146
d,-0.192204,1.240294


In [22]:
# Boolean-mask the whole DataFrame with a condition on a single column
# (here 'apple'): only the rows where the condition is True are returned.
df[df['apple']>0]

Unnamed: 0,apple,orange,banana,mango,cherry
a,1.945499,-0.790929,0.113086,0.971008,1.15457
b,0.111957,1.088596,-1.818121,1.530146,1.200553
d,0.413228,0.828821,-0.192204,1.240294,1.242048


In [23]:
# Masking with a boolean DataFrame keeps the original shape: values that
# fail the condition are shown as missing (NaN) instead of rows being dropped.
df[df>0]

Unnamed: 0,apple,orange,banana,mango,cherry
a,1.945499,,0.113086,0.971008,1.15457
b,0.111957,1.088596,,1.530146,1.200553
c,,,0.615051,,
d,0.413228,0.828821,,1.240294,1.242048
e,,1.780609,0.149691,0.586205,


### Multiple Conditions 
##### Show the rows where both 'apple' and 'mango' are greater than 0

In [24]:
# Both conditions must be True for a row to be kept. Each comparison is
# wrapped in parentheses and the two masks are combined with & (element-wise "and").
df[(df['apple']>0) & (df['mango']>0)]

Unnamed: 0,apple,orange,banana,mango,cherry
a,1.945499,-0.790929,0.113086,0.971008,1.15457
b,0.111957,1.088596,-1.818121,1.530146,1.200553
d,0.413228,0.828821,-0.192204,1.240294,1.242048


##### Show the rows where either 'apple' or 'mango' is greater than 0

In [25]:
# A whole row is kept when at least one of the two conditions is True
# (| is the element-wise "or"), so a row can appear even if one of the two
# columns is negative.
df[(df['apple']>0) | (df['mango']>0)]

Unnamed: 0,apple,orange,banana,mango,cherry
a,1.945499,-0.790929,0.113086,0.971008,1.15457
b,0.111957,1.088596,-1.818121,1.530146,1.200553
d,0.413228,0.828821,-0.192204,1.240294,1.242048
e,-1.957988,1.780609,0.149691,0.586205,-0.177379


### Reset index
##### Index becomes a new column

In [26]:
# Display the DataFrame before resetting its index.
df

Unnamed: 0,apple,orange,banana,mango,cherry
a,1.945499,-0.790929,0.113086,0.971008,1.15457
b,0.111957,1.088596,-1.818121,1.530146,1.200553
c,-0.365011,-0.365316,0.615051,-1.17671,-0.730326
d,0.413228,0.828821,-0.192204,1.240294,1.242048
e,-1.957988,1.780609,0.149691,0.586205,-0.177379


In [27]:
# Move the current index (a-e) into a regular column named 'index' and
# number the rows 0, 1, 2, ... instead. The result is only displayed here;
# it is not assigned back to df.
df.reset_index()

Unnamed: 0,index,apple,orange,banana,mango,cherry
0,a,1.945499,-0.790929,0.113086,0.971008,1.15457
1,b,0.111957,1.088596,-1.818121,1.530146,1.200553
2,c,-0.365011,-0.365316,0.615051,-1.17671,-0.730326
3,d,0.413228,0.828821,-0.192204,1.240294,1.242048
4,e,-1.957988,1.780609,0.149691,0.586205,-0.177379


#### insert new column and set it as index

In [28]:
# Step 1: prepare the labels for the new index, one state code per row.
new_index = [
    "CA",
    "AZ",
    "CO",
    "NY",
    "OR",
]

In [29]:
# Step 2: add the list as a new column named 'States'.
df['States']=new_index
df

Unnamed: 0,apple,orange,banana,mango,cherry,States
a,1.945499,-0.790929,0.113086,0.971008,1.15457,CA
b,0.111957,1.088596,-1.818121,1.530146,1.200553,AZ
c,-0.365011,-0.365316,0.615051,-1.17671,-0.730326,CO
d,0.413228,0.828821,-0.192204,1.240294,1.242048,NY
e,-1.957988,1.780609,0.149691,0.586205,-0.177379,OR


In [30]:
# Step 3: use the 'States' column as the index (it replaces the old a-e index).
# The result is only displayed here; it is not assigned back to df.
df.set_index('States')

Unnamed: 0_level_0,apple,orange,banana,mango,cherry
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CA,1.945499,-0.790929,0.113086,0.971008,1.15457
AZ,0.111957,1.088596,-1.818121,1.530146,1.200553
CO,-0.365011,-0.365316,0.615051,-1.17671,-0.730326
NY,0.413228,0.828821,-0.192204,1.240294,1.242048
OR,-1.957988,1.780609,0.149691,0.586205,-0.177379
