# 4.Filtering/Manipulating DataFrame

## Section

1)[Sort The Columns](#SORT-THE-COLUMNS)<br>
2)[Filter The Columns](#FILTER-THE-COLUMNS)<br>
3)[Pivot The Dataframe](#PIVOT-THE-DATAFRAME)<br>
4)[Using Pivot Tables](#USING-PIVOT-TABLES)<br>
5)[Apply](#APPLY)<br>
6)[Handling NA](#HANDLING-NA)<br>

In [2]:
import pandas as pd
import numpy as np
import random
# Seed both random generators so that "Restart Kernel -> Run All" produces the
# same data on every run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Ten random 'Yes'/'No' labels; a later cell attaches them to the frame as
# column 'F'.
# NOTE: this name shadows the builtin `bool` for the rest of the notebook. It is
# kept unchanged because the later cell reads the labels under this name.
bool = random.choices(['Yes', 'No'], k=10)

# Ten consecutive daily dates starting 2015-04-01, used as the row index.
date = pd.date_range('2015-04-01', periods=10)
# 10x5 frame of random integers in [0, 100) with columns A..E.
df_man = pd.DataFrame(
    np.random.randint(0, 100, 50).reshape(10, 5),
    index=date,
    columns=list('ABCDE'),
)
df_man

Unnamed: 0,A,B,C,D,E
2015-04-01,76,33,22,47,21
2015-04-02,34,82,57,39,24
2015-04-03,40,39,74,45,1
2015-04-04,48,20,14,66,80
2015-04-05,62,74,39,78,59
2015-04-06,3,65,89,91,20
2015-04-07,15,7,74,62,55
2015-04-08,26,45,14,31,48
2015-04-09,60,6,98,12,97
2015-04-10,55,80,12,64,59


## SORT THE COLUMNS

[Top](#Section)

In [3]:
# Sort the rows by their index labels (the dates). This is not the last
# expression in the cell, so its result is computed but not displayed.
df_man.sort_index()
# Sort the rows by column A, largest first; any NaN would be placed at the top.
df_man.sort_values(by='A', ascending=False, na_position='first')
# Sort by A and break ties with C, both descending. This is the displayed result.
df_man.sort_values(by=['A', 'C'], ascending=[False, False], na_position='first')

Unnamed: 0,A,B,C,D,E
2015-04-01,76,33,22,47,21
2015-04-05,62,74,39,78,59
2015-04-09,60,6,98,12,97
2015-04-10,55,80,12,64,59
2015-04-04,48,20,14,66,80
2015-04-03,40,39,74,45,1
2015-04-02,34,82,57,39,24
2015-04-08,26,45,14,31,48
2015-04-07,15,7,74,62,55
2015-04-06,3,65,89,91,20


## FILTER THE COLUMNS
### SELECT ROWS WITH CONDITION (WHERE FILTER)

[Top](#Section)

In [4]:
# Element-wise mask: keeps the frame's shape and turns every value that is not
# greater than 50 into NaN. (Not displayed: only the last expression is shown.)
df_man.where(df_man > 50)
# Row filter: keep only the rows whose value in column A is greater than 50.
df_man[df_man['A'] > 50]
# The same row filter written with .loc; this is the displayed result.
df_man.loc[df_man['A'] > 50]

Unnamed: 0,A,B,C,D,E
2015-04-01,76,33,22,47,21
2015-04-05,62,74,39,78,59
2015-04-09,60,6,98,12,97
2015-04-10,55,80,12,64,59


### Add 1 to all the columns

In [5]:
# Work on the numeric columns only, so this cell still runs if it is re-executed
# after later cells have added the non-numeric 'F' and 'G' columns (adding 1 to
# those would raise). On a fresh top-to-bottom run every column is numeric, so
# nothing is excluded.
numeric_part = df_man.select_dtypes(include='number')
# Operator form. Not displayed: only the last expression of a cell is shown.
numeric_part + 1
# Method form of the same operation; this is the displayed result.
numeric_part.add(1)
# Subtraction, multiplication and division work the same way
# (operators -, *, / or the methods .sub(), .mul(), .div()).

Unnamed: 0,A,B,C,D,E
2015-04-01,77,34,23,48,22
2015-04-02,35,83,58,40,25
2015-04-03,41,40,75,46,2
2015-04-04,49,21,15,67,81
2015-04-05,63,75,40,79,60
2015-04-06,4,66,90,92,21
2015-04-07,16,8,75,63,56
2015-04-08,27,46,15,32,49
2015-04-09,61,7,99,13,98
2015-04-10,56,81,13,65,60


### SELECT ROWS WITH CONDITION (isin() FILTER)

In [6]:
# Attach the Yes/No labels generated in the first cell as a new column 'F'
# (this modifies df_man in place).
df_man['F'] = bool
# isin() builds a boolean mask that is True where F is one of the listed values;
# use it to keep only the 'Yes' rows.
df_man.loc[df_man.F.isin(['Yes'])]

Unnamed: 0,A,B,C,D,E,F
2015-04-02,34,82,57,39,24,Yes
2015-04-03,40,39,74,45,1,Yes
2015-04-04,48,20,14,66,80,Yes
2015-04-05,62,74,39,78,59,Yes
2015-04-08,26,45,14,31,48,Yes
2015-04-09,60,6,98,12,97,Yes
2015-04-10,55,80,12,64,59,Yes


## PIVOT THE DATAFRAME

[Top](#Section)

In [7]:
# Copy the index into a regular column 'G' so that pivot() can refer to it by
# name (this modifies df_man in place).
df_man['G'] = df_man.index
# Reshape: one row per G value, one column per distinct F label, cells filled
# from column A. Combinations that do not occur come out as NaN.
df_man.pivot(values='A', columns='F', index='G')

F,No,Yes
G,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-04-01,76.0,
2015-04-02,,34.0
2015-04-03,,40.0
2015-04-04,,48.0
2015-04-05,,62.0
2015-04-06,3.0,
2015-04-07,15.0,
2015-04-08,,26.0
2015-04-09,,60.0
2015-04-10,,55.0


## USING PIVOT TABLES

[Top](#Section)

In [8]:
# Three consecutive days, each repeated three times and kept in date order,
# paired with accounts 1-3 so every (Date, Account) pair occurs exactly once.
date = sorted(list(pd.date_range('2015-04-01', periods=3)) * 3)
account = [1, 2, 3] * 3
# Replace df_man with a new 9-row frame; 'Bal' is a random integer in [0, 100).
df_man = pd.DataFrame(
    {
        'Date': date,
        'Account': account,
        'Bal': np.random.randint(0, 100, 9),
    }
)
df_man

Unnamed: 0,Date,Account,Bal
0,2015-04-01,1,78
1,2015-04-01,2,80
2,2015-04-01,3,83
3,2015-04-02,1,39
4,2015-04-02,2,28
5,2015-04-02,3,4
6,2015-04-03,1,36
7,2015-04-03,2,30
8,2015-04-03,3,54


In [9]:
# One row per Account, one column per Date, cells taken from Bal.
# No aggfunc is passed, so pandas' default aggregation is applied to the rows
# that share an (Account, Date) pair.
df_man.pivot_table(values='Bal', index='Account', columns='Date')

Date,2015-04-01,2015-04-02,2015-04-03
Account,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,78,39,36
2,80,28,30
3,83,4,54


In [10]:
# 20 rows with randomly drawn dates (3 possible days) and accounts (1-3), so an
# (Account, Date) pair can now occur several times or not at all.
date = random.choices(pd.date_range('2015-04-1', periods=3), k=20)
account = random.choices([1, 2, 3], k=20)
# Replace df_man with the new frame; 'Bal' is a random integer in [0, 100).
df_man = pd.DataFrame(
    {
        'Date': date,
        'Account': account,
        'Bal': np.random.randint(0, 100, 20),
    }
)
# Display the rows ordered by date; df_man itself keeps its original row order.
df_man.sort_values(by='Date')

Unnamed: 0,Date,Account,Bal
3,2015-04-01,1,11
4,2015-04-01,3,52
5,2015-04-01,3,16
6,2015-04-01,2,61
7,2015-04-01,1,91
15,2015-04-01,1,19
18,2015-04-01,1,98
10,2015-04-01,2,91
14,2015-04-01,2,86
13,2015-04-01,1,53


In [11]:
# Total balance for every (Account, Date) pair; pairs with no rows become NaN.
# The aggregation is named with the string alias 'sum' rather than np.sum:
# pandas resolves either one to its own sum implementation, but passing the
# numpy callable is deprecated in recent pandas versions and emits a warning.
pt_sum = pd.pivot_table(df_man, values='Bal', index='Account', columns='Date', aggfunc='sum')
pt_sum

Date,2015-04-01,2015-04-02,2015-04-03
Account,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,272.0,180.0,24.0
2,238.0,14.0,
3,68.0,234.0,34.0


In [12]:
# Aggregations are named with string aliases instead of numpy/builtin callables
# (np.mean, min, max, np.median): the results and column labels are the same,
# and it avoids the deprecation warning recent pandas emits for such callables.

# Mean balance per (Account, Date). Not displayed: only the last expression is.
pd.pivot_table(df_man, values='Bal', index='Account', columns='Date', aggfunc='mean')
# A list of aggregations adds an outer column level, one group per function.
pd.pivot_table(df_man, values='Bal', index='Account', columns='Date', aggfunc=['min', 'max'])
# Mean and median side by side; this is the displayed result.
pd.pivot_table(df_man, values='Bal', index='Account', columns='Date', aggfunc=['mean', 'median'])

Unnamed: 0_level_0,mean,mean,mean,median,median,median
Date,2015-04-01,2015-04-02,2015-04-03,2015-04-01,2015-04-02,2015-04-03
Account,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,54.4,60.0,24.0,53.0,58.0,24.0
2,79.333333,14.0,,86.0,14.0,
3,34.0,58.5,34.0,34.0,65.0,34.0


## APPLY

[Top](#Section)

In [13]:
# axis=0: np.sum receives each column in turn, giving one total per Date.
# Not displayed: only the last expression of a cell is shown.
pt_sum.apply(np.sum, axis=0)
# axis=1: np.sum receives each row in turn, giving one total per Account.
# This is the displayed result.
pt_sum.apply(np.sum, axis=1)


Account
1    476.0
2    252.0
3    336.0
dtype: float64

## HANDLING NA

### Replacing NA values
[Top](#Section)

In [14]:
import pandas as pd
import numpy as np

# Small 4x4 demo frame for the NA examples: columns A, B and C contain missing
# values (np.nan), column D is complete.
df = pd.DataFrame(
    {
        'A': [np.nan, 3, np.nan, np.nan],
        'B': [2, 4, np.nan, 3],
        'C': [np.nan, 7, np.nan, np.nan],
        'D': [0, 1, 5, 4],
    }
)
df

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,7.0,1
2,,,,5
3,,3.0,,4


In [15]:
# Replace every NaN in the frame with the same value, 3.
# fillna returns a new frame; df itself is left unchanged.
df.fillna(value=3)

Unnamed: 0,A,B,C,D
0,3.0,2.0,3.0,0
1,3.0,4.0,7.0,1
2,3.0,3.0,3.0,5
3,3.0,3.0,3.0,4


In [16]:
# Fill NaN with a different value in each column: the mapping goes from
# column name to the replacement used in that column.
values = dict(A=0, B=1, C=2, D=3)
df.fillna(values)

Unnamed: 0,A,B,C,D
0,0.0,2.0,2.0,0
1,3.0,4.0,7.0,1
2,0.0,1.0,2.0,5
3,0.0,3.0,2.0,4


In [17]:
# Replace at most the first two NaN in each column (the limit is applied per
# column, not to the frame as a whole); any further NaN are left in place.
df.fillna(values, limit=2)

Unnamed: 0,A,B,C,D
0,0.0,2.0,2.0,0
1,3.0,4.0,7.0,1
2,0.0,1.0,2.0,5
3,,3.0,,4


In [18]:
# Drop every row that contains at least one NA value.
df.dropna(axis='index')

Unnamed: 0,A,B,C,D
1,3.0,4.0,7.0,1


In [19]:
# Drop every column that contains at least one NA value.
df.dropna(axis=1)

Unnamed: 0,D
0,0
1,1
2,5
3,4


In [20]:
# Keep only the rows that have at least 2 non-NA values.
df.dropna(axis='index', thresh=2)

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,7.0,1
3,,3.0,,4


In [21]:
# Only look at columns D and A when deciding what to drop: keep the rows in
# which neither of those two columns is NA (NA in other columns is ignored).
df.dropna(axis='index', subset=['D', 'A'])

Unnamed: 0,A,B,C,D
1,3.0,4.0,7.0,1
