# Pandas

## Pandas: define a DataFrame and add a row

In [9]:
import pandas as pd
# Start from an empty frame that only declares its columns, then add one row
# by assigning to the next free integer label.
df = pd.DataFrame(columns=['digit', 'idx'])
next_label = len(df)  # 0 while the frame is still empty
df.loc[next_label] = [1, 2]
df

Unnamed: 0,digit,idx
0,1,2


## Pandas loc vs iloc

In [24]:
import pandas as pd
import numpy as np
# Ten evenly spaced values per column, on the default 0..9 integer index.
full = pd.DataFrame({
    'digit': np.linspace(1, 10, 10),
    'number': np.linspace(100, 110, 10),
})
# Keep only the rows whose digit lies between 3 and 5 (both ends included);
# the surviving rows keep their original index labels.
in_range = full['digit'].between(3, 5)
df = full[in_range]
df

Unnamed: 0,digit,number
2,3.0,102.222222
3,4.0,103.333333
4,5.0,104.444444


### iloc accesses rows by position, relative to the filtered frame

In [25]:
# iloc is positional: 0 is the first row of the filtered frame (index label 2).
df.iloc[0]

digit       3.000000
number    102.222222
Name: 2, dtype: float64

### loc looks rows up by their original index label, so it only raises an error (KeyError) for labels that were removed by the filtering, e.g. `df.loc[0]`

In [26]:
# loc looks rows up by index label. Label 3 is still present after the filter,
# so this succeeds; a label that was filtered out (such as 0) would raise KeyError.
df.loc[3]

digit       4.000000
number    103.333333
Name: 3, dtype: float64

### pandas: take the first row for each value of a field

In [29]:
import pandas as pd
import numpy as np
# Every digit from 1 to 5 appears twice, paired with ten evenly spaced numbers
# running from 100 to 110.
repeated_digits = [d for d in range(1, 6) for _ in range(2)]
df = pd.DataFrame({
    'digit': repeated_digits,
    'number': np.linspace(100, 110, 10),
})
df

Unnamed: 0,digit,number
0,1,100.0
1,1,101.111111
2,2,102.222222
3,2,103.333333
4,3,104.444444
5,3,105.555556
6,4,106.666667
7,4,107.777778
8,5,108.888889
9,5,110.0


In [31]:
# Collapse the frame to one row per digit. as_index=False keeps 'digit' as an
# ordinary column, so no reset_index() is needed afterwards.
# NOTE(review): first() takes the first non-null value of each column, which
# matches "the first row" only when a group has no missing values — confirm
# this is acceptable for real data.
per_digit = df.groupby('digit', as_index=False)
df = per_digit.first()
df

Unnamed: 0,digit,number
0,1,100.0
1,2,102.222222
2,3,104.444444
3,4,106.666667
4,5,108.888889


## pandas load and save dataframe as csv

In [37]:
import pandas as pd
import numpy as np
# Build the sample table, write it to disk without the index column, then
# read the file straight back into a second frame.
csv_path = 'example_table.csv'
df = pd.DataFrame({
    'digit': [1, 1, 2, 2, 3, 3, 4, 4, 5, 5],
    'number': np.linspace(100, 110, 10),
})
df.to_csv(csv_path, index=False)
df2 = pd.read_csv(csv_path)
df2

Unnamed: 0,digit,number
0,1,100.0
1,1,101.111111
2,2,102.222222
3,2,103.333333
4,3,104.444444
5,3,105.555556
6,4,106.666667
7,4,107.777778
8,5,108.888889
9,5,110.0


## pandas apply

In [38]:
import pandas as pd
df = pd.DataFrame({
    'digit': [12, 124, 1222, 122],
    'idx': [0, 10, 20, 30],
})

# axis=1 hands each row to the lambda; the new column is that row's digit + 5.
df['applied'] = df.apply(lambda row: row['digit'] + 5, axis=1)
df

Unnamed: 0,digit,idx,applied
0,12,0,17
1,124,10,129
2,1222,20,1227
3,122,30,127


## pandas apply custom function

In [39]:
import pandas as pd
# Four sample rows with a 'digit' and an 'idx' column.
df = pd.DataFrame({
    'digit': [12, 124, 1222, 122],
    'idx': [0, 10, 20, 30],
})

def my_custom_function(x):
    """Return the 'digit' entry of ``x`` increased by 5.

    ``x`` is anything that supports ``x['digit']`` lookup, such as a row
    passed in by ``DataFrame.apply(..., axis=1)``.
    """
    digit = x['digit']
    return digit + 5
# Hand the function itself to apply; wrapping it in a lambda that only
# forwards its argument adds nothing. axis=1 calls it once per row.
df['applied'] = df.apply(my_custom_function, axis=1)
df

Unnamed: 0,digit,idx,applied
0,12,0,17
1,124,10,129
2,1222,20,1227
3,122,30,127


## pandas: sort the rows of df1 to match the order of df2

In [40]:
# One record per person: (name, company, height).
people = [
    ('A', 'Apple', 130),
    ('Z', 'Yahoo', 150),
    ('C', 'Amazon', 173),
]
df1 = pd.DataFrame(people, columns=['name', 'company', 'height'])

df1

Unnamed: 0,name,company,height
0,A,Apple,130
1,Z,Yahoo,150
2,C,Amazon,173


In [41]:
# One record per name: (name, x, y). The names are listed in a different
# order than in df1.
coordinates = [
    ('Z', 5, 1),
    ('C', 4, 11),
    ('A', 3, 111),
]
df2 = pd.DataFrame(coordinates, columns=['name', 'x', 'y'])
df2

Unnamed: 0,name,x,y
0,Z,5,1
1,C,4,11
2,A,3,111


In [42]:
# Put df1's rows in the order of df2's names: index df1 by name, reindex to
# the target order, then turn the name index back into a regular column.
target_order = df2['name'].tolist()
by_name = df1.set_index('name')
df1 = by_name.reindex(target_order).reset_index()
assert df1['name'].tolist() == target_order
df1

Unnamed: 0,name,company,height
0,Z,Yahoo,150
1,C,Amazon,173
2,A,Apple,130


##  Inplace explained

Original data frame

In [20]:
import pandas as pd
# Two-row frame with columns A and B, built in a single constructor call.
df = pd.DataFrame({'A': [10, 2], 'B': [3, 4]})
display(df)

Unnamed: 0,A,B
0,10,3
1,2,4


sort a data frame

In [22]:
# sort_values returns a new, sorted frame. The result is discarded here, so
# df itself keeps its original row order (see the output below).
df.sort_values('A', ascending=True)
df

Unnamed: 0,A,B
0,10,3
1,2,4


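Without `inplace=True` (or assigning the result back to `df`), `sort_values` returns a new sorted frame and leaves `df` unchanged, as the output above shows.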

## Multi index slicing

In [38]:
import numpy as np, pandas as pd
# Two-level row index: outer level 'one' (a, b) and inner level 'two'
# (t, u, v, w), giving eight rows in total.
outer_labels = ['a'] * 4 + ['b'] * 4
inner_labels = list('tuvw') * 2
mux = pd.MultiIndex.from_arrays([outer_labels, inner_labels], names=['one', 'two'])

# A single integer column that numbers the rows 0..7.
df = pd.DataFrame({'col': np.arange(len(mux))}, index=mux)
display(df)
# A tuple with one label per level selects exactly one row.
df.loc[('b', 'u')]

Unnamed: 0_level_0,Unnamed: 1_level_0,col
one,two,Unnamed: 2_level_1
a,t,0
a,u,1
a,v,2
a,w,3
b,t,4
b,u,5
b,v,6
b,w,7


col    5
Name: (b, u), dtype: int32