In [223]:
import pandas as pd 
import numpy as np

class display(object):
    """Display HTML representation of multiple objects"""
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""
    def __init__(self, *args):
        self.args = args
        
    def _repr_html_(self):
        return '\n'.join(self.template.format(a, eval(a)._repr_html_())
                         for a in self.args)
    
    def __repr__(self):
        return '\n\n'.join(a + '\n' + repr(eval(a))
                           for a in self.args)

# Functions to know so far

# Table Indexing and Slicing

###### Sample DF Creation from scratch

In [224]:
words = pd.Series(['hello','good','bye','maybe'])
col1 = words.sample(10,replace=True)
col1.head()

0    hello
1     good
0    hello
2      bye
3    maybe
dtype: object

###### Creating a Series and DF

Let's reset the indecies and assign a name to the series

In [225]:
col1.reset_index(drop=True,name='blah',inplace=True)
col1.head()

0    hello
1     good
2    hello
3      bye
4    maybe
Name: blah, dtype: object

In [226]:
col1.rename('Fixed Name',copy=False,inplace=True)
col1

0    hello
1     good
2    hello
3      bye
4    maybe
5    hello
6      bye
7    maybe
8    hello
9     good
Name: Fixed Name, dtype: object

Passing a mapping or a function will change the labels

In [227]:
col1.rename(lambda x: x**2,inplace=True,copy=False)
col1

0     hello
1      good
4     hello
9       bye
16    maybe
25    hello
36      bye
49    maybe
64    hello
81     good
Name: Fixed Name, dtype: object

In [228]:
col1.reset_index(drop=True,inplace=True,name='category')
col1

0    hello
1     good
2    hello
3      bye
4    maybe
5    hello
6      bye
7    maybe
8    hello
9     good
Name: category, dtype: object

In [229]:
col1.rename_axis('index',copy=False,inplace=True)
col1

index
0    hello
1     good
2    hello
3      bye
4    maybe
5    hello
6      bye
7    maybe
8    hello
9     good
Name: category, dtype: object

Now let's create a DF and append col1 to it

In [230]:
df=pd.DataFrame(np.random.randn(10,4),columns=list('abcd'))
pd.concat([df,col1],axis=1)
#or
df['category']=col1
df.head(2)

Unnamed: 0,a,b,c,d,category
0,0.475792,0.855135,-0.056463,1.103796,hello
1,-0.430023,-0.462592,0.287409,0.469179,good


###### Reindex/Rename

In [231]:
df.reindex(np.arange(1,4),axis=0,copy=False)

Unnamed: 0,a,b,c,d,category
1,-0.430023,-0.462592,0.287409,0.469179,good
2,0.405246,-0.284096,0.886093,0.896904,hello
3,-0.269263,-1.906759,-0.420642,1.757762,bye


In [232]:
df.reindex(index=np.arange(0,20,2),columns=list('bde'),copy=False,fill_value=100).head(7)

Unnamed: 0,b,d,e
0,0.855135,1.103796,100
2,-0.284096,0.896904,100
4,-0.732559,-1.517676,100
6,0.017914,-0.296345,100
8,-1.672452,-0.121698,100
10,100.0,100.0,100
12,100.0,100.0,100


In [233]:
df.reindex(index=np.arange(0,15),copy=False,method='ffill',limit=3,fill_value=100)

Unnamed: 0,a,b,c,d,category
0,0.475792,0.855135,-0.056463,1.103796,hello
1,-0.430023,-0.462592,0.287409,0.469179,good
2,0.405246,-0.284096,0.886093,0.896904,hello
3,-0.269263,-1.906759,-0.420642,1.757762,bye
4,-1.142284,-0.732559,0.566788,-1.517676,maybe
5,-0.961623,2.447174,0.311278,0.097899,hello
6,-0.183577,0.017914,0.349984,-0.296345,bye
7,-0.907963,0.590703,-0.036541,-0.833209,maybe
8,0.088129,-1.672452,-0.272646,-0.121698,hello
9,-0.388534,0.680432,1.10964,-0.991786,good


Let's check the indecies

In [234]:
df.index.get_level_values(0)

RangeIndex(start=0, stop=10, step=1)

In [235]:
df.dtypes

a           float64
b           float64
c           float64
d           float64
category     object
dtype: object

Let's drop column b

In [236]:
df.drop(columns='b',inplace=True)
df.head()

Unnamed: 0,a,c,d,category
0,0.475792,-0.056463,1.103796,hello
1,-0.430023,0.287409,0.469179,good
2,0.405246,0.886093,0.896904,hello
3,-0.269263,-0.420642,1.757762,bye
4,-1.142284,0.566788,-1.517676,maybe


Rename columns

In [237]:
df.rename(str.upper,axis=1,copy=False,inplace=True)
df

Unnamed: 0,A,C,D,CATEGORY
0,0.475792,-0.056463,1.103796,hello
1,-0.430023,0.287409,0.469179,good
2,0.405246,0.886093,0.896904,hello
3,-0.269263,-0.420642,1.757762,bye
4,-1.142284,0.566788,-1.517676,maybe
5,-0.961623,0.311278,0.097899,hello
6,-0.183577,0.349984,-0.296345,bye
7,-0.907963,-0.036541,-0.833209,maybe
8,0.088129,-0.272646,-0.121698,hello
9,-0.388534,1.10964,-0.991786,good


In [238]:
df.A.rename('F',inplace=True,copy=False,axis=1)
print(df.A.head())
df.head()

0    0.475792
1   -0.430023
2    0.405246
3   -0.269263
4   -1.142284
Name: F, dtype: float64


Unnamed: 0,A,C,D,CATEGORY
0,0.475792,-0.056463,1.103796,hello
1,-0.430023,0.287409,0.469179,good
2,0.405246,0.886093,0.896904,hello
3,-0.269263,-0.420642,1.757762,bye
4,-1.142284,0.566788,-1.517676,maybe


In [239]:
df.rename({'A':'F'},copy=False,inplace=True,axis=1)
df

Unnamed: 0,F,C,D,CATEGORY
0,0.475792,-0.056463,1.103796,hello
1,-0.430023,0.287409,0.469179,good
2,0.405246,0.886093,0.896904,hello
3,-0.269263,-0.420642,1.757762,bye
4,-1.142284,0.566788,-1.517676,maybe
5,-0.961623,0.311278,0.097899,hello
6,-0.183577,0.349984,-0.296345,bye
7,-0.907963,-0.036541,-0.833209,maybe
8,0.088129,-0.272646,-0.121698,hello
9,-0.388534,1.10964,-0.991786,good


In [240]:
df.rename_axis('Letters',axis=1,copy=False,inplace=True)
df

Letters,F,C,D,CATEGORY
0,0.475792,-0.056463,1.103796,hello
1,-0.430023,0.287409,0.469179,good
2,0.405246,0.886093,0.896904,hello
3,-0.269263,-0.420642,1.757762,bye
4,-1.142284,0.566788,-1.517676,maybe
5,-0.961623,0.311278,0.097899,hello
6,-0.183577,0.349984,-0.296345,bye
7,-0.907963,-0.036541,-0.833209,maybe
8,0.088129,-0.272646,-0.121698,hello
9,-0.388534,1.10964,-0.991786,good


In [241]:
df.rename_axis('Indecies',axis=0,index=True,copy=False)

Letters,F,C,D,CATEGORY
Indecies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.475792,-0.056463,1.103796,hello
1,-0.430023,0.287409,0.469179,good
2,0.405246,0.886093,0.896904,hello
3,-0.269263,-0.420642,1.757762,bye
4,-1.142284,0.566788,-1.517676,maybe
5,-0.961623,0.311278,0.097899,hello
6,-0.183577,0.349984,-0.296345,bye
7,-0.907963,-0.036541,-0.833209,maybe
8,0.088129,-0.272646,-0.121698,hello
9,-0.388534,1.10964,-0.991786,good


In [242]:
df.rename_axis(index='Row_ID',columns='Col Names',copy=False,inplace=True)
df

Col Names,F,C,D,CATEGORY
Row_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.475792,-0.056463,1.103796,hello
1,-0.430023,0.287409,0.469179,good
2,0.405246,0.886093,0.896904,hello
3,-0.269263,-0.420642,1.757762,bye
4,-1.142284,0.566788,-1.517676,maybe
5,-0.961623,0.311278,0.097899,hello
6,-0.183577,0.349984,-0.296345,bye
7,-0.907963,-0.036541,-0.833209,maybe
8,0.088129,-0.272646,-0.121698,hello
9,-0.388534,1.10964,-0.991786,good


In [243]:
df.set_axis(list('abcd'),axis=1,inplace=True)
df

Unnamed: 0_level_0,a,b,c,d
Row_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.475792,-0.056463,1.103796,hello
1,-0.430023,0.287409,0.469179,good
2,0.405246,0.886093,0.896904,hello
3,-0.269263,-0.420642,1.757762,bye
4,-1.142284,0.566788,-1.517676,maybe
5,-0.961623,0.311278,0.097899,hello
6,-0.183577,0.349984,-0.296345,bye
7,-0.907963,-0.036541,-0.833209,maybe
8,0.088129,-0.272646,-0.121698,hello
9,-0.388534,1.10964,-0.991786,good


Capitalize the col labels and name them as Columns

In [244]:
df.rename(str.upper,axis=1,copy=False,inplace=True)
df.rename_axis('Columns',axis=1,copy=False,inplace=True)
df

Columns,A,B,C,D
Row_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.475792,-0.056463,1.103796,hello
1,-0.430023,0.287409,0.469179,good
2,0.405246,0.886093,0.896904,hello
3,-0.269263,-0.420642,1.757762,bye
4,-1.142284,0.566788,-1.517676,maybe
5,-0.961623,0.311278,0.097899,hello
6,-0.183577,0.349984,-0.296345,bye
7,-0.907963,-0.036541,-0.833209,maybe
8,0.088129,-0.272646,-0.121698,hello
9,-0.388534,1.10964,-0.991786,good


###### Slicing and Selection by labels

In [252]:
df.loc[:,'A':'C']

Columns,A,B,C
Row_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.475792,-0.056463,1.103796
1,-0.430023,0.287409,0.469179
2,0.405246,0.886093,0.896904
3,-0.269263,-0.420642,1.757762
4,-1.142284,0.566788,-1.517676
5,-0.961623,0.311278,0.097899
6,-0.183577,0.349984,-0.296345
7,-0.907963,-0.036541,-0.833209
8,0.088129,-0.272646,-0.121698
9,-0.388534,1.10964,-0.991786


In [253]:
df.iloc[::2,::-1]

Columns,D,C,B,A
Row_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,hello,1.103796,-0.056463,0.475792
2,hello,0.896904,0.886093,0.405246
4,maybe,-1.517676,0.566788,-1.142284
6,bye,-0.296345,0.349984,-0.183577
8,hello,-0.121698,-0.272646,0.088129


In [254]:
df[:4]

Columns,A,B,C,D
Row_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.475792,-0.056463,1.103796,hello
1,-0.430023,0.287409,0.469179,good
2,0.405246,0.886093,0.896904,hello
3,-0.269263,-0.420642,1.757762,bye


In [258]:
df.iat[3,3]

'bye'

In [259]:
df.at[2,'D']

'hello'

In [265]:
df.A.sample(n=3,replace=True)

Row_ID
2    0.405246
2    0.405246
8    0.088129
Name: A, dtype: float64

In [266]:
df.sample(n=3,replace=True)

Columns,A,B,C,D
Row_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,0.405246,0.886093,0.896904,hello
6,-0.183577,0.349984,-0.296345,bye
0,0.475792,-0.056463,1.103796,hello


###### Boolean Selections

Check if we have ANY nulls

In [270]:
df.all()

Columns
A    True
B    True
C    True
D    True
dtype: bool

In [272]:
df.A.all()

True

select values from the table where col A<0

In [282]:
df[df.A<0]

Columns,A,B,C,D
Row_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,-0.430023,0.287409,0.469179,good
3,-0.269263,-0.420642,1.757762,bye
4,-1.142284,0.566788,-1.517676,maybe
5,-0.961623,0.311278,0.097899,hello
6,-0.183577,0.349984,-0.296345,bye
7,-0.907963,-0.036541,-0.833209,maybe
9,-0.388534,1.10964,-0.991786,good


Same with map now

In [283]:
df[df.A.map(lambda x: x<0)]

Columns,A,B,C,D
Row_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,-0.430023,0.287409,0.469179,good
3,-0.269263,-0.420642,1.757762,bye
4,-1.142284,0.566788,-1.517676,maybe
5,-0.961623,0.311278,0.097899,hello
6,-0.183577,0.349984,-0.296345,bye
7,-0.907963,-0.036541,-0.833209,maybe
9,-0.388534,1.10964,-0.991786,good


Replace those values in col A with 0 if A<0

In [285]:
df.A.mask(df.A<0,0)

Row_ID
0    0.475792
1    0.000000
2    0.405246
3    0.000000
4    0.000000
5    0.000000
6    0.000000
7    0.000000
8    0.088129
9    0.000000
Name: A, dtype: float64

In [286]:
df.A.where(df.A>0,0)

Row_ID
0    0.475792
1    0.000000
2    0.405246
3    0.000000
4    0.000000
5    0.000000
6    0.000000
7    0.000000
8    0.088129
9    0.000000
Name: A, dtype: float64

In [289]:
df.A[df.A.map(lambda x: x<0)]=0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [297]:
df.isin([0]).head()

Columns,A,B,C,D
Row_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,False,False,False,False
1,True,False,False,False
2,False,False,False,False
3,True,False,False,False
4,True,False,False,False


# Pivot, Merge and MultiIndex Slicing

###### Creating a multiIndex

In [302]:
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]

tuples=list(zip(*arrays))
index=pd.MultiIndex.from_tuples(tuples,names=['First','Second'])
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['First', 'Second'])

In [303]:
s=pd.Series(np.random.randint(0,100,len(index)),index=index)
s

First  Second
bar    one       95
       two       69
baz    one       82
       two       29
foo    one       60
       two       17
qux    one       17
       two       87
dtype: int32

###### Creating DF for merging and pivoting

In [304]:
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']},
index=[0, 1, 2, 3])

df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
'B': ['B4', 'B5', 'B6', 'B7'],
'C': ['C4', 'C5', 'C6', 'C7'],
'D': ['D4', 'D5', 'D6', 'D7']},
index=[4, 5, 6, 7])

df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
'B': ['B8', 'B9', 'B10', 'B11'],
'C': ['C8', 'C9', 'C10', 'C11'],
'D': ['D8', 'D9', 'D10', 'D11']},
index=[8, 9, 10, 11])

display('df1','df2','df3')

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3

Unnamed: 0,A,B,C,D
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7

Unnamed: 0,A,B,C,D
8,A8,B8,C8,D8
9,A9,B9,C9,D9
10,A10,B10,C10,D10
11,A11,B11,C11,D11


###### Concat the tables

In [305]:
pd.concat([df1,df2],axis=0,keys=['df1','df2'],names=['first','second'])

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df1,0,A0,B0,C0,D0
df1,1,A1,B1,C1,D1
df1,2,A2,B2,C2,D2
df1,3,A3,B3,C3,D3
df2,4,A4,B4,C4,D4
df2,5,A5,B5,C5,D5
df2,6,A6,B6,C6,D6
df2,7,A7,B7,C7,D7


In [307]:
pd.concat([df1,df2],axis=0)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [None]:
pd

In [308]:
pd.concat([df1,df2],axis=0)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
