__[Open and try this file online (Colab)](https://colab.research.google.com/github/djekra/pandasklar/blob/master/jupyter/21_Manage_Columns.ipynb)__

# Manage columns
* `drop_cols`: Drops a column or a list of columns. Does not throw an error if the column does not exist.
* `move_cols`: Reorders the columns of a DataFrame. The specified columns are moved to a numerical position or behind a named column.
* `rename_col`: Renames a column of a DataFrame. If you try to rename a column again, no error is thrown (better for the workflow in jupyter notebooks).
* `col_names`: Selects column names based on the result of `analyse_cols`. Useful for applying a method to specific columns of a DataFrame.
* `write_empty_col`: Writes empty iterables into a column.

In [1]:
# blab init
try:
    import blab
except ImportError as e:
    !pip install blab
    import blab    
startup_notebook = blab.blab_startup()
%run $startup_notebook 

blab init
environment['in_colab']     = False
environment['dropbox_path'] = /home/me/Data_Linux/Dropbox
environment['lib_path']     = /home/me/Data_Linux/Dropbox/31_Projekte/01_Python/libs
Start Time: 21:42:08


In [2]:
import pandas     as pd 
import bpyth      as bpy

# pandasklar
try:
    import pandasklar as pak 
except ImportError as e:
    !pip install pandasklar
    import pandasklar as pak   
    
# verbose
pak.Config.set('VERBOSE', True)

# copy_on_write
pd.set_option("mode.copy_on_write", True)

VERBOSE = True
--> setting verbose=True as default for all pandasklar functions



In [3]:
# Test data: build the sample DataFrame that all following cells work on
df = pak.people()
df

Unnamed: 0,first_name,age,age_class,postal_code,birthplace,secret,features,history
0,Wilhelm,33,30,66354,Berlin,YqoYA,"{H, J, 2, k}","[b, b, a, b]"
1,Willi,31,30,30423,Bremen,iVwiV,{C},"[a, b, c]"
2,Lilli,36,30,77410,Bremen,a6a6xuaqv,"{L, h}","[A, A, A]"
3,Dennis,31,30,74666,,v7ÄPo,"{L, Q, C, E, S}",[]
4,Bastian,34,30,21708,Bremen,dnKÜm,"{p, Y, E}","[c, b, a]"
...,...,...,...,...,...,...,...,...
95,Stefanie,27,20,50068,Berlin,QEMZd,"{b, a, F, m}","[c, b, a]"
96,Henry,30,30,48288,,Öe3g4ÄsEv,{T},"[A, B, C, C]"
97,Jakob,30,30,37937,Berlin,E9uä9üWÖV,"{A, h}","[A, C, C, B]"
98,John,25,20,42791,Berlin,FGngrqÄxw6,"{m, a}","[c, b, a]"


## drop_cols()

In [4]:
# Show signature and docstring of drop_cols
?pak.drop_cols

[0;31mSignature:[0m [0mpak[0m[0;34m.[0m[0mdrop_cols[0m[0;34m([0m[0mdf[0m[0;34m,[0m [0mcolnames[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Drops a column or a list of columns.
Does not throw an error if the column does not exist.
[0;31mFile:[0m      ~/Data_Linux/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/pandas.py
[0;31mType:[0m      function


In [5]:
# Drop a single column, given by name
df = pak.drop_cols(df,'age_class')
df

Unnamed: 0,first_name,age,postal_code,birthplace,secret,features,history
0,Wilhelm,33,66354,Berlin,YqoYA,"{H, J, 2, k}","[b, b, a, b]"
1,Willi,31,30423,Bremen,iVwiV,{C},"[a, b, c]"
2,Lilli,36,77410,Bremen,a6a6xuaqv,"{L, h}","[A, A, A]"
3,Dennis,31,74666,,v7ÄPo,"{L, Q, C, E, S}",[]
4,Bastian,34,21708,Bremen,dnKÜm,"{p, Y, E}","[c, b, a]"
...,...,...,...,...,...,...,...
95,Stefanie,27,50068,Berlin,QEMZd,"{b, a, F, m}","[c, b, a]"
96,Henry,30,48288,,Öe3g4ÄsEv,{T},"[A, B, C, C]"
97,Jakob,30,37937,Berlin,E9uä9üWÖV,"{A, h}","[A, C, C, B]"
98,John,25,42791,Berlin,FGngrqÄxw6,"{m, a}","[c, b, a]"


In [6]:
# drop_cols avoids error messages: 'POSTALCODE' does not exist and is skipped,
# while the existing column 'postal_code' is dropped
df = pak.drop_cols(df,['postal_code','POSTALCODE'])
df

Unnamed: 0,first_name,age,birthplace,secret,features,history
0,Wilhelm,33,Berlin,YqoYA,"{H, J, 2, k}","[b, b, a, b]"
1,Willi,31,Bremen,iVwiV,{C},"[a, b, c]"
2,Lilli,36,Bremen,a6a6xuaqv,"{L, h}","[A, A, A]"
3,Dennis,31,,v7ÄPo,"{L, Q, C, E, S}",[]
4,Bastian,34,Bremen,dnKÜm,"{p, Y, E}","[c, b, a]"
...,...,...,...,...,...,...
95,Stefanie,27,Berlin,QEMZd,"{b, a, F, m}","[c, b, a]"
96,Henry,30,,Öe3g4ÄsEv,{T},"[A, B, C, C]"
97,Jakob,30,Berlin,E9uä9üWÖV,"{A, h}","[A, C, C, B]"
98,John,25,Berlin,FGngrqÄxw6,"{m, a}","[c, b, a]"


## move_cols()

In [7]:
# Show signature and docstring of move_cols
?pak.move_cols

[0;31mSignature:[0m [0mpak[0m[0;34m.[0m[0mmove_cols[0m[0;34m([0m[0mdf[0m[0;34m,[0m [0mcolnames[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mto[0m[0;34m=[0m[0;36m0[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Reorders the columns of a DataFrame. 
The specified columns are moved to a numerical position or behind a named column.
If no arguments are given, sorts the columns lexicographically.
 * colnames: String or list of column names
 * to:       0: move to the front
            -1: move to the back
           <i>: move to position i (buggy)
     <colname>: move behind a specific column
[0;31mFile:[0m      ~/Data_Linux/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/pandas.py
[0;31mType:[0m      function


In [8]:
# move to the front (default to=0); the non-existent 'i_dont_exist' does not raise an error
df = pak.move_cols( df, ['secret','birthplace','i_dont_exist'])
df

Unnamed: 0,secret,birthplace,first_name,age,features,history
0,YqoYA,Berlin,Wilhelm,33,"{H, J, 2, k}","[b, b, a, b]"
1,iVwiV,Bremen,Willi,31,{C},"[a, b, c]"
2,a6a6xuaqv,Bremen,Lilli,36,"{L, h}","[A, A, A]"
3,v7ÄPo,,Dennis,31,"{L, Q, C, E, S}",[]
4,dnKÜm,Bremen,Bastian,34,"{p, Y, E}","[c, b, a]"
...,...,...,...,...,...,...
95,QEMZd,Berlin,Stefanie,27,"{b, a, F, m}","[c, b, a]"
96,Öe3g4ÄsEv,,Henry,30,{T},"[A, B, C, C]"
97,E9uä9üWÖV,Berlin,Jakob,30,"{A, h}","[A, C, C, B]"
98,FGngrqÄxw6,Berlin,John,25,"{m, a}","[c, b, a]"


In [9]:
# move to the end: to=-1 places the listed columns behind all others
df = pak.move_cols( df, ['secret','birthplace','i_dont_exist'], to=-1)
df

Unnamed: 0,first_name,age,features,history,secret,birthplace
0,Wilhelm,33,"{H, J, 2, k}","[b, b, a, b]",YqoYA,Berlin
1,Willi,31,{C},"[a, b, c]",iVwiV,Bremen
2,Lilli,36,"{L, h}","[A, A, A]",a6a6xuaqv,Bremen
3,Dennis,31,"{L, Q, C, E, S}",[],v7ÄPo,
4,Bastian,34,"{p, Y, E}","[c, b, a]",dnKÜm,Bremen
...,...,...,...,...,...,...
95,Stefanie,27,"{b, a, F, m}","[c, b, a]",QEMZd,Berlin
96,Henry,30,{T},"[A, B, C, C]",Öe3g4ÄsEv,
97,Jakob,30,"{A, h}","[A, C, C, B]",E9uä9üWÖV,Berlin
98,John,25,"{m, a}","[c, b, a]",FGngrqÄxw6,Berlin


In [10]:
# move behind a specific column: to=<colname> places the listed columns directly after 'age'
df = pak.move_cols( df, ['secret','birthplace','i_dont_exist'], to='age')
df

Unnamed: 0,first_name,age,secret,birthplace,features,history
0,Wilhelm,33,YqoYA,Berlin,"{H, J, 2, k}","[b, b, a, b]"
1,Willi,31,iVwiV,Bremen,{C},"[a, b, c]"
2,Lilli,36,a6a6xuaqv,Bremen,"{L, h}","[A, A, A]"
3,Dennis,31,v7ÄPo,,"{L, Q, C, E, S}",[]
4,Bastian,34,dnKÜm,Bremen,"{p, Y, E}","[c, b, a]"
...,...,...,...,...,...,...
95,Stefanie,27,QEMZd,Berlin,"{b, a, F, m}","[c, b, a]"
96,Henry,30,Öe3g4ÄsEv,,{T},"[A, B, C, C]"
97,Jakob,30,E9uä9üWÖV,Berlin,"{A, h}","[A, C, C, B]"
98,John,25,FGngrqÄxw6,Berlin,"{m, a}","[c, b, a]"


In [11]:
# move to a specific position: to=1 inserts the columns at index 1
# NOTE: the move_cols docstring marks moving to a numeric position <i> as buggy
df = pak.move_cols( df, ['secret','birthplace','i_dont_exist'], to=1)
df

Unnamed: 0,first_name,secret,birthplace,age,features,history
0,Wilhelm,YqoYA,Berlin,33,"{H, J, 2, k}","[b, b, a, b]"
1,Willi,iVwiV,Bremen,31,{C},"[a, b, c]"
2,Lilli,a6a6xuaqv,Bremen,36,"{L, h}","[A, A, A]"
3,Dennis,v7ÄPo,,31,"{L, Q, C, E, S}",[]
4,Bastian,dnKÜm,Bremen,34,"{p, Y, E}","[c, b, a]"
...,...,...,...,...,...,...
95,Stefanie,QEMZd,Berlin,27,"{b, a, F, m}","[c, b, a]"
96,Henry,Öe3g4ÄsEv,,30,{T},"[A, B, C, C]"
97,Jakob,E9uä9üWÖV,Berlin,30,"{A, h}","[A, C, C, B]"
98,John,FGngrqÄxw6,Berlin,25,"{m, a}","[c, b, a]"


## rename_col()

In [12]:
# Show signature and docstring of rename_col
?pak.rename_col

[0;31mSignature:[0m [0mpak[0m[0;34m.[0m[0mrename_col[0m[0;34m([0m[0mdf[0m[0;34m,[0m [0mname_from[0m[0;34m,[0m [0mname_to[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Renames a column of a DataFrame.
If you try to rename a column again, no error is thrown (better for the workflow in jupyter notebooks).
[0;31mFile:[0m      ~/Data_Linux/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/pandas.py
[0;31mType:[0m      function


In [13]:
# do rename: column 'age' becomes 'AGE'
df = pak.rename_col(df, 'age', 'AGE')
df

Unnamed: 0,first_name,secret,birthplace,AGE,features,history
0,Wilhelm,YqoYA,Berlin,33,"{H, J, 2, k}","[b, b, a, b]"
1,Willi,iVwiV,Bremen,31,{C},"[a, b, c]"
2,Lilli,a6a6xuaqv,Bremen,36,"{L, h}","[A, A, A]"
3,Dennis,v7ÄPo,,31,"{L, Q, C, E, S}",[]
4,Bastian,dnKÜm,Bremen,34,"{p, Y, E}","[c, b, a]"
...,...,...,...,...,...,...
95,Stefanie,QEMZd,Berlin,27,"{b, a, F, m}","[c, b, a]"
96,Henry,Öe3g4ÄsEv,,30,{T},"[A, B, C, C]"
97,Jakob,E9uä9üWÖV,Berlin,30,"{A, h}","[A, C, C, B]"
98,John,FGngrqÄxw6,Berlin,25,"{m, a}","[c, b, a]"


In [14]:
# do it again >> no error: 'age' no longer exists, and the result is displayed with 'AGE' unchanged
pak.rename_col(df, 'age', 'AGE')

Unnamed: 0,first_name,secret,birthplace,AGE,features,history
0,Wilhelm,YqoYA,Berlin,33,"{H, J, 2, k}","[b, b, a, b]"
1,Willi,iVwiV,Bremen,31,{C},"[a, b, c]"
2,Lilli,a6a6xuaqv,Bremen,36,"{L, h}","[A, A, A]"
3,Dennis,v7ÄPo,,31,"{L, Q, C, E, S}",[]
4,Bastian,dnKÜm,Bremen,34,"{p, Y, E}","[c, b, a]"
...,...,...,...,...,...,...
95,Stefanie,QEMZd,Berlin,27,"{b, a, F, m}","[c, b, a]"
96,Henry,Öe3g4ÄsEv,,30,{T},"[A, B, C, C]"
97,Jakob,E9uä9üWÖV,Berlin,30,"{A, h}","[A, C, C, B]"
98,John,FGngrqÄxw6,Berlin,25,"{m, a}","[c, b, a]"


## col_names()

In [15]:
# Show signature and docstring of col_names
?pak.col_names

[0;31mSignature:[0m
[0mpak[0m[0;34m.[0m[0mcol_names[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdf[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0monly[0m[0;34m=[0m[0;34m''[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mwithout[0m[0;34m=[0m[0;34m'XXXXXX'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mas_list[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mquery[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msort[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Selects column names based on analyse_cols. Useful to apply a method to specific columns of a DataFrame.
* only:     Only column names whose datatype begins like this.
* without:  Without column names whose datatype starts like this
* as_list:  Output result as list (otherwise as DataFrame, useful for development and control)
* query:    additional conditions
* sort:     sorted by nunique
d

In [16]:
# Have a look at the analyse_cols overview: col_names selects column names based on it
pak.analyse_cols(df)

Unnamed: 0,col_name,datatype_instance,datatype,datatype_short,datatype_suggest,is_numeric,is_string,is_hashable,nan_allowed,mem_usage,ntypes,nunique,nnan,ndups,n,vmin,vmean,vmedian,vmax,vsum
0,__index__,int64,np.int64,int64,np.int8,True,False,True,False,80.0 B,1,100,0,0,100,0.0,49.5,49.5,99.0,4950.0
1,first_name,str,pd.string,string,,False,True,True,True,622.0 B,1,70,0,30,100,Aileen,,,Yvonne,
2,secret,str,pd.string,string,,False,True,True,True,812.0 B,1,100,0,0,100,0SÄINB,,,üwkkNSeeI,
3,birthplace,str,pd.string,string,,False,True,True,True,607.0 B,1,2,28,70,100,Berlin,,,Bremen,
4,AGE,int8,pd.Int8,Int8,,True,False,True,True,20.0 B,1,20,0,80,100,21.0,30.54,31.0,42.0,3054.0
5,features,set,object,object,,False,False,False,True,2.0 KB,1,79,0,21,100,{},,,"{L, h}",
6,history,list,object,object,,False,False,False,True,704.0 B,1,9,0,91,100,[],,,"[c, b, a]",


In [17]:
# select some columns: datatype starts with 'int', no NaN values, result returned as a list
pak.col_names(df, only='int', query='nnan == 0', as_list=True )

['AGE']

In [18]:
# Example: fillna on all string columns that contain NaN values
df2 = df.copy()  # work on a copy so that df stays untouched
spalten = pak.col_names(df2, only='str', query='nnan > 0')

print('Spalten:',spalten)
print('any_nan before:', pak.any_nan(df2[spalten]))

# apply: replace NaN by the empty string in the selected columns only
df2[spalten] = df2[spalten].fillna('')
print('any_nan after:', pak.any_nan(df2[spalten]))

Spalten: ['birthplace']
any_nan before: True
any_nan after: False


In [19]:
# Example 2: astype on all integer columns that fit into int8
# First: look up the target data type; xmin and xmax give its value range
pak.type_info('int8').info()

{'instance1': None,
 'instance2': None,
 'name': 'np.int8',
 'framework': 'np',
 'name_short': 'int8',
 'name_long': 'np.int8',
 'class_object': numpy.int8,
 'is_hashable': True,
 'nan_allowed': False,
 'name_instance': '',
 'xmin': -128,
 'xmax': 127}

In [20]:
# astype on all integer columns whose values fit into int8
df2 = df.copy()  # work on a copy so that df stays untouched
# int8 ranges up to 127 inclusive (xmax of type_info('int8')), so a maximum of exactly 127
# must still qualify: compare with <=, not <. Only non-negative, NaN-free columns are selected.
spalten = pak.col_names(df2, only='int', query='vmin >= 0 and vmax <= 127 and nnan == 0')
spalten

['AGE']

In [21]:
# Convert the selected columns; class_object of type_info('int8') is numpy.int8
df2[spalten] = df2[spalten].astype( pak.type_info('int8').class_object  )

In [22]:
# Check the result: analyse only the integer columns of the converted copy
spalten = pak.col_names(df2, only='int')
pak.analyse_cols(df2[spalten])

Unnamed: 0,col_name,datatype_instance,datatype,datatype_short,datatype_suggest,is_numeric,is_string,is_hashable,nan_allowed,mem_usage,ntypes,nunique,nnan,ndups,n,vmin,vmean,vmedian,vmax,vsum
0,__index__,int64,np.int64,int64,np.int8,True,False,True,False,80.0 B,1,100,0,0,100,0.0,49.5,49.5,99.0,4950.0
1,AGE,int8,np.int8,int8,pd.Int8,True,False,True,False,10.0 B,1,20,0,80,100,21.0,30.54,31.0,42.0,3054.0


## write_empty_col()

In [23]:
# Show signature and docstring of write_empty_col
?pak.write_empty_col

[0;31mSignature:[0m [0mpak[0m[0;34m.[0m[0mwrite_empty_col[0m[0;34m([0m[0mdf[0m[0;34m,[0m [0mcol_name[0m[0;34m,[0m [0mcontent[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Writes empty iterables into a column and sets datatype.
* content: Can be 'list','set','dict','defaultdict', 'string' or any empty iterable.
Example: write_empty_col(df,'mycol','list') writes empty lists in mycol.
[0;31mFile:[0m      ~/Data_Linux/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/pandas.py
[0;31mType:[0m      function


In [24]:
# content given as a type name: 'mycol' is filled with empty defaultdicts
# (the result is only displayed here, not assigned)
pak.write_empty_col(df,'mycol','defaultdict')

Unnamed: 0,first_name,secret,birthplace,AGE,features,history,mycol
0,Wilhelm,YqoYA,Berlin,33,"{H, J, 2, k}","[b, b, a, b]",{}
1,Willi,iVwiV,Bremen,31,{C},"[a, b, c]",{}
2,Lilli,a6a6xuaqv,Bremen,36,"{L, h}","[A, A, A]",{}
3,Dennis,v7ÄPo,,31,"{L, Q, C, E, S}",[],{}
4,Bastian,dnKÜm,Bremen,34,"{p, Y, E}","[c, b, a]",{}
...,...,...,...,...,...,...,...
95,Stefanie,QEMZd,Berlin,27,"{b, a, F, m}","[c, b, a]",{}
96,Henry,Öe3g4ÄsEv,,30,{T},"[A, B, C, C]",{}
97,Jakob,E9uä9üWÖV,Berlin,30,"{A, h}","[A, C, C, B]",{}
98,John,FGngrqÄxw6,Berlin,25,"{m, a}","[c, b, a]",{}


In [25]:
# content given as an empty iterable instance: 'mycol' is filled with empty lists
# (the result is only displayed here, not assigned)
pak.write_empty_col(df,'mycol',list())

Unnamed: 0,first_name,secret,birthplace,AGE,features,history,mycol
0,Wilhelm,YqoYA,Berlin,33,"{H, J, 2, k}","[b, b, a, b]",[]
1,Willi,iVwiV,Bremen,31,{C},"[a, b, c]",[]
2,Lilli,a6a6xuaqv,Bremen,36,"{L, h}","[A, A, A]",[]
3,Dennis,v7ÄPo,,31,"{L, Q, C, E, S}",[],[]
4,Bastian,dnKÜm,Bremen,34,"{p, Y, E}","[c, b, a]",[]
...,...,...,...,...,...,...,...
95,Stefanie,QEMZd,Berlin,27,"{b, a, F, m}","[c, b, a]",[]
96,Henry,Öe3g4ÄsEv,,30,{T},"[A, B, C, C]",[]
97,Jakob,E9uä9üWÖV,Berlin,30,"{A, h}","[A, C, C, B]",[]
98,John,FGngrqÄxw6,Berlin,25,"{m, a}","[c, b, a]",[]
