__[Open and try this file online (Colab)](https://colab.research.google.com/github/djekra/pandasklar/blob/master/jupyter/22_Manage_Rows.ipynb)__

# Manage rows
* `drop_multiindex`: Converts any MultiIndex to normal columns and resets the index. Works with MultiIndex in Series or DataFrames, in rows and in columns.
* `reset_index`: Creates a new, unnamed index. If `keep_as` is given, the old index is preserved as a column with this name. Otherwise the old index is dropped.
* `rename_index`: Renames the index.
* `drop_rows`: Drops rows identified by a binary mask, verbose if wanted.
* `move_rows`: Moves rows identified by a binary mask from one dataframe to another (e.g. into a trash).<br>
   The target dataframe gets an additional message column by default (to identify why the rows were moved). Verbose if wanted. 
* `add_rows`: Like concat, with additional features only_new and verbose.

In [1]:
# blab init
try:
    import blab
except ImportError as e:
    !pip install blab
    import blab    
startup_notebook = blab.blab_startup()
%run $startup_notebook 

blab init
environment['in_colab']     = False
environment['dropbox_path'] = /home/me/Data_Linux/Dropbox
environment['lib_path']     = /home/me/Data_Linux/Dropbox/31_Projekte/01_Python/libs
Start Time: 21:43:00


In [2]:
import pandas     as pd 
import bpyth      as bpy

# pandasklar
try:
    import pandasklar as pak 
except ImportError as e:
    !pip install pandasklar
    import pandasklar as pak   
    
# verbose
pak.Config.set('VERBOSE', True)

# copy_on_write
pd.set_option("mode.copy_on_write", True)

VERBOSE = True
--> setting verbose=True as default for all pandasklar functions



In [3]:
# Test data
df = pak.people(1000)
df

Unnamed: 0,first_name,age,age_class,postal_code,birthplace,secret,features,history
0,Elisabeth,29,20,16124,Bremen,Ai1au,"{e, s, m, o, t}","[A, C, C, B]"
1,Kira,28,20,78792,Berlin,oCoiw2or,"{b, C, P}","[A, A, A]"
2,Tanja,28,20,14686,Bremen,ighxfpNOR6,"{U, u, z}",[]
3,Marko,34,30,83425,,DAVXiÜOYVs,"{V, N, k, z}","[A, B, C]"
4,Elli,28,20,37741,Berlin,GlrdjitN,"{m, h}","[b, b, a, b]"
...,...,...,...,...,...,...,...,...
995,Christina,41,40,54858,Bremen,xXpe1FwE,"{L, F, D, C, T}","[A, A, A]"
996,Walter,33,30,27094,Bremen,4BlOVSzG,"{Q, W}","[A, B, C, C]"
997,Elena,30,30,65212,,vnaÜN,"{v, N, s, D}","[A, B, C]"
998,Helene,24,20,,Berlin,tIuToS,"{3, u}","[A, C, C, B]"


## === Row Indexes ===

### rename_index()

In [4]:
# Replace the index with random 3-character strings (one label per row),
# so that there is a "trashy" index to rename and reset in the next cells.
# p_dup=0 presumably means that no duplicate labels are generated -- TODO confirm.
df.index = pak.random_series(df.shape[0], 'string', len_min=3, len_max=3, p_dup=0)
df

Unnamed: 0_level_0,first_name,age,age_class,postal_code,birthplace,secret,features,history
rnd_string,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
SÄa,Elisabeth,29,20,16124,Bremen,Ai1au,"{e, s, m, o, t}","[A, C, C, B]"
äQR,Kira,28,20,78792,Berlin,oCoiw2or,"{b, C, P}","[A, A, A]"
4yA,Tanja,28,20,14686,Bremen,ighxfpNOR6,"{U, u, z}",[]
Dvn,Marko,34,30,83425,,DAVXiÜOYVs,"{V, N, k, z}","[A, B, C]"
Xat,Elli,28,20,37741,Berlin,GlrdjitN,"{m, h}","[b, b, a, b]"
...,...,...,...,...,...,...,...,...
Ü7R,Christina,41,40,54858,Bremen,xXpe1FwE,"{L, F, D, C, T}","[A, A, A]"
d0ö,Walter,33,30,27094,Bremen,4BlOVSzG,"{Q, W}","[A, B, C, C]"
out,Elena,30,30,65212,,vnaÜN,"{v, N, s, D}","[A, B, C]"
eYr,Helene,24,20,,Berlin,tIuToS,"{3, u}","[A, C, C, B]"


In [5]:
df = pak.rename_index(df,'index_new')
df

Unnamed: 0_level_0,first_name,age,age_class,postal_code,birthplace,secret,features,history
index_new,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
SÄa,Elisabeth,29,20,16124,Bremen,Ai1au,"{e, s, m, o, t}","[A, C, C, B]"
äQR,Kira,28,20,78792,Berlin,oCoiw2or,"{b, C, P}","[A, A, A]"
4yA,Tanja,28,20,14686,Bremen,ighxfpNOR6,"{U, u, z}",[]
Dvn,Marko,34,30,83425,,DAVXiÜOYVs,"{V, N, k, z}","[A, B, C]"
Xat,Elli,28,20,37741,Berlin,GlrdjitN,"{m, h}","[b, b, a, b]"
...,...,...,...,...,...,...,...,...
Ü7R,Christina,41,40,54858,Bremen,xXpe1FwE,"{L, F, D, C, T}","[A, A, A]"
d0ö,Walter,33,30,27094,Bremen,4BlOVSzG,"{Q, W}","[A, B, C, C]"
out,Elena,30,30,65212,,vnaÜN,"{v, N, s, D}","[A, B, C]"
eYr,Helene,24,20,,Berlin,tIuToS,"{3, u}","[A, C, C, B]"


### reset_index()

In [6]:
?pak.reset_index

[0;31mSignature:[0m [0mpak[0m[0;34m.[0m[0mreset_index[0m[0;34m([0m[0mdf[0m[0;34m,[0m [0mkeep_as[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Creates a new, unnamed index.
* keep_as: If keep_as is given, the old index is preserved as a row with this name.
Otherwise the old index is dropped.
[0;31mFile:[0m      ~/Data_Linux/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/pandas.py
[0;31mType:[0m      function


In [7]:
df = pak.reset_index(df)
df

Unnamed: 0,first_name,age,age_class,postal_code,birthplace,secret,features,history
0,Elisabeth,29,20,16124,Bremen,Ai1au,"{e, s, m, o, t}","[A, C, C, B]"
1,Kira,28,20,78792,Berlin,oCoiw2or,"{b, C, P}","[A, A, A]"
2,Tanja,28,20,14686,Bremen,ighxfpNOR6,"{U, u, z}",[]
3,Marko,34,30,83425,,DAVXiÜOYVs,"{V, N, k, z}","[A, B, C]"
4,Elli,28,20,37741,Berlin,GlrdjitN,"{m, h}","[b, b, a, b]"
...,...,...,...,...,...,...,...,...
995,Christina,41,40,54858,Bremen,xXpe1FwE,"{L, F, D, C, T}","[A, A, A]"
996,Walter,33,30,27094,Bremen,4BlOVSzG,"{Q, W}","[A, B, C, C]"
997,Elena,30,30,65212,,vnaÜN,"{v, N, s, D}","[A, B, C]"
998,Helene,24,20,,Berlin,tIuToS,"{3, u}","[A, C, C, B]"


### drop_multiindex()

In [8]:
?pak.drop_multiindex

[0;31mSignature:[0m [0mpak[0m[0;34m.[0m[0mdrop_multiindex[0m[0;34m([0m[0mdf[0m[0;34m,[0m [0mverbose[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Converts any MultiIndex to normal columns and resets the index. 
Works with MultiIndex in Series or DataFrames, in rows and in columns.
[0;31mFile:[0m      ~/Data_Linux/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/pandas.py
[0;31mType:[0m      function


## === Rows ===

### drop_rows()

In [9]:
?pak.drop_rows

[0;31mSignature:[0m [0mpak[0m[0;34m.[0m[0mdrop_rows[0m[0;34m([0m[0mdf[0m[0;34m,[0m [0mmask[0m[0;34m,[0m [0mverbose[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Drops rows identified by a binary mask.
* verbose: True if you want to print how many rows are droped.
(If you want to delete the rows to a trash, use move_rows.)
[0;31mFile:[0m      ~/Data_Linux/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/pandas.py
[0;31mType:[0m      function


In [10]:
# create mask: most frequent first_name
mask = df.first_name == pak.most_freq_elt(df.first_name)
df[mask]

Unnamed: 0,first_name,age,age_class,postal_code,birthplace,secret,features,history
407,Anja,23,20,27989,,3A75fe9,"{c, 1}","[b, b, a, b]"
455,Anja,38,30,96842,Berlin,CnOaYuüJh,"{D, d, a}",[]
634,Anja,24,20,13510,Berlin,OadVql3Ur,"{e, V, D}",[]
764,Anja,25,20,37536,Berlin,aF9Ä6cJe,"{g, 5, i, V, j}","[A, B, C]"
810,Anja,35,30,21075,Berlin,ewraxBSf89,"{g, U}","[c, b, a]"
989,Anja,37,30,98954,Berlin,5IFGg,{r},[]


In [11]:
# just delete them
df = pak.drop_rows(df,mask)

Delete 6 rows from 1000


### move_rows()

In [12]:
?pak.move_rows

[0;31mSignature:[0m
[0mpak[0m[0;34m.[0m[0mmove_rows[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdf_from[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdf_to[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmask[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmsg[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmsgcol[0m[0;34m=[0m[0;34m'msg'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mverbose[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mzähler[0m[0;34m=[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Moves rows identified by a binary mask from one dataframe to another (e.g. into a trash).
Returns two DataFrames.
The target dataframe gets an additional message column by standard (to identify why the rows were moved).
If you don't give a message, move_rows will generate one: just the coun

In [13]:
# Fresh test data: generate 1000 people and drop the columns not needed here
df = pak.drop_cols(
    pak.people(1000),
    ['age_class', 'postal_code', 'birthplace', 'features', 'history'],
)

# Set explicit dtypes for the two columns (object / int64), without log output
df['first_name'] = pak.change_datatype(df['first_name'], 'object', verbose=False)
df['age'] = pak.change_datatype(df['age'], 'int64', verbose=False)

# Keep a backup of this state for the comparison further down
df_orig = df.copy()
pak.sample(df)

Unnamed: 0,first_name,age,secret
0,Tom,32,AÄQmFJe
18,Erika,24,üÄOWü
23,Karl,42,nTrZo
26,Melanie,34,pjFoüövmHx
105,Jessica,20,ENQP0b
237,Ingrid,40,GL2obu3
957,Andreas,26,0Goq5anne
999,Malte,27,6Äap8Ku


In [14]:
# create mask: most frequent first_name
mask = df.first_name == pak.most_freq_elt(df.first_name)
df[mask]

Unnamed: 0,first_name,age,secret
237,Ingrid,40,GL2obu3
620,Ingrid,41,W42ZuhavÄ
675,Ingrid,24,BZÜGxvLmX
708,Ingrid,38,9Ow6mw4iu
720,Ingrid,35,oCNdNKOaP
773,Ingrid,25,vaZPXÜGh


In [15]:
# move them away (and create a new trash)
df, df_trash = pak.move_rows( df, mask)

Move 6 rows from 1000


In [16]:
print('count rows:', df.shape[0], ' + ', df_trash.shape[0])

count rows: 994  +  6


In [17]:
#df
df_trash

Unnamed: 0,first_name,age,secret
237,Ingrid,40,GL2obu3
620,Ingrid,41,W42ZuhavÄ
675,Ingrid,24,BZÜGxvLmX
708,Ingrid,38,9Ow6mw4iu
720,Ingrid,35,oCNdNKOaP
773,Ingrid,25,vaZPXÜGh


### add_rows()

In [18]:
?pak.add_rows

[0;31mSignature:[0m
[0mpak[0m[0;34m.[0m[0madd_rows[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdf_main[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdf_add[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0monly_new[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mreindex[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0massert_subset[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mverbose[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Like concat, with additional features only_new and verbose.
* df_main:       The new rows are added to the end of this dataframe.
* df_add:        Rows to add. The dtypes are adapted to those of df_main.
                 Series or list are also accepted for appending.
* only_new:      Avoid duplicates by setting this to a list of column names.
                 This combination must contain new content

In [19]:
# Goal: concatenate df and df_trash again and compare the result with df_orig,
# the backup of df taken before move_rows.

# Remove the message column (named 'msg') from the trash
df_trash     = pak.drop_cols(df_trash,'msg')       

# Optimize the dtypes of df only, not those of df_trash,
# so the two dataframes have different dtypes now.
df = pak.change_datatype(df)

change_datatype 
first_name           --> pd.string 
age                  --> pd.Int8   
change_datatype before: 147.8 KB after: 141.9 KB



In [20]:
# recover to the state before move_rows
df_recovered = pak.add_rows(df, df_trash)

6 rows added, now a total of 1000


In [21]:
#df_recovered

In [22]:
# check the datatypes 
#pak.analyse_cols(df_recovered)

In [23]:
# Compare the backup with the recovered dataframe.
# The content is equal, but the rows are sorted differently, so the
# dataframes as a whole are not equal
# (see row: Total, cols: content, sort and eq).
# The dtype column is also False for first_name and age; presumably because
# df was optimized with change_datatype above -- TODO confirm.
pak.compare_dataframes(df_orig, df_recovered)

Unnamed: 0,name,dtype,nnan,content,sort,eq
first_name,True,False,True,True,False,False
age,True,False,True,True,False,False
secret,True,True,True,True,False,False
(Total),True,False,True,True,False,False


In [24]:
# With reindex=True the content stays equal,
# but the dataframes are not equal
# (see row: Total, cols: content and eq).
# NOTE(review): the add_rows signature printed by ?pak.add_rows lists
# reindex=True as the default, so this call repeats the earlier
# pak.add_rows(df, df_trash) and the comparison result is identical.
# reindex=False may have been intended here -- TODO confirm.
df_recovered = pak.add_rows(df, df_trash, reindex=True)
pak.compare_dataframes(df_orig, df_recovered)

6 rows added, now a total of 1000


Unnamed: 0,name,dtype,nnan,content,sort,eq
first_name,True,False,True,True,False,False
age,True,False,True,True,False,False
secret,True,True,True,True,False,False
(Total),True,False,True,True,False,False


#### add_rows() with only_new=[col1, col2]

In [25]:
# we go back to df and df_trash
print('count rows:', df.shape[0], ' + ', df_trash.shape[0])

count rows: 994  +  6


In [26]:
# recover to the state before move_rows
df_recovered = pak.add_rows(df, df_trash)

6 rows added, now a total of 1000


In [27]:
# add the same rows again >> now we have some duplicates
df_with_dups = pak.add_rows(df_recovered, df_trash)

6 rows added, now a total of 1006


In [28]:
# You can avoid duplicates by setting only_new to a combination of column names
# whose values must be new: rows of df_trash whose (secret, age) combination
# already exists in df_recovered are not attached.
# Run this cell multiple times; it should not add any rows.
df_recovered = pak.add_rows(df_recovered, df_trash, only_new = ['secret','age'])

6 rows not attached
0 rows added, now a total of 1000


#### add_rows() with only_new=True

In [29]:
# we go back to df and df_trash
# but we give them a random index
df.index       = pak.random_series(df.shape[0],       'string', len_min=3, len_max=3, p_dup=0)
df_trash.index = pak.random_series(df_trash.shape[0], 'string', len_min=3, len_max=3, p_dup=0)
print('count rows:', df.shape[0], ' + ', df_trash.shape[0])

count rows: 994  +  6


In [30]:
# recover with only_new=True to avoid duplicate row indexes
df_recovered = pak.add_rows(df, df_trash, only_new=True)
df_recovered

0 rows not attached
6 rows added, now a total of 1000


Unnamed: 0,first_name,age,secret
0,Tom,32,AÄQmFJe
1,Kim,32,üurfMäMöD
2,Josephine,21,eFihw8d
3,Caroline,37,CpmUzpd
4,Inge,26,zOhCV3ä
...,...,...,...
995,Ingrid,41,W42ZuhavÄ
996,Ingrid,24,BZÜGxvLmX
997,Ingrid,38,9Ow6mw4iu
998,Ingrid,35,oCNdNKOaP


In [31]:
# Add the same rows again.
# NOTE(review): the recorded output reports "6 rows added, now a total of 1006",
# so the duplicates are NOT rejected here. df_recovered is displayed with a
# fresh 0..n index after the previous add_rows call, so the random index labels
# of df_trash presumably no longer collide with it. Building df_recovered with
# reindex=False may be needed for only_new=True to detect them -- TODO confirm.
df_recovered = pak.add_rows(df_recovered, df_trash, only_new=True)
#df_recovered

0 rows not attached
6 rows added, now a total of 1006


In [32]:
#search_notebooks('add_rows', radius=3, exclude='nb_out', suffix='py')