__[Open and try this file online (Colab)](https://colab.research.google.com/github/djekra/pandasklar/blob/master/jupyter/22_Manage_Rows.ipynb)__

# Manage rows
* `drop_multiindex`: Converts any MultiIndex to normal columns and resets the index. Works with MultiIndex in Series or DataFrames, in rows and in columns.
* `reset_index`: Creates a new, unnamed index. If keep_as is given, the old index is preserved as a row with this name. Otherwise the old index is dropped.
* `rename_index`: Renames the index.
* `drop_rows`: Drops rows identified by a binary mask, verbose if wanted.
* `move_rows`: Moves rows identified by a binary mask from one dataframe to another (e.g. into a trash).<br>
   The target dataframe gets an additional message column by standard (to identify why the rows were moved). Verbose if wanted. 
* `add_rows`: Like concat, with additional features only_new and verbose.

In [1]:
# blab init
try:
    import blab
except ImportError as e:
    !pip install blab
    import blab    
startup_notebook = blab.blab_startup()
%run $startup_notebook 

blab init
environment['dropbox_path'] = /home/me/Data_Linux/Dropbox
environment['lib_path']     = /home/me/Data_Linux/Dropbox/31_Projekte/01_Python/libs
Start Time: 21:40:20


time: 393 ms


In [2]:
import pandas     as pd 
import bpyth      as bpy

# pandasklar
try:
    import pandasklar as pak 
except ImportError as e:
    !pip install pandasklar
    import pandasklar as pak   
    
# verbose
pak.Config.set('VERBOSE', True)

VERBOSE = True
--> setting verbose=True as default for all pandasklar functions

time: 263 ms


In [3]:
# Test data
df = pak.people(1000)
df

Unnamed: 0,first_name,age,age_class,postal_code,birthplace,secret,features,history
0,Tina,39,30,11064,,sÜdoA,"{p, u, V}","[A, x]"
1,Ralf,42,40,57102,Berlin,7g0TiBoPzG,"{Z, r, V, R}","[c, b, a]"
2,Carla,30,30,28521,,e3ScsL7wX,"{N, n, F, r, K}","[A, A, A]"
3,Max,26,20,22004,Berlin,meQh2pV47,"{l, M, b, Y}","[a, b, c]"
4,Manuel,36,30,64309,Berlin,KEufM,"{a, C, U}","[A, C, C, B]"
...,...,...,...,...,...,...,...,...
995,Zoe,31,30,32490,Berlin,fLEwuT,"{E, L, Q}","[A, B, C]"
996,Helmut,30,30,78962,Bremen,M4qPB89,"{t, Z, K}","[a, b, c]"
997,Leo,29,20,31000,,8L0uEM1M,"{r, 4}","[b, b, a, b]"
998,Pauline,20,20,51043,Bremen,vkPineÜÜ,"{N, v, V, d, s}","[A, A, A]"


time: 167 ms


## === Row Indexes ===

### rename_index()

In [4]:
# set a trashy index
df.index = pak.random_series(df.shape[0], 'string', len_min=3, len_max=3, p_dup=0)
df

Unnamed: 0_level_0,first_name,age,age_class,postal_code,birthplace,secret,features,history
rnd_string,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Y4B,Tina,39,30,11064,,sÜdoA,"{p, u, V}","[A, x]"
7fo,Ralf,42,40,57102,Berlin,7g0TiBoPzG,"{Z, r, V, R}","[c, b, a]"
mK2,Carla,30,30,28521,,e3ScsL7wX,"{N, n, F, r, K}","[A, A, A]"
4Ul,Max,26,20,22004,Berlin,meQh2pV47,"{l, M, b, Y}","[a, b, c]"
koÖ,Manuel,36,30,64309,Berlin,KEufM,"{a, C, U}","[A, C, C, B]"
...,...,...,...,...,...,...,...,...
BDx,Zoe,31,30,32490,Berlin,fLEwuT,"{E, L, Q}","[A, B, C]"
muf,Helmut,30,30,78962,Bremen,M4qPB89,"{t, Z, K}","[a, b, c]"
ZTS,Leo,29,20,31000,,8L0uEM1M,"{r, 4}","[b, b, a, b]"
Öu9,Pauline,20,20,51043,Bremen,vkPineÜÜ,"{N, v, V, d, s}","[A, A, A]"


time: 66 ms


In [5]:
df = pak.rename_index(df,'index_new')
df

Unnamed: 0_level_0,first_name,age,age_class,postal_code,birthplace,secret,features,history
index_new,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Y4B,Tina,39,30,11064,,sÜdoA,"{p, u, V}","[A, x]"
7fo,Ralf,42,40,57102,Berlin,7g0TiBoPzG,"{Z, r, V, R}","[c, b, a]"
mK2,Carla,30,30,28521,,e3ScsL7wX,"{N, n, F, r, K}","[A, A, A]"
4Ul,Max,26,20,22004,Berlin,meQh2pV47,"{l, M, b, Y}","[a, b, c]"
koÖ,Manuel,36,30,64309,Berlin,KEufM,"{a, C, U}","[A, C, C, B]"
...,...,...,...,...,...,...,...,...
BDx,Zoe,31,30,32490,Berlin,fLEwuT,"{E, L, Q}","[A, B, C]"
muf,Helmut,30,30,78962,Bremen,M4qPB89,"{t, Z, K}","[a, b, c]"
ZTS,Leo,29,20,31000,,8L0uEM1M,"{r, 4}","[b, b, a, b]"
Öu9,Pauline,20,20,51043,Bremen,vkPineÜÜ,"{N, v, V, d, s}","[A, A, A]"


time: 42.1 ms


### reset_index()

In [6]:
?pak.reset_index

time: 73.1 ms


[0;31mSignature:[0m [0mpak[0m[0;34m.[0m[0mreset_index[0m[0;34m([0m[0mdf[0m[0;34m,[0m [0mkeep_as[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Creates a new, unnamed index.
* keep_as: If keep_as is given, the old index is preserved as a row with this name.
Otherwise the old index is dropped.
[0;31mFile:[0m      ~/Data_Linux/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/pandas.py
[0;31mType:[0m      function


In [7]:
df = pak.reset_index(df)
df

Unnamed: 0,first_name,age,age_class,postal_code,birthplace,secret,features,history
0,Tina,39,30,11064,,sÜdoA,"{p, u, V}","[A, x]"
1,Ralf,42,40,57102,Berlin,7g0TiBoPzG,"{Z, r, V, R}","[c, b, a]"
2,Carla,30,30,28521,,e3ScsL7wX,"{N, n, F, r, K}","[A, A, A]"
3,Max,26,20,22004,Berlin,meQh2pV47,"{l, M, b, Y}","[a, b, c]"
4,Manuel,36,30,64309,Berlin,KEufM,"{a, C, U}","[A, C, C, B]"
...,...,...,...,...,...,...,...,...
995,Zoe,31,30,32490,Berlin,fLEwuT,"{E, L, Q}","[A, B, C]"
996,Helmut,30,30,78962,Bremen,M4qPB89,"{t, Z, K}","[a, b, c]"
997,Leo,29,20,31000,,8L0uEM1M,"{r, 4}","[b, b, a, b]"
998,Pauline,20,20,51043,Bremen,vkPineÜÜ,"{N, v, V, d, s}","[A, A, A]"


time: 51.6 ms


### drop_multiindex()

In [8]:
?pak.drop_multiindex

time: 13.9 ms


[0;31mSignature:[0m [0mpak[0m[0;34m.[0m[0mdrop_multiindex[0m[0;34m([0m[0mdf[0m[0;34m,[0m [0mverbose[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Converts any MultiIndex to normal columns and resets the index. 
Works with MultiIndex in Series or DataFrames, in rows and in columns.
[0;31mFile:[0m      ~/Data_Linux/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/pandas.py
[0;31mType:[0m      function


## === Rows ===

### drop_rows()

In [9]:
?pak.drop_rows

time: 12 ms


[0;31mSignature:[0m [0mpak[0m[0;34m.[0m[0mdrop_rows[0m[0;34m([0m[0mdf[0m[0;34m,[0m [0mmask[0m[0;34m,[0m [0mverbose[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Drops rows identified by a binary mask.
* verbose: True if you want to print how many rows are droped.
(If you want to delete the rows to a trash, use move_rows.)
[0;31mFile:[0m      ~/Data_Linux/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/pandas.py
[0;31mType:[0m      function


In [10]:
# create mask: most frequent first_name
mask = df.first_name == pak.most_freq_elt(df.first_name)
df[mask]

Unnamed: 0,first_name,age,age_class,postal_code,birthplace,secret,features,history
268,Eva,29,20,43052,Berlin,FüHaFCxoä,"{l, D, S, 1}","[A, A, A]"
469,Eva,25,20,67271,,eeAsaL,"{a, Z, S, B, Q}","[A, B, C]"
520,Eva,29,20,52664,Bremen,THPIZ,"{A, p, G}","[A, C, C, B]"
562,Eva,38,30,39068,,eYxpea,"{i, T, O}","[A, B, C]"
570,Eva,33,30,55530,Bremen,R4xanoPHD,"{P, A, m}","[A, A, A]"
902,Eva,30,30,70148,,m3jY3NhHaa,"{s, M, T}","[A, B, C, C]"


time: 29.7 ms


In [11]:
# just delete them
df = pak.drop_rows(df,mask)

Delete 6 rows from 1000
time: 18.9 ms


### move_rows()

In [12]:
?pak.move_rows

time: 24.6 ms


[0;31mSignature:[0m
[0mpak[0m[0;34m.[0m[0mmove_rows[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdf_from[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdf_to[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmask[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmsg[0m[0;34m=[0m[0;34m''[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmsgcol[0m[0;34m=[0m[0;34m'msg'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mverbose[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mzähler[0m[0;34m=[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Moves rows identified by a binary mask from one dataframe to another (e.g. into a trash).
Returns two DataFrames.
The target dataframe gets an additional message column by standard (to identify why the rows were moved).
If you don't give a message, move_rows will generate one: just the count 

In [13]:
# New test data
df = pak.people(1000)
df = pak.drop_cols(df,['age_class','postal_code','birthplace','features','history'])
df['first_name'] = pak.change_datatype(df.first_name,'object', verbose=False)
df['age']        = pak.change_datatype(df.age,       'int64',  verbose=False)
df_orig = df.copy()
df.head(3)

Unnamed: 0,first_name,age,secret
0,Bianca,39,YefLü
1,Kurt,29,eCAUeaQaJz
2,Rainer,23,eBXKZMHy


time: 189 ms


In [14]:
# create mask: most frequent first_name
mask = df.first_name == pak.most_freq_elt(df.first_name)
df[mask]

Unnamed: 0,first_name,age,secret
195,Annica,37,Ä2öS1eho
545,Annica,30,ÜUTS1y2öX
684,Annica,31,BÖoÖrh
690,Annica,34,9FLiUT64OV
722,Annica,21,F1O0öAÖa
762,Annica,36,zwY8Jf


time: 24.2 ms


In [15]:
# move them away (and create a new trash)
df, df_trash = pak.move_rows( df, mask)

Move 6 rows from 1000
time: 24.6 ms


In [16]:
print('count rows:', df.shape[0], ' + ', df_trash.shape[0])

count rows: 994  +  6
time: 16 ms


In [17]:
#df
df_trash

Unnamed: 0,first_name,age,secret,msg
195,Annica,37,Ä2öS1eho,0
545,Annica,30,ÜUTS1y2öX,0
684,Annica,31,BÖoÖrh,0
690,Annica,34,9FLiUT64OV,0
722,Annica,21,F1O0öAÖa,0
762,Annica,36,zwY8Jf,0


time: 22.4 ms


In [18]:
# create mask: (second) most frequent first_name
mask = df.first_name == pak.most_freq_elt(df.first_name)
df[mask]

Unnamed: 0,first_name,age,secret
139,Cornelia,29,äüFYuVa
494,Cornelia,30,JBO4K2XmC7
504,Cornelia,38,362Öve
853,Cornelia,39,9sÜwLow4
908,Cornelia,28,g6F2fä
971,Cornelia,28,ÜWdiRk


time: 39.6 ms


In [19]:
# move them away (into the existing trash)
df, df_trash = pak.move_rows( df, df_trash, mask)

Move 6 rows from 994
time: 34.8 ms


In [20]:
print('count rows:', df.shape[0], ' + ', df_trash.shape[0])

count rows: 988  +  12
time: 21 ms


In [21]:
df_trash

Unnamed: 0,first_name,age,secret,msg
195,Annica,37,Ä2öS1eho,0
545,Annica,30,ÜUTS1y2öX,0
684,Annica,31,BÖoÖrh,0
690,Annica,34,9FLiUT64OV,0
722,Annica,21,F1O0öAÖa,0
762,Annica,36,zwY8Jf,0
139,Cornelia,29,äüFYuVa,1
494,Cornelia,30,JBO4K2XmC7,1
504,Cornelia,38,362Öve,1
853,Cornelia,39,9sÜwLow4,1


time: 25.7 ms


### add_rows()

In [22]:
?pak.add_rows

time: 24.2 ms


[0;31mSignature:[0m
[0mpak[0m[0;34m.[0m[0madd_rows[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdf_main[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdf_add[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0monly_new[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mreindex[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0massert_subset[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mverbose[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Like concat, with additional features only_new and verbose.
* df_main:       The new rows are added to the end of this dataframe.
* df_add:        Rows to add. The dtypes are adapted to those of df_main.
                 Series or list are also accepted for appending.
* only_new:      Avoid duplicates by setting this to a list of column names.
                 This combination must contain new content

In [23]:
# We will concat df and df_trash again and compare it with df_orig, the backup of df

# get rid of the msg column
df_trash     = pak.drop_cols(df_trash,'msg')       

# optimize df, but not df_trash. 
# So the dtypes are different now
df = pak.change_datatype(df)

change_datatype 
first_name           --> pd.string 
age                  --> pd.Int8   
change_datatype before: 146.3 KB after: 140.5 KB

time: 25.9 ms


In [24]:
# recover to the state before move_rows
df_recovered = pak.add_rows(df, df_trash)

12 rows added, now a total of 1000
time: 24.1 ms


In [25]:
#df_recovered

time: 17.2 ms


In [26]:
# check the datatypes 
#pak.analyse_cols(df_recovered)

time: 20.8 ms


In [27]:
# The dataframes are equal, but sorted different
# (see row: Total, cols: sort and eq)
pak.compare_dataframes(df_orig, df_recovered)

Unnamed: 0,name,dtype,nnan,content,sort,eq
first_name,True,False,True,True,False,False
age,True,False,True,True,False,False
secret,True,True,True,True,False,False
(Total),True,False,True,True,False,False


time: 55.1 ms


In [28]:
# if we do this with reindex=True, 
# the content keeps equal, 
# but the dataframes are not equal
# (see row: Total, cols: content and eq)
df_recovered = pak.add_rows(df, df_trash, reindex=True)
pak.compare_dataframes(df_orig, df_recovered)

12 rows added, now a total of 1000


Unnamed: 0,name,dtype,nnan,content,sort,eq
first_name,True,False,True,True,False,False
age,True,False,True,True,False,False
secret,True,True,True,True,False,False
(Total),True,False,True,True,False,False


time: 39.5 ms


#### add_rows() with only_new=[col1, col2]

In [29]:
# we go back to df and df_trash
print('count rows:', df.shape[0], ' + ', df_trash.shape[0])

count rows: 988  +  12
time: 19 ms


In [30]:
# recover to the state before move_rows
df_recovered = pak.add_rows(df, df_trash)

12 rows added, now a total of 1000
time: 23.4 ms


In [31]:
# add the same rows again >> now we have some duplicates
df_with_dups = pak.add_rows(df_recovered, df_trash)

12 rows added, now a total of 1012
time: 25.1 ms


In [32]:
# you can avoid that by setting only_new to a column name combination that must contain new content
# run this cell multible times
df_recovered = pak.add_rows(df_recovered, df_trash, only_new = ['secret','age'])

12 rows not attached
0 rows added, now a total of 1000
time: 50.8 ms


#### add_rows() with only_new=True

In [33]:
# we go back to df and df_trash
# but we give them a random index
df.index       = pak.random_series(df.shape[0],       'string', len_min=3, len_max=3, p_dup=0)
df_trash.index = pak.random_series(df_trash.shape[0], 'string', len_min=3, len_max=3, p_dup=0)
print('count rows:', df.shape[0], ' + ', df_trash.shape[0])

count rows: 988  +  12
time: 46.8 ms


In [34]:
# recover with only_new=True to avoid duplicate row indexes
df_recovered = pak.add_rows(df, df_trash, only_new=True)
df_recovered

0 rows not attached
12 rows added, now a total of 1000


Unnamed: 0,first_name,age,secret
0,Bianca,39,YefLü
1,Kurt,29,eCAUeaQaJz
2,Rainer,23,eBXKZMHy
3,Ulrike,31,äeODÖzYeA7
4,Michaela,28,OytfzKW0yä
...,...,...,...
995,Cornelia,30,JBO4K2XmC7
996,Cornelia,38,362Öve
997,Cornelia,39,9sÜwLow4
998,Cornelia,28,g6F2fä


time: 27.9 ms


In [35]:
# add the same rows again >> no rows added
df_recovered = pak.add_rows(df_recovered, df_trash, only_new=True)
#df_recovered

0 rows not attached
12 rows added, now a total of 1012
time: 15.9 ms


In [36]:
#search_notebooks('add_rows', radius=3, exclude='nb_out', suffix='py')

time: 21.5 ms
