__[Open and try this file online (Colab)](https://colab.research.google.com/github/djekra/pandasklar/blob/master/jupyter/22_Manage_Rows.ipynb)__

# Manage rows
* `drop_multiindex`: Converts any MultiIndex to normal columns and resets the index. Works with MultiIndex in Series or DataFrames, in rows and in columns.
* `reset_index`: Creates a new, unnamed index. If keep_as is given, the old index is preserved as a column with this name. Otherwise the old index is dropped.
* `rename_index`: Renames the index.
* `drop_rows`: Drops rows identified by a binary mask, verbose if wanted.
* `move_rows`: Moves rows identified by a binary mask from one dataframe to another (e.g. into a trash).<br>
   The target dataframe gets an additional message column by default (to identify why the rows were moved). Verbose if wanted.
* `add_rows`: Like concat, with additional features only_new and verbose.

In [1]:
# blab init
try:
    import blab
except ImportError as e:
    !pip install blab
    import blab    
startup_notebook = blab.blab_startup()
%run $startup_notebook 

blab init
environment['in_colab']     = False
environment['dropbox_path'] = /home/me/Data_Linux/Dropbox
environment['lib_path']     = /home/me/Data_Linux/Dropbox/31_Projekte/01_Python/libs
Start Time: 09:40:27


In [2]:
import pandas     as pd 
import bpyth      as bpy

# pandasklar
try:
    import pandasklar as pak 
except ImportError as e:
    !pip install pandasklar
    import pandasklar as pak   
    
# verbose
pak.Config.set('VERBOSE', True)

VERBOSE = True
--> setting verbose=True as default for all pandasklar functions



In [3]:
# Test data
df = pak.people(1000)
df

Unnamed: 0,first_name,age,age_class,postal_code,birthplace,secret,features,history
0,Hendrik,39,30,80728,,fBu95oSX,"{u, Q, z}","[A, B, C]"
1,Rudolph,28,20,33277,Bremen,HQfRäE,"{0, 2, R, g}","[b, b, a, b]"
2,Christa,36,30,45479,Berlin,wp5FTgpöij,"{T, N, 0}","[c, b, a]"
3,Martin,39,30,36226,Berlin,kVrqjE,"{T, r, W, z, K}","[A, x]"
4,Christoph,25,20,75762,Bremen,KmiÄiPLeG,"{4, q, f, B, 1}","[A, A, A]"
...,...,...,...,...,...,...,...,...
995,Kurt,31,30,48548,Bremen,9äöXFUaZ4b,"{a, 4, o, 5, V}","[A, C, C, B]"
996,Steffen,22,20,37342,Berlin,6s3taajN,"{N, H, 1, X}","[a, b, c]"
997,Marius,27,20,38585,Berlin,eÖOqd,{f},"[a, b, c]"
998,David,22,20,63612,,znqir,"{I, b, p}","[A, C, C, B]"


## === Row Indexes ===

### rename_index()

In [4]:
# set a trashy index: one random string of exactly 3 characters per row of df
# (p_dup=0 presumably requests values without duplicates — TODO confirm)
df.index = pak.random_series(df.shape[0], 'string', len_min=3, len_max=3, p_dup=0)
df

Unnamed: 0_level_0,first_name,age,age_class,postal_code,birthplace,secret,features,history
rnd_string,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3M4,Hendrik,39,30,80728,,fBu95oSX,"{u, Q, z}","[A, B, C]"
PÖy,Rudolph,28,20,33277,Bremen,HQfRäE,"{0, 2, R, g}","[b, b, a, b]"
0iW,Christa,36,30,45479,Berlin,wp5FTgpöij,"{T, N, 0}","[c, b, a]"
e4a,Martin,39,30,36226,Berlin,kVrqjE,"{T, r, W, z, K}","[A, x]"
5k8,Christoph,25,20,75762,Bremen,KmiÄiPLeG,"{4, q, f, B, 1}","[A, A, A]"
...,...,...,...,...,...,...,...,...
U0I,Kurt,31,30,48548,Bremen,9äöXFUaZ4b,"{a, 4, o, 5, V}","[A, C, C, B]"
AK7,Steffen,22,20,37342,Berlin,6s3taajN,"{N, H, 1, X}","[a, b, c]"
MDy,Marius,27,20,38585,Berlin,eÖOqd,{f},"[a, b, c]"
7Ik,David,22,20,63612,,znqir,"{I, b, p}","[A, C, C, B]"


In [5]:
df = pak.rename_index(df,'index_new')
df

Unnamed: 0_level_0,first_name,age,age_class,postal_code,birthplace,secret,features,history
index_new,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3M4,Hendrik,39,30,80728,,fBu95oSX,"{u, Q, z}","[A, B, C]"
PÖy,Rudolph,28,20,33277,Bremen,HQfRäE,"{0, 2, R, g}","[b, b, a, b]"
0iW,Christa,36,30,45479,Berlin,wp5FTgpöij,"{T, N, 0}","[c, b, a]"
e4a,Martin,39,30,36226,Berlin,kVrqjE,"{T, r, W, z, K}","[A, x]"
5k8,Christoph,25,20,75762,Bremen,KmiÄiPLeG,"{4, q, f, B, 1}","[A, A, A]"
...,...,...,...,...,...,...,...,...
U0I,Kurt,31,30,48548,Bremen,9äöXFUaZ4b,"{a, 4, o, 5, V}","[A, C, C, B]"
AK7,Steffen,22,20,37342,Berlin,6s3taajN,"{N, H, 1, X}","[a, b, c]"
MDy,Marius,27,20,38585,Berlin,eÖOqd,{f},"[a, b, c]"
7Ik,David,22,20,63612,,znqir,"{I, b, p}","[A, C, C, B]"


### reset_index()

In [6]:
?pak.reset_index

[0;31mSignature:[0m [0mpak[0m[0;34m.[0m[0mreset_index[0m[0;34m([0m[0mdf[0m[0;34m,[0m [0mkeep_as[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Creates a new, unnamed index.
* keep_as: If keep_as is given, the old index is preserved as a row with this name.
Otherwise the old index is dropped.
[0;31mFile:[0m      ~/Data_Linux/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/pandas.py
[0;31mType:[0m      function


In [7]:
df = pak.reset_index(df)
df

Unnamed: 0,first_name,age,age_class,postal_code,birthplace,secret,features,history
0,Hendrik,39,30,80728,,fBu95oSX,"{u, Q, z}","[A, B, C]"
1,Rudolph,28,20,33277,Bremen,HQfRäE,"{0, 2, R, g}","[b, b, a, b]"
2,Christa,36,30,45479,Berlin,wp5FTgpöij,"{T, N, 0}","[c, b, a]"
3,Martin,39,30,36226,Berlin,kVrqjE,"{T, r, W, z, K}","[A, x]"
4,Christoph,25,20,75762,Bremen,KmiÄiPLeG,"{4, q, f, B, 1}","[A, A, A]"
...,...,...,...,...,...,...,...,...
995,Kurt,31,30,48548,Bremen,9äöXFUaZ4b,"{a, 4, o, 5, V}","[A, C, C, B]"
996,Steffen,22,20,37342,Berlin,6s3taajN,"{N, H, 1, X}","[a, b, c]"
997,Marius,27,20,38585,Berlin,eÖOqd,{f},"[a, b, c]"
998,David,22,20,63612,,znqir,"{I, b, p}","[A, C, C, B]"


### drop_multiindex()

In [8]:
?pak.drop_multiindex

[0;31mSignature:[0m [0mpak[0m[0;34m.[0m[0mdrop_multiindex[0m[0;34m([0m[0mdf[0m[0;34m,[0m [0mverbose[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Converts any MultiIndex to normal columns and resets the index. 
Works with MultiIndex in Series or DataFrames, in rows and in columns.
[0;31mFile:[0m      ~/Data_Linux/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/pandas.py
[0;31mType:[0m      function


## === Rows ===

### drop_rows()

In [9]:
?pak.drop_rows

[0;31mSignature:[0m [0mpak[0m[0;34m.[0m[0mdrop_rows[0m[0;34m([0m[0mdf[0m[0;34m,[0m [0mmask[0m[0;34m,[0m [0mverbose[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Drops rows identified by a binary mask.
* verbose: True if you want to print how many rows are droped.
(If you want to delete the rows to a trash, use move_rows.)
[0;31mFile:[0m      ~/Data_Linux/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/pandas.py
[0;31mType:[0m      function


In [10]:
# create mask: most frequent first_name
mask = df.first_name == pak.most_freq_elt(df.first_name)
df[mask]

Unnamed: 0,first_name,age,age_class,postal_code,birthplace,secret,features,history
152,Carl,33,30,78354,Bremen,GWkä2iBQmt,"{A, M}","[A, C, C, B]"
435,Carl,35,30,78831,Bremen,wuDtGeoh,"{2, O}","[A, B, C]"
569,Carl,34,30,41873,,YgirT,"{u, U}","[a, b, c]"
637,Carl,29,20,61857,,J0Cfg,"{T, U, V}","[a, b, c]"
790,Carl,23,20,74935,Berlin,KMDAa0,"{5, X}","[A, C, C, B]"
866,Carl,26,20,70292,Bremen,BV32bvÜ,{U},"[A, A, A]"


In [11]:
# just delete them
df = pak.drop_rows(df,mask)

Delete 6 rows from 1000


### move_rows()

In [12]:
?pak.move_rows

[0;31mSignature:[0m
[0mpak[0m[0;34m.[0m[0mmove_rows[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdf_from[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdf_to[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmask[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmsg[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmsgcol[0m[0;34m=[0m[0;34m'msg'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mverbose[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mzähler[0m[0;34m=[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Moves rows identified by a binary mask from one dataframe to another (e.g. into a trash).
Returns two DataFrames.
The target dataframe gets an additional message column by standard (to identify why the rows were moved).
If you don't give a message, move_rows will generate one: just the coun

In [37]:
# New test data
df = pak.people(1000)
# drop the columns that are not needed for the following examples
df = pak.drop_cols(df,['age_class','postal_code','birthplace','features','history'])
# give first_name and age plain dtypes (object / int64)
df['first_name'] = pak.change_datatype(df.first_name,'object', verbose=False)
df['age']        = pak.change_datatype(df.age,       'int64',  verbose=False)
# keep a backup of df; it is compared with the recovered dataframe further down
df_orig = df.copy()
pak.sample(df)

Unnamed: 0,first_name,age,secret
0,Hildegard,34,hUEuhGOäv
3,Frida,29,üsqH1lpe
5,Pauline,30,4Fdop8T
27,Anton,20,q2UoW
228,Mika,31,07MWlByMVE
413,Marcel,27,I3DxDlyÄm
465,Sophie,42,MRe6pCG
999,Markus,32,aVOmA8yJxT


In [38]:
# create mask: most frequent first_name
mask = df.first_name == pak.most_freq_elt(df.first_name)
df[mask]

Unnamed: 0,first_name,age,secret
413,Marcel,27,I3DxDlyÄm
446,Marcel,23,liIAcNANuÄ
486,Marcel,36,MBeJSBYjx
504,Marcel,20,iriua
541,Marcel,29,vmneNRüF
735,Marcel,35,kNxjEwRQ
911,Marcel,37,16r3PÜw


In [39]:
# move them away (and create a new trash)
# move_rows returns two dataframes: the remaining rows and the trash.
# NOTE(review): mask is passed positionally here, i.e. in the slot that the signature
# printed by ?pak.move_rows names df_to. The recorded output shows the call works,
# so move_rows presumably accepts this form — TODO confirm; mask=mask would be more explicit.
df, df_trash = pak.move_rows( df, mask)

Move 7 rows from 1000


In [40]:
print('count rows:', df.shape[0], ' + ', df_trash.shape[0])

count rows: 993  +  7


In [41]:
#df
df_trash

Unnamed: 0,first_name,age,secret
413,Marcel,27,I3DxDlyÄm
446,Marcel,23,liIAcNANuÄ
486,Marcel,36,MBeJSBYjx
504,Marcel,20,iriua
541,Marcel,29,vmneNRüF
735,Marcel,35,kNxjEwRQ
911,Marcel,37,16r3PÜw


### add_rows()

In [22]:
?pak.add_rows

[0;31mSignature:[0m
[0mpak[0m[0;34m.[0m[0madd_rows[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdf_main[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdf_add[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0monly_new[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mreindex[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0massert_subset[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mverbose[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Like concat, with additional features only_new and verbose.
* df_main:       The new rows are added to the end of this dataframe.
* df_add:        Rows to add. The dtypes are adapted to those of df_main.
                 Series or list are also accepted for appending.
* only_new:      Avoid duplicates by setting this to a list of column names.
                 This combination must contain new content

In [23]:
# We will concat df and df_trash again and compare it with df_orig, the backup of df

# get rid of the msg column
df_trash     = pak.drop_cols(df_trash,'msg')       

# optimize df, but not df_trash. 
# So the dtypes are different now
df = pak.change_datatype(df)

change_datatype 
first_name           --> pd.string 
age                  --> pd.Int8   
change_datatype before: 146.3 KB after: 140.6 KB



In [24]:
# recover to the state before move_rows
df_recovered = pak.add_rows(df, df_trash)

13 rows added, now a total of 1000


In [25]:
#df_recovered

In [26]:
# check the datatypes 
#pak.analyse_cols(df_recovered)

In [27]:
# The dataframes have equal content, but are sorted differently
# (see row: Total, cols: sort and eq)
pak.compare_dataframes(df_orig, df_recovered)

Unnamed: 0,name,dtype,nnan,content,sort,eq
first_name,True,False,True,True,False,False
age,True,False,True,True,False,False
secret,True,True,True,True,False,False
(Total),True,False,True,True,False,False


In [28]:
# if we do this with reindex=True,
# the content stays equal,
# but the dataframes are not equal
# (see row: Total, cols: content and eq)
# NOTE(review): the signature printed by ?pak.add_rows lists reindex=True as the default,
# so this call presumably behaves like the add_rows call without reindex — TODO confirm
# whether reindex=False was meant here.
df_recovered = pak.add_rows(df, df_trash, reindex=True)
pak.compare_dataframes(df_orig, df_recovered)

13 rows added, now a total of 1000


Unnamed: 0,name,dtype,nnan,content,sort,eq
first_name,True,False,True,True,False,False
age,True,False,True,True,False,False
secret,True,True,True,True,False,False
(Total),True,False,True,True,False,False


#### add_rows() with only_new=[col1, col2]

In [29]:
# we go back to df and df_trash
print('count rows:', df.shape[0], ' + ', df_trash.shape[0])

count rows: 987  +  13


In [30]:
# recover to the state before move_rows
df_recovered = pak.add_rows(df, df_trash)

13 rows added, now a total of 1000


In [31]:
# add the same rows again >> now we have some duplicates
df_with_dups = pak.add_rows(df_recovered, df_trash)

13 rows added, now a total of 1013


In [32]:
# you can avoid that by setting only_new to a column name combination that must contain new content
# run this cell multiple times
df_recovered = pak.add_rows(df_recovered, df_trash, only_new = ['secret','age'])

13 rows not attached
0 rows added, now a total of 1000


#### add_rows() with only_new=True

In [33]:
# we go back to df and df_trash
# but we give them a random index
df.index       = pak.random_series(df.shape[0],       'string', len_min=3, len_max=3, p_dup=0)
df_trash.index = pak.random_series(df_trash.shape[0], 'string', len_min=3, len_max=3, p_dup=0)
print('count rows:', df.shape[0], ' + ', df_trash.shape[0])

count rows: 987  +  13


In [34]:
# recover with only_new=True to avoid duplicate row indexes
df_recovered = pak.add_rows(df, df_trash, only_new=True)
df_recovered

0 rows not attached
13 rows added, now a total of 1000


Unnamed: 0,first_name,age,secret
0,Anke,25,ü06Xu64x8
1,Brigitte,30,CV8JuCLmQ
2,Evelyn,29,rÜüzo
3,Yasmin,32,2SFaütKk
4,Stephanie,31,qZS7SkJlS
...,...,...,...
995,Emily,32,azÖUymf
996,Emily,35,mCLj900eh
997,Emily,32,DA9JkbWj2E
998,Emily,23,üFFIe5K


In [35]:
# add the same rows again >> intended: no rows added
# NOTE(review): the recorded output of this cell reports "13 rows added, now a total of 1013".
# Presumably the previous add_rows call replaced the index of df_recovered
# (reindex defaults to True in the signature printed by ?pak.add_rows), so the index
# values of df_trash no longer collide with it — TODO confirm.
df_recovered = pak.add_rows(df_recovered, df_trash, only_new=True)
#df_recovered

0 rows not attached
13 rows added, now a total of 1013


In [36]:
#search_notebooks('add_rows', radius=3, exclude='nb_out', suffix='py')