__[Open and try this file online (Colab)](https://colab.research.google.com/github/djekra/pandasklar/blob/master/jupyter/13_Analyse_Datatypes.ipynb)__

# Analyse Datatypes
* `analyse_datatypes`: Returns info about the datatypes and the memory usage (`mem_usage`) of the columns of a DataFrame.
* `analyse_values`: Returns statistical data for a DataFrame, a Series or an Index.
* `analyse_cols`: Describes the datatypes and the content of a DataFrame. Merges the info from `analyse_datatypes` and `analyse_values`.
* `change_datatype`: Converts the datatypes of a DataFrame or a Series. The target datatype is chosen automatically if none is specified.
* `copy_datatype`: Copies the dtypes from one DataFrame to another, matching the column names.
* `type_info`: Provides information about pandas types and standardises them.

In [1]:
# blab init
# Import `blab`; if it is not installed yet, install it with pip and import again.
try:
    import blab
except ImportError as e:
    !pip install blab
    import blab    
# The value returned by blab_startup() is handed to %run on the next line
# (presumably the path of a startup notebook/script -- confirm in the blab docs).
startup_notebook = blab.blab_startup()
%run $startup_notebook 

blab init
environment['in_colab']     = False
environment['dropbox_path'] = D:\Dropbox
environment['lib_path']     = D:\Dropbox\31_Projekte\01_Python\libs
Start Time: 22:46:32


In [2]:
import numpy      as np
import pandas     as pd 
import bpyth      as bpy

# pandasklar
# Import pandasklar as `pak`; if it is not installed yet, install it with pip and import again.
try:
    import pandasklar as pak 
except ImportError as e:
    !pip install pandasklar
    import pandasklar as pak   
    
# verbose
# Sets verbose=True as the default for all pandasklar functions.
pak.Config.set('VERBOSE', True)

# copy_on_write
# Switch on the pandas option "mode.copy_on_write" for this session.
pd.set_option("mode.copy_on_write", True)

VERBOSE = True
--> setting verbose=True as default for all pandasklar functions



In [3]:
# Generate random data
# `anz` (German "Anzahl" = count) is the size passed to every random_series call;
# the resulting DataFrame has that many rows.
# NOTE(review): p_nan / p_dup look like the requested share of NaN values /
# duplicates per column -- confirm against the pandasklar documentation.
anz = 10000
a = pak.random_series( anz, 'int',   min=-500, max=100 )
b = pak.random_series( anz, 'int',   min=-127, max=127, p_dup=0 ) # no duplicates allowed
# c must be computed before `a` is coarsened on the next line: it uses the original a.
c = a + b + 0.0001
# Reduce a to the multiples of ten 0, 10, ..., 90.
a = a % 10 * 10
v = pak.random_series( anz, 'name',                  p_nan=0)
# First character of every name in v.
w = v.str[:1]
s = pak.random_series( anz, 'string',                p_nan=0)
t = pak.random_series( anz, 'string',                p_nan=0.1)
m = pak.random_series( anz, 'int',   min=0, max=127, p_nan=0.1 )
n = pak.random_series( anz, 'float', decimals=4,     p_nan=0.2 ) #* 70000
o = pak.random_series( anz, 'choice', choice=['Bremen','Bremerhaven'], p_nan=0.3,   p_dup=0     )
p = pak.random_series( anz, 'list',                                    p_nan=0.1,   p_dup=0.5   )
q = pak.random_series( anz, 'time',                                    p_nan=0.1,   p_dup=0.5   )
z = pak.random_series( anz, 'mix',                                     p_nan=0.01,  p_dup=0     )

# Combine the series into one DataFrame, give the columns descriptive names
# (same order as the list above) and cast float_summe to float.
df = pak.dataframe( [a, b, c, v, w, s, t, m, n, o, p, q, z], verbose=False)
df.columns = ['int_grob','int_fein','float_summe','first_name','Letter1','string_nonan','string_nan','int_nan','float_nan','City','List','time','Mix']
df.float_summe = df.float_summe.astype('float')
# Display a sample of the rows as the cell output.
pak.sample(df,10)

Unnamed: 0,int_grob,int_fein,float_summe,first_name,Letter1,string_nonan,string_nan,int_nan,float_nan,City,List,time,Mix
0,50,109,114.0001,Friedrich,F,D2Wx67,IPöÖyu,117.0,,,"[Anette, Anja, Peter, Arthur, Hanna, Anna, Sas...",NaT,{0}
340,50,55,-39.9999,Charlotte,C,BueI,kxb51,42.0,0.4566,Bremerhaven,"[Anja, Tom, Meik, Bernd, Anja, Anja, Anna, Jen...",1951-02-24 16:33:25.064580096,Bremerhaven
378,50,-103,-107.9999,Adolf,A,caTTo,Tqixb,70.0,0.7582,,"[Marina, Lukas, Christopher, Kathrin, Tanja]",1906-04-17 22:30:47.948887040,pCpÜnÜHqxS
928,70,117,-45.9999,Maja,M,005T7I,,13.0,0.2789,Bremen,"[Ulrike, Stella, Meik, Ulrich, Sascha, Sina, A...",1946-12-25 16:02:52.360026624,XsUEKR6Vh
6784,40,37,-78.9999,Lina,L,wbbsNox,00PQ1,28.0,,Bremerhaven,"[Karin, Lina, Bernhard, Christa, Steffen]",1990-08-05 12:57:57.373496832,"[Wolfgang, Luisa, Tanja]"
7179,50,103,88.0001,Gerd,G,üZäU,0ACBE,54.0,0.0,Bremen,"[Yvonne, Tanja, Adolf, Melina, Hanna, Erna, To...",1948-01-15 09:23:11.883444224,"[Hanna, Natalie, Sven, Ella, Benedikt, Elke]"
7988,90,119,218.0001,Tanja,T,Y80NIJ,1rK1j,85.0,0.5691,Bremerhaven,"[Rita, Tanja, Anja, Barbara, Anna, Natalie, He...",NaT,1924-10-30 04:42:19.017817088
8836,60,11,-232.9999,Tanja,T,Rz7s0C,üümTV8Ä,0.0,,,"[Gerhard, Louisa, Tom, Hanna, Stephanie, Axel,...",1939-05-19 21:08:41.318045184,1911-03-08 19:20:45.220207616
9998,0,-39,-458.9999,Fynn,F,uvzöas,mJ7VSR,57.0,,,"[Jana, Ella, Anna, Anja, Jana, Simone, Florian]",1932-06-06 21:39:15.537562112,2017-12-22 15:02:26.963902976
9999,50,126,-8.9999,Gisela,G,1BSvuud,,,0.2701,Bremen,,NaT,"[Bärbel, Nadine, Anna, Marlene, Marion, Natali..."


## analyse_datatypes()

In [4]:
?pak.analyse_datatypes

[31mSignature:[39m pak.analyse_datatypes(df, with_index=[38;5;28;01mTrue[39;00m, human_readable=[38;5;28;01mTrue[39;00m)
[31mDocstring:[39m Returns info about the datatypes and the mem_usage of the columns of a DataFrame.  
[31mFile:[39m      d:\dropbox\31_projekte\01_python\88_pycharm\pandasklar\src\pandasklar\analyse.py
[31mType:[39m      function

In [5]:
a = pak.analyse_datatypes(df)
a

Unnamed: 0_level_0,col_name,datatype_instance,datatype,datatype_short,is_numeric,is_string,is_datetime,is_hashable,nan_allowed,mem_usage
col_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,__index__,int64,np.int64,int64,True,False,False,True,False,80.0 B
1,int_grob,intc,np.int32,int32,True,False,False,True,False,40.0 B
2,int_fein,intc,np.int32,int32,True,False,False,True,False,40.0 B
3,float_summe,float64,np.float64,float64,True,False,False,True,True,80.0 B
4,first_name,str,pd.string,string,False,True,False,True,True,545.0 B
5,Letter1,str,pd.string,string,False,True,False,True,True,500.0 B
6,string_nonan,str,pd.string,string,False,True,False,True,True,660.0 B
7,string_nan,str,pd.string,string,False,True,False,True,True,602.0 B
8,int_nan,int64,pd.Int64,Int64,True,False,False,True,True,90.0 B
9,float_nan,float64,np.float64,float64,True,False,False,True,True,80.0 B


## analyse_values()

In [6]:
?pak.analyse_values

[31mSignature:[39m
pak.analyse_values(
    data,
    as_list=[38;5;28;01mFalse[39;00m,
    as_dict=[38;5;28;01mFalse[39;00m,
    sort=[38;5;28;01mFalse[39;00m,
    with_index=[38;5;28;01mTrue[39;00m,
    nanless_ints=[38;5;28;01mFalse[39;00m,
)
[31mDocstring:[39m Returns statistical data for a DataFrame, a Series or an Index     
[31mFile:[39m      d:\dropbox\31_projekte\01_python\88_pycharm\pandasklar\src\pandasklar\analyse.py
[31mType:[39m      function

In [7]:
pak.analyse_values(df)

Unnamed: 0_level_0,col_name,ntypes,nunique,nnan,ndups,n,vmin,vmean,vmedian,vmax,vsum,datatype_suggest,datatype_identified
col_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,__index__,1,10000,0,0,10000,0.0,4999.5,4999.5,9999.0,49995000.0,np.int16,int
1,int_grob,1,10,0,9990,10000,0.0,44.79,40.0,90.0,447900.0,pd.Int8,int
2,int_fein,1,255,0,9745,10000,-127.0,0.53,1.0,127.0,5285.0,pd.Int8,int
3,float_summe,1,824,0,9176,10000,-623.0,-197.82,-198.0,218.0,-1978154.0,np.float32,float
4,first_name,1,435,0,9565,10000,Adolf,,,Zoe,,,string
5,Letter1,1,24,0,9976,10000,A,,,Z,,,string
6,string_nonan,1,10000,0,0,10000,005T7I,,,üüDe,,,string
7,string_nan,1,9049,951,0,10000,00PQ1,,,üümTV8Ä,,,string
8,int_nan,1,128,960,8912,10000,0.0,63.59,64.0,127.0,574840.0,pd.Int8,int
9,float_nan,1,8069,1930,1,10000,0.0,0.5,0.5,1.0,4042.36,np.float32,float


## analyse_cols()

In [8]:
?pak.analyse_cols

[31mSignature:[39m pak.analyse_cols(df, sort=[38;5;28;01mFalse[39;00m, with_index=[38;5;28;01mTrue[39;00m, human_readable=[38;5;28;01mTrue[39;00m)
[31mDocstring:[39m
Describes the datatypes and the content of a DataFrame.
Merged info from analyse_datatypes and analyse_values.
[31mFile:[39m      d:\dropbox\31_projekte\01_python\88_pycharm\pandasklar\src\pandasklar\analyse.py
[31mType:[39m      function

In [9]:
pak.analyse_cols(df)

Unnamed: 0,col_name,datatype_instance,datatype,datatype_short,datatype_suggest,datatype_identified,is_numeric,is_string,is_datetime,is_hashable,...,ntypes,nunique,nnan,ndups,n,vmin,vmean,vmedian,vmax,vsum
0,__index__,int64,np.int64,int64,np.int16,int,True,False,False,True,...,1,10000,0,0,10000,0.0,4999.5,4999.5,9999.0,49995000.0
1,int_grob,intc,np.int32,int32,pd.Int8,int,True,False,False,True,...,1,10,0,9990,10000,0.0,44.79,40.0,90.0,447900.0
2,int_fein,intc,np.int32,int32,pd.Int8,int,True,False,False,True,...,1,255,0,9745,10000,-127.0,0.53,1.0,127.0,5285.0
3,float_summe,float64,np.float64,float64,np.float32,float,True,False,False,True,...,1,824,0,9176,10000,-623.0,-197.82,-198.0,218.0,-1978154.0
4,first_name,str,pd.string,string,,string,False,True,False,True,...,1,435,0,9565,10000,Adolf,,,Zoe,
5,Letter1,str,pd.string,string,,string,False,True,False,True,...,1,24,0,9976,10000,A,,,Z,
6,string_nonan,str,pd.string,string,,string,False,True,False,True,...,1,10000,0,0,10000,005T7I,,,üüDe,
7,string_nan,str,pd.string,string,,string,False,True,False,True,...,1,9049,951,0,10000,00PQ1,,,üümTV8Ä,
8,int_nan,int64,pd.Int64,Int64,pd.Int8,int,True,False,False,True,...,1,128,960,8912,10000,0.0,63.59,64.0,127.0,574840.0
9,float_nan,float64,np.float64,float64,np.float32,float,True,False,False,True,...,1,8069,1930,1,10000,0.0,0.5,0.5,1.0,4042.36


## change_datatype()

In [10]:
?pak.change_datatype

[31mSignature:[39m
pak.change_datatype(
    data,
    search=[38;5;28;01mNone[39;00m,
    verbose=[38;5;28;01mNone[39;00m,
    msg=[33m''[39m,
    category_maxsize=-[32m1[39m,
    nanless_ints=[38;5;28;01mFalse[39;00m,
)
[31mDocstring:[39m
Converts the datatypes of a DataFrame or a Series.
If used with a Series:    
Similar behavior as pandas astype. But it also accepts
sloppy class names like type_info knows.
If no target datatype is specified, it will be selected automatically.
If used with a DataFrame:
Converts all datatypes automatically.                      

* category_maxsize: How big can a category get to be suggested as datatype_suggest?
* nanless_ints: Are numpy's integer classes (that don't know NaN) suggested as datatype_suggest?    
[31mFile:[39m      d:\dropbox\31_projekte\01_python\88_pycharm\pandasklar\src\pandasklar\analyse.py
[31mType:[39m      function

In [11]:
# example Series, before
pak.analyse_datatypes(df.int_fein)

Unnamed: 0,col_name,datatype_instance,datatype,datatype_short,is_numeric,is_string,is_datetime,is_hashable,nan_allowed,mem_usage
0,int_fein,intc,np.int32,int32,True,False,False,True,False,39.1 KB


In [12]:
# example Series, after 
a = pak.change_datatype(df.int_fein)
pak.analyse_datatypes(a)

int_fein             --> pd.Int8   


Unnamed: 0,col_name,datatype_instance,datatype,datatype_short,is_numeric,is_string,is_datetime,is_hashable,nan_allowed,mem_usage
0,int_fein,int8,pd.Int8,Int8,True,False,False,True,True,19.5 KB


In [13]:
# whole DataFrame
df2 = pak.change_datatype(df)
pak.analyse_cols(df2)

change_datatype 
int_grob             --> pd.Int8   
int_fein             --> pd.Int8   
float_summe          --> np.float32
int_nan              --> pd.Int8   
float_nan            --> np.float32
City                 --> pd.string 
change_datatype before: 4.8 MB after: 4.7 MB



Unnamed: 0,col_name,datatype_instance,datatype,datatype_short,datatype_suggest,datatype_identified,is_numeric,is_string,is_datetime,is_hashable,...,ntypes,nunique,nnan,ndups,n,vmin,vmean,vmedian,vmax,vsum
0,__index__,int64,np.int64,int64,np.int16,int,True,False,False,True,...,1,10000,0,0,10000,0.0,4999.5,4999.5,9999.0,49995000.0
1,int_grob,int8,pd.Int8,Int8,,int,True,False,False,True,...,1,10,0,9990,10000,0.0,44.79,40.0,90.0,447900.0
2,int_fein,int8,pd.Int8,Int8,,int,True,False,False,True,...,1,255,0,9745,10000,-127.0,0.53,1.0,127.0,5285.0
3,float_summe,float32,np.float32,float32,,float,True,False,False,True,...,1,824,0,9176,10000,-623.0,-197.82,-198.0,218.0,-1978154.12
4,first_name,str,pd.string,string,,string,False,True,False,True,...,1,435,0,9565,10000,Adolf,,,Zoe,
5,Letter1,str,pd.string,string,,string,False,True,False,True,...,1,24,0,9976,10000,A,,,Z,
6,string_nonan,str,pd.string,string,,string,False,True,False,True,...,1,10000,0,0,10000,005T7I,,,üüDe,
7,string_nan,str,pd.string,string,,string,False,True,False,True,...,1,9049,951,0,10000,00PQ1,,,üümTV8Ä,
8,int_nan,int8,pd.Int8,Int8,,int,True,False,False,True,...,1,128,960,8912,10000,0.0,63.59,64.0,127.0,574840.0
9,float_nan,float32,np.float32,float32,,float,True,False,False,True,...,1,8069,1930,1,10000,0.0,0.5,0.5,1.0,4042.36


## copy_datatype()

In [14]:
?pak.copy_datatype

[31mSignature:[39m pak.copy_datatype(data_to, data_from)
[31mDocstring:[39m
Copies the dtypes from data_from to data_to. 
Usable for Series and DataFrames.
When applied on a DataFrame, it's applied to all column names that match.
[31mFile:[39m      d:\dropbox\31_projekte\01_python\88_pycharm\pandasklar\src\pandasklar\pandas.py
[31mType:[39m      function

In [15]:
# Target
# A copy of df that keeps the original datatypes and lacks the column 'int_fein'.
df1 = df.copy()                      # Wrong datatypes
df1 = pak.drop_cols(df1,'int_fein')  # One column is missing

# Template
# df with automatically converted datatypes, lacking the column 'City'.
df2 = pak.change_datatype(df)        # Target datatypes
df2 = pak.drop_cols(df2,'City')      # One column is missing

change_datatype 
int_grob             --> pd.Int8   
int_fein             --> pd.Int8   
float_summe          --> np.float32
int_nan              --> pd.Int8   
float_nan            --> np.float32
City                 --> pd.string 
change_datatype before: 4.8 MB after: 4.7 MB



In [16]:
result = pak.copy_datatype(df1, df2)

# Result: All columns except 'City' were treated
pak.analyse_cols(result)

Unnamed: 0,col_name,datatype_instance,datatype,datatype_short,datatype_suggest,datatype_identified,is_numeric,is_string,is_datetime,is_hashable,...,ntypes,nunique,nnan,ndups,n,vmin,vmean,vmedian,vmax,vsum
0,__index__,int64,np.int64,int64,np.int16,int,True,False,False,True,...,1,10000,0,0,10000,0.0,4999.5,4999.5,9999.0,49995000.0
1,int_grob,int8,pd.Int8,Int8,,int,True,False,False,True,...,1,10,0,9990,10000,0.0,44.79,40.0,90.0,447900.0
2,float_summe,float32,np.float32,float32,,float,True,False,False,True,...,1,824,0,9176,10000,-623.0,-197.82,-198.0,218.0,-1978154.12
3,first_name,str,pd.string,string,,string,False,True,False,True,...,1,435,0,9565,10000,Adolf,,,Zoe,
4,Letter1,str,pd.string,string,,string,False,True,False,True,...,1,24,0,9976,10000,A,,,Z,
5,string_nonan,str,pd.string,string,,string,False,True,False,True,...,1,10000,0,0,10000,005T7I,,,üüDe,
6,string_nan,str,pd.string,string,,string,False,True,False,True,...,1,9049,951,0,10000,00PQ1,,,üümTV8Ä,
7,int_nan,int8,pd.Int8,Int8,,int,True,False,False,True,...,1,128,960,8912,10000,0.0,63.59,64.0,127.0,574840.0
8,float_nan,float32,np.float32,float32,,float,True,False,False,True,...,1,8069,1930,1,10000,0.0,0.5,0.5,1.0,4042.36
9,City,str,object,object,pd.string,string,False,False,False,True,...,1,2,3025,6973,10000,Bremen,,,Bremerhaven,


## type_info()

In [17]:
?pak.type_info

[31mInit signature:[39m pak.type_info(search)
[31mDocstring:[39m     
Provides information about pandas types and standardises them.
Is initialised with anything, e.g. with the name of a class, or with the class itself.
Or, even better, with a series.
Ex:   i = type_info('Int32')     
      i.info()            # returns all attributes, including for example:
      i.class_object      # the class object
      i.name              # the name of the Dtype
      i.name_instance     # type of the contents of the series
      i.instance1         # an example instance that is not NaN              
[31mFile:[39m           d:\dropbox\31_projekte\01_python\88_pycharm\pandasklar\src\pandasklar\type_info.py
[31mType:[39m           type
[31mSubclasses:[39m     

In [18]:
# Call with class name
i = pak.type_info('uint32')     
i.info()

{'instance1': None,
 'instance2': None,
 'name': 'np.uint32',
 'framework': 'np',
 'name_short': 'uint32',
 'name_long': 'np.uint32',
 'class_object': numpy.uint32,
 'is_hashable': True,
 'nan_allowed': False,
 'name_instance': '',
 'xmin': 0,
 'xmax': 4294967295}

In [19]:
# Call with class name
i = pak.type_info('intc')     
i.info()

{'instance1': None,
 'instance2': None,
 'name': 'np.intc',
 'framework': 'np',
 'name_short': 'intc',
 'name_long': 'np.intc',
 'class_object': numpy.intc,
 'is_hashable': True,
 'nan_allowed': False,
 'name_instance': '',
 'xmin': -2147483648,
 'xmax': 2147483647}

In [20]:
# Call with class name
i = pak.type_info('category')     
i.info()

{'instance1': None,
 'instance2': None,
 'name': 'pd.category',
 'framework': 'pd',
 'name_short': 'category',
 'name_long': 'pd.Categorical',
 'class_object': pandas.core.arrays.categorical.Categorical,
 'is_hashable': True,
 'nan_allowed': True,
 'name_instance': '',
 'xmin': None,
 'xmax': None}

In [21]:
# Call with sloppy class name
# `i` is reassigned by the second call, so i.info() below reports only the
# result for 'str'; the result for 'pd.String' is not displayed.
i = pak.type_info('pd.String')     
i = pak.type_info('str')  
i.info()

{'instance1': None,
 'instance2': None,
 'name': 'pd.string',
 'framework': 'pd',
 'name_short': 'string',
 'name_long': 'pd.StringDtype',
 'class_object': pandas.core.arrays.string_.StringDtype,
 'is_hashable': True,
 'nan_allowed': True,
 'name_instance': '',
 'xmin': None,
 'xmax': None}

In [22]:
# Call with class name
i = pak.type_info('StringDtype')     
i.info()

{'instance1': None,
 'instance2': None,
 'name': 'pd.string',
 'framework': 'pd',
 'name_short': 'string',
 'name_long': 'pd.StringDtype',
 'class_object': pandas.core.arrays.string_.StringDtype,
 'is_hashable': True,
 'nan_allowed': True,
 'name_instance': '',
 'xmin': None,
 'xmax': None}

In [23]:
# Call with class name
i = pak.type_info('float16')     
i.info()

{'instance1': None,
 'instance2': None,
 'name': 'np.float16',
 'framework': 'np',
 'name_short': 'float16',
 'name_long': 'np.float16',
 'class_object': numpy.float16,
 'is_hashable': True,
 'nan_allowed': True,
 'name_instance': '',
 'xmin': np.float16(-65500.0),
 'xmax': np.float16(65500.0)}

In [24]:
# Call with class 
i = pak.type_info(np.int16)     
i.info()

{'instance1': None,
 'instance2': None,
 'name': 'np.int16',
 'framework': 'np',
 'name_short': 'int16',
 'name_long': 'np.int16',
 'class_object': numpy.int16,
 'is_hashable': True,
 'nan_allowed': False,
 'name_instance': '',
 'xmin': -32768,
 'xmax': 32767}

In [25]:
# Call with class
i = pak.type_info(pd.Int64Dtype)     
i.info()

{'instance1': None,
 'instance2': None,
 'name': 'pd.Int64',
 'framework': 'pd',
 'name_short': 'Int64',
 'name_long': 'pd.Int64Dtype',
 'class_object': pandas.core.arrays.integer.Int64Dtype,
 'is_hashable': True,
 'nan_allowed': True,
 'name_instance': '',
 'xmin': -9223372036854775808,
 'xmax': 9223372036854775807}

In [26]:
# Call with Series
pak.type_info(df.List).info()

{'instance1': ['Anette',
  'Anja',
  'Peter',
  'Arthur',
  'Hanna',
  'Anna',
  'Sascha',
  'Oskar'],
 'instance2': ['Jana', 'Ella', 'Anna', 'Anja', 'Jana', 'Simone', 'Florian'],
 'name': 'object',
 'framework': '',
 'name_short': 'object',
 'name_long': 'object',
 'class_object': object,
 'is_hashable': False,
 'nan_allowed': True,
 'name_instance': 'list',
 'xmin': None,
 'xmax': None}

# Playground (Spielwiese)

In [27]:
import pytest
import pandas as pd
import numpy as np
from pandasklar.analyse import change_datatype
from bpyth import rtype
import pandasklar as pak
import random

In [29]:
from pandas.api.types import is_string_dtype, is_numeric_dtype

In [30]:
def generate_random_data_tcdt(seed=42, anz=1000):
    """
    Generate random test data for change_datatype using a fixed seed.

    Both `random` and `np.random` are seeded with `seed` before thirteen
    series are drawn with `pak.random_series` and combined into one
    DataFrame.

    Args:
        seed (int): Seed passed to `random.seed` and `np.random.seed`.
        anz (int): Size handed to every `pak.random_series` call.

    Returns:
        pd.DataFrame: A DataFrame with the columns int_grob, int_fein,
        float_summe, first_name, Letter1, string_nonan, string_nan,
        int_nan, float_nan, City, List, time and Mix.
    """
    random.seed(seed)
    np.random.seed(seed)

    def draw(kind, **options):
        # Every series is drawn with the same size `anz`.
        return pak.random_series(anz, kind, **options)

    # The draws stay in this exact order so the sequence of random calls is unchanged.
    base_ints = draw('int', min=-500, max=100)
    fine_ints = draw('int', min=-127, max=127, p_dup=0)  # no duplicates allowed
    float_sum = base_ints + fine_ints + 0.0001  # uses the un-coarsened base values
    coarse_ints = base_ints % 10 * 10
    names = draw('name', p_nan=0)
    initials = names.str[:1]
    strings_full = draw('string', p_nan=0)
    strings_gappy = draw('string', p_nan=0.1)
    ints_gappy = draw('int', min=0, max=127, p_nan=0.1)
    floats_gappy = draw('float', decimals=4, p_nan=0.2)
    cities = draw('choice', choice=['Bremen', 'Bremerhaven'], p_nan=0.3, p_dup=0)
    lists = draw('list', p_nan=0.1, p_dup=0.5)
    times = draw('time', p_nan=0.1, p_dup=0.5)
    mixed = draw('mix', p_nan=0.01, p_dup=0)

    # Column name -> series; the insertion order defines the column order.
    named_series = {
        'int_grob': coarse_ints,
        'int_fein': fine_ints,
        'float_summe': float_sum,
        'first_name': names,
        'Letter1': initials,
        'string_nonan': strings_full,
        'string_nan': strings_gappy,
        'int_nan': ints_gappy,
        'float_nan': floats_gappy,
        'City': cities,
        'List': lists,
        'time': times,
        'Mix': mixed,
    }

    frame = pak.dataframe(list(named_series.values()), verbose=False)
    frame.columns = list(named_series)
    frame['float_summe'] = frame['float_summe'].astype('float')
    return frame

In [53]:
        s1 = pd.Series([1, 2, 3], name='test')
        s2 = pd.Series([3, 2, 1], name='test')

In [54]:
# Dtype round trip:
# build two frames with the same default seed, convert the datatypes of one
# automatically, copy those dtypes onto the untouched frame (result), then copy
# the dtypes of the untouched frame back onto that result (result2).
df = generate_random_data_tcdt()
dfc= generate_random_data_tcdt()
dfc = change_datatype(dfc)
result = pak.copy_datatype(df, dfc)
result2 = pak.copy_datatype(result, df)

change_datatype 
int_grob             --> pd.Int8   
int_fein             --> pd.Int8   
float_summe          --> np.float32
int_nan              --> pd.Int8   
float_nan            --> np.float32
City                 --> pd.string 
change_datatype before: 492.5 KB after: 481.4 KB



In [57]:
        df = generate_random_data_tcdt()
        result2 = change_datatype(df)

change_datatype 
int_grob             --> pd.Int8   
int_fein             --> pd.Int8   
float_summe          --> np.float32
int_nan              --> pd.Int8   
float_nan            --> np.float32
City                 --> pd.string 
change_datatype before: 492.5 KB after: 481.4 KB



In [65]:
comparison = pak.compare_dataframes(df, result2, decimals=None)
comparison

Unnamed: 0,name,dtype,nnan,nan_pat,content,sort,eq
int_grob,True,False,True,True,True,True,True
int_fein,True,False,True,True,True,True,True
float_summe,True,False,True,True,False,True,False
first_name,True,True,True,True,True,True,True
Letter1,True,True,True,True,True,True,True
string_nonan,True,True,True,True,True,True,True
string_nan,True,True,True,True,True,True,True
int_nan,True,False,True,True,True,True,True
float_nan,True,False,True,True,True,True,False
City,True,False,True,True,True,True,True


In [61]:
s = df.int_grob
t = result.int_grob

In [62]:
# For each series: replace the index by 0..n-1, sort by value, and keep the
# resulting index order, i.e. the original row positions in ascending value order.
index_s = s.reset_index(drop=True).sort_values().index.to_list()
index_t = t.reset_index(drop=True).sort_values().index.to_list()

In [64]:
index_t

[688,
 612,
 828,
 829,
 601,
 600,
 575,
 96,
 572,
 228,
 93,
 851,
 853,
 214,
 856,
 233,
 564,
 556,
 554,
 81,
 535,
 77,
 875,
 525,
 257,
 266,
 506,
 230,
 502,
 212,
 112,
 711,
 738,
 704,
 166,
 167,
 693,
 680,
 759,
 671,
 765,
 133,
 771,
 110,
 668,
 648,
 645,
 638,
 633,
 629,
 202,
 627,
 803,
 807,
 116,
 115,
 811,
 649,
 898,
 153,
 459,
 327,
 454,
 453,
 41,
 336,
 444,
 440,
 948,
 435,
 14,
 337,
 334,
 331,
 30,
 967,
 978,
 394,
 972,
 401,
 22,
 27,
 396,
 332,
 460,
 969,
 2,
 390,
 9,
 287,
 5,
 477,
 988,
 359,
 990,
 55,
 346,
 479,
 905,
 903,
 184,
 720,
 626,
 345,
 330,
 642,
 171,
 364,
 640,
 683,
 681,
 386,
 187,
 179,
 675,
 695,
 494,
 211,
 205,
 533,
 478,
 722,
 254,
 522,
 517,
 547,
 510,
 283,
 269,
 503,
 480,
 500,
 497,
 265,
 473,
 466,
 558,
 625,
 207,
 618,
 615,
 485,
 409,
 414,
 422,
 310,
 427,
 599,
 452,
 571,
 297,
 236,
 328,
 529,
 353,
 736,
 793,
 809,
 815,
 849,
 88,
 859,
 83,
 867,
 872,
 69,
 68,
 890,
 65,
 64,
 6