__[Open and try this file online (Colab)](https://colab.research.google.com/github/djekra/pandasklar/blob/master/jupyter/24_Create_DataFrames_Easily.ipynb)__

# Create DataFrames Easily
* `dataframe`: Converts multidimensional objects into dataframes. Dictionaries and Tuples are interpreted column-wise, Lists and Counters by rows.

In [25]:
# blab init
try:
    import blab
except ImportError as e:
    !pip install blab
    import blab    
startup_notebook = blab.blab_startup()
%run $startup_notebook 

blab init
environment['in_colab']     = False
environment['dropbox_path'] = /media/me/LinuxDropbox/Dropbox
environment['lib_path']     = /media/me/LinuxDropbox/Dropbox/31_Projekte/01_Python/libs
Start Time: 17:40:50


In [26]:
import numpy      as np
import pandas     as pd 
import polars     as pl
import datetime   as dt
import bpyth      as bpy

# pandasklar
try:
    import pandasklar as pak 
except ImportError as e:
    !pip install pandasklar
    import pandasklar as pak   
    
# Config
pak.Config.set('VERBOSE', True)
pak.Config.set('FRAMEWORK', 'polars')

# copy_on_write
pd.set_option("mode.copy_on_write", True)

VERBOSE = True
--> setting parameter verbose=True as default for all pandasklar functions

FRAMEWORK = polars
--> setting parameter framework=polars as default for all pandasklar functions



## dataframe()

In [27]:
help(pak.dataframe)

<span style="font-size:larger; margin-top: 15px; display: block;">**dataframe**(inp, verbose=None, framework=None):</span>

Converts various multidimensional objects into DataFrames (Pandas or Polars).

This function intelligently transforms a wide range of input data structures into either a
Pandas DataFrame or a Polars DataFrame, based on the specified `framework`. It automatically
infers the intended structure (row-wise or column-wise) and handles various data types.

**Input Handling:**

The function can handle the following input types:

- **Scalars:** Single values (int, float, str, bool) are converted into a 1x1 DataFrame.
- **Lists:**
    - One-dimensional lists are interpreted as a single row.
    - Multidimensional lists (list of lists, list of tuples) are interpreted row-wise,
      where each inner list/tuple represents a row.
    - A list of Series is interpreted column-wise.
- **Tuples:**
    - One-dimensional tuples are interpreted as a single row.
    - Multidimensional tuples (tuple of lists, tuple of tuples) are interpreted column-wise,
      where each inner list/tuple represents a column.
    - A tuple of Series is interpreted column-wise.
- **Dictionaries:**
    - One-dimensional dictionaries are interpreted as a single row.
    - Multidimensional dictionaries are interpreted column-wise, where keys become column names.
    - Lists of dictionaries are interpreted row-wise.
- **Pandas Series** (pd.Series): Interpreted as a single column.
- **Polars Series** (pl.Series): Interpreted as a single column.
- **NumPy ndarrays** (np.ndarray): Interpreted as a single column.
- **Lists/Tuples of Series/ndarrays:** Interpreted column-wise, where each Series/ndarray represents a column.

**Key Features:**
- **Automatic Structure Inference:** The function automatically determines whether the input data
  should be interpreted row-wise or column-wise, based on the input type.
- **Column Name Handling:** Sensible column names are automatically assigned if not provided.
  Duplicate column names or numeric column names are replaced with letters (A, B, C, ...).
- **Framework Flexibility:** Supports both Pandas and Polars DataFrames.
- **Series/ndarray Handling:** Correctly handles Pandas Series, Polars Series, and NumPy ndarrays as columns.
- **Robustness:** Handles various edge cases, including empty inputs and mixed data types.

**Args:**
- `inp`: The input object to be converted into a DataFrame. Can be any of the types listed above.
- `verbose` (bool, optional): If True, prints detailed information about the input object and the
        conversion process. Defaults to the value of `Config.get('VERBOSE')`.
- `framework` (str, optional): Specifies the desired DataFrame framework. Must be either 'pandas'
        or 'polars'. Defaults to the value of `Config.get('FRAMEWORK')`.

**Returns:**
- pandas.DataFrame or polars.DataFrame: The resulting DataFrame, based on the specified `framework`.

### Rowwise

In [28]:
# 2 dimensions 
inp = [ list('Hallo'), 
        list('Welt!'), 
        list('Coole'),  
        list('Sache'),        
      ]

In [29]:
out = pak.dataframe(inp, framework='pandas')
out

dataframe: Input rtype=('list', 'list', 'str') shape=(4, 5)


Unnamed: 0,A,B,C,D,E
0,H,a,l,l,o
1,W,e,l,t,!
2,C,o,o,l,e
3,S,a,c,h,e


In [30]:
# 2 dimensions
inp = [[1,2,3],[4,5,6], [7, 8, 9], [10, 11, 12]]
out = pak.dataframe(inp, framework='polars')
out

dataframe: Input rtype=('list', 'list', 'int') shape=(4, 3)


A,B,C
i64,i64,i64
1,2,3
4,5,6
7,8,9
10,11,12


### Columnwise

In [31]:
# Tuple of lists
Number = [1,2,3,4,6]
L1     = ['a','v','vvvv','e','Q']
L2     = [100, 55, 315, 68, 23]
L3     = ['18%','105%','56%','12%','4%']
inp = (Number, L1, L2, L3)

df = pak.dataframe(inp, framework='pandas')
df

dataframe: Input rtype=('tuple', 'list', 'int') shape=(4, 5)


Unnamed: 0,A,B,C,D
0,1,a,100,18%
1,2,v,55,105%
2,3,vvvv,315,56%
3,4,e,68,12%
4,6,Q,23,4%


In [32]:
# dict of lists
inp = {'AA': [1,1,1,1],
       'BB': [2,4,8,16],
       'CC': [3,6,9,12],
       'DD': [4,4,4,4],
       'EE': [5,10,15,20],                
               }
out = pak.dataframe(inp)
out

dataframe: Input rtype=('dict', 'list', 'int') shape=(5, 4)
dataframe: dict


AA,BB,CC,DD,EE
i64,i64,i64,i64,i64
1,2,3,4,5
1,4,6,4,10
1,8,9,4,15
1,16,12,4,20


In [33]:
# dict of tuples
inp = {'AA': (1,1,1,1   ),
       'BB': (2,4,8,16  ),
       'CC': (3,6,9,12  ),
       'DD': (4,4,4,4   ),
       'EE': (5,10,15,20),                
               }
out = pak.dataframe(inp)
out

dataframe: Input rtype=('dict', 'tuple', 'int') shape=(5, 4)
dataframe: dict


AA,BB,CC,DD,EE
i64,i64,i64,i64,i64
1,2,3,4,5
1,4,6,4,10
1,8,9,4,15
1,16,12,4,20


In [34]:
# list of dicts
inp = [ {'AA': 1, 'BB': 2,  'CC': 3, },
        {'AA': 1, 'BB': 4,  'CC': 9, },
        {'AA': 1, 'BB': 16, 'CC': 81, } ]    
out = pak.dataframe(inp)
out

dataframe: Input rtype=('list', 'dict', 'int') shape=(3, 3)


AA,BB,CC
i64,i64,i64
1,2,3
1,4,9
1,16,81


In [35]:
# dict of dicts
inp ={
 0: {'AA':  5, 'BB': 0, 'CC': 3, 'DD': 3},
 1: {'AA': 10, 'BB': 0, 'CC': 3, 'DD': 5},
 2: {'AA': 15, 'BB': 0, 'CC': 7, 'DD': 6}}
out = pak.dataframe(inp, framework='polars')
out

dataframe: Input rtype=('dict', 'dict', 'int') shape=(3, 4)
dataframe: dict


0,1,2
i64,i64,i64
5,10,15
0,0,0
3,3,7
3,5,6


In [36]:
# dict of mix
inp = { 'AA': np.array([-77] * 4, dtype='int32'),
        'BB': pd.Categorical(["test", "train", "test", "train"]),
        'CC': pd.Series(1, index=list(range(4)), dtype='float32'),
        'DD': 'foo', 
      }
out = pak.dataframe(inp)
out

dataframe: Input rtype=('dict', 'ndarray', 'int') shape=(-77, -77)
dataframe: dict


AA,BB,CC,DD
i32,cat,f32,str
-77,"""test""",1.0,"""foo"""
-77,"""train""",1.0,"""foo"""
-77,"""test""",1.0,"""foo"""
-77,"""train""",1.0,"""foo"""


In [37]:
# Linear Series
inp = {'AA': np.linspace(0, 10), 
       'BB': np.linspace(-10, 0) }
out = pak.dataframe(inp)
out.head(4)

dataframe: Input rtype=('dict', 'ndarray', 'float') shape=(2, 50)
dataframe: dict


AA,BB
f64,f64
0.0,-10.0
0.204082,-9.795918
0.408163,-9.591837
0.612245,-9.387755


### Columnwise: list or tuple of Series

In [38]:
?pak.random_series

[31mSignature:[39m pak.random_series(size, typ, framework=[38;5;28;01mNone[39;00m, **kwargs)
[31mDocstring:[39m
Returns a series of random data. 
* size
* typ: 'int', 'float', 'string', 'name', 'choice', 'list', 'time', mix',
       'ascending', 'descending', 'perlin' or 'errorprone'. Or the first letter of this.
       'name' generates random first names, 'list' generates lists of random first names.
       'mix' generates mixed datatypes. 
       'ascending', 'descending' and 'perlin' generates ordered random sequences.
       'errorprone' generates sequences of NaNs, 0, 1 with similar index. Useful for testing. 

The other arguments are passed to the appropriate functions for the type of random data.
General arguments are:
* name
* p_nan: value 0..1 specifies  how many NaNs are interspersed
* p_dup: value 0..1 determines how many Dups are included.

There are extra parameters for some types of random data:
- int:    min, max: closed interval, min and max are both possible valu

In [39]:
# Try
pak.random_series( 10, 'int')

rnd_int
i64
973
124
505
765
233
200
431
716
613
923


In [40]:
# 
anz = 10
a = pak.random_series( anz, 'name',                                                             name='first_name' )
b = pak.random_series( anz, 'int',    min=20, max=30) + pak.random_series( anz, 'int', min=0, max=12) 
b.rename('age')
c = pak.random_series( anz, 'choice', choice=['Bremen','Berlin'],      p_nan=0.3,   p_dup=0,    name='birtplace')
d = pak.random_series( anz, 'int',    min=10000, max=99999,            p_nan=0.02,  p_dup=0.3,  name='zip')
e = pak.random_series( anz, 'string', len_min=5, len_max=10,           p_nan=0,     p_dup=0,    name='secret')
f = pak.random_series( anz, 'string', len_min=0, len_max=5,            p_nan=0,     p_dup=0.2,  name='features')
if pak.Config.get('FRAMEWORK') == 'pandas':
    f = f.apply(set) 
else:
    f = f.map_elements(set, return_dtype=pl.Object)
g = pak.random_series( anz, 'choice', choice=['ABC','ABCC','','abc','cba','Ax','AAA','ACCB','bbab'],  name='history')
if pak.Config.get('FRAMEWORK') == 'pandas':
    g = g.apply(list) 
else:
    g = g.map_elements(list, return_dtype=pl.List(pl.Utf8))
inp = [a,b,c,d,e,f,g]
pak.dataframe(inp)

dataframe: Input rtype=('list', 'Series', 'str') shape=(-77, -77)
dataframe: list or tuple of ndarray or Series


first_name,rnd_int,birtplace,zip,secret,features,history
str,i64,str,i64,str,object,list[str]
"""Carolin""",37,"""Berlin""",17139.0,"""BaFiqÜG""","{'v', 'g', 'K', 'f', 'T'}","[""A"", ""B"", ""C""]"
"""Swen""",23,,30923.0,"""ZVnQsu""",set(),"[""A"", ""A"", ""A""]"
"""Walter""",26,,,"""FwlCXI""",{'p'},"[""A"", ""x""]"
"""Fabian""",34,"""Bremen""",92175.0,"""mjöVeCaupA""",{'d'},"[""c"", ""b"", ""a""]"
"""Tom""",38,"""Berlin""",52054.0,"""ücDPZwoÖn""","{'F', 'l', 'V', 'G'}","[""A"", ""B"", … ""C""]"
"""Till""",26,"""Berlin""",43430.0,"""BuOUq2Yg""","{'Z', 'y', 't', 'e', '0'}",[]
"""Anna""",23,,66523.0,"""n21Uqlee""","{'X', 'p'}","[""A"", ""C"", … ""B""]"
"""Georg""",33,"""Berlin""",43430.0,"""uvKSZt""","{'i', 'm'}","[""b"", ""b"", … ""b""]"
"""Tanja""",38,"""Bremen""",20970.0,"""yoÖG5Ivul2""",{'d'},"[""a"", ""b"", ""c""]"
"""Lisa""",36,"""Bremen""",17139.0,"""UtafQBCwy""","{'i', 'm'}","[""A"", ""A"", ""A""]"


### Create DataFrame from Counter

In [41]:
from collections import Counter
import re
text = "Dies ist ein Testtext, der nicht nur ein Wort mehrfach enthält. Testtext Testtext."
text = re.sub(r'[^\w\s]', '', text)  # Satzzeichen entfernen
word_counts = Counter(text.lower().split())
data = word_counts.most_common()
data

[('testtext', 3),
 ('ein', 2),
 ('dies', 1),
 ('ist', 1),
 ('der', 1),
 ('nicht', 1),
 ('nur', 1),
 ('wort', 1),
 ('mehrfach', 1),
 ('enthält', 1)]

In [42]:
pak.dataframe(data)

dataframe: Input rtype=('list', 'tuple', 'str') shape=(10, 2)


A,B
str,i64
"""testtext""",3
"""ein""",2
"""dies""",1
"""ist""",1
"""der""",1
"""nicht""",1
"""nur""",1
"""wort""",1
"""mehrfach""",1
"""enthält""",1


## Spielwiese