# Useful Python Script Bits

In this notebook one will find some usefull Python functions to be re-used throughout projects.

In [39]:
import colorama
from colorama import Fore

def color_text(txt, color) -> str:
    """Change str txt to color for terminal output
    Args:
        txt: text to change print color
        color: str color we want: ['green','cyan','magenta','red','blue']
    Return:
        txt: output text with ansi color
    """
    colorama.init()
    txt = str(txt)
    if color == "green":
        txt = Fore.GREEN + txt
    elif color == "red":
        txt = Fore.RED + txt
    elif color == "yellow":
        txt = Fore.YELLOW + txt
    elif color == "magenta":
        txt = Fore.MAGENTA + txt
    elif color == "blue":
        txt = Fore.BLUE + txt
    txt += Fore.RESET
    return txt

In [60]:
def dict_to_beautifulTable(dic, cols=[]):
    '''Converts a dict to beautifulTable
    '''
    from beautifultable import BeautifulTable 
    table = BeautifulTable()
    table.set_style(BeautifulTable.STYLE_BOX)
    if cols:
        table.column_headers = cols
    for key, val in dic.items():
        if isinstance(val, dict):
            table.append_row([key, dictToTable(val)])
        else:
            table.append_row([key, color_text(val, "yellow")])
    return table

## Summarizing any dataframe

We want:
- dimensions
- column types
- unique values of objects if not too many
- unique values of objects if not too many
- if numbers get min, max, mean, std (more ?)
- if date min and max

In [92]:
# necessary libraries
import numpy as np
import pandas as pd

def summarize_df(df, mode='dict', title=''):
    '''Function to summarize any dataframe (awesome)
    Args:
        df: dataframe we want to summarize
        mode: whether we want to print a dict, BeautifulTable or HTML table
        title: print title in summary if specified
    Return:
        summary: a dict with the summary of df
    '''
    if mode not in ['dict','table','html']:
        print("Error mode not recognized: {}".format(mode))
        return None
    
    summary = {}
    
    summary['shape'] = {'rows':df.shape[0],
                        'columns':df.shape[0]}
    #summary['columns'] = {}
    for col in df.columns:
        summary[col] = {'dtype':df[col].dtype}
        if df[col].dtype == np.object:
            uniques = df[col].unique()
            uniques_str = (', '.join(uniques[:11]), ', '.join(uniques[:8]) + ', ...')[len(uniques) > 11] 
            summary[col]['values'] = uniques_str
        elif df[col].dtype == np.int64 or df[col].dtype == np.float64:
            uniques = df[col].unique()
            uniques_str = (', '.join([str(x) for x in uniques[:11]]), ', '.join([str(x) for x in uniques[:8]]) + ', ...')[len(uniques) > 11]  
            summary[col]['values'] = uniques_str
            summary[col]['min'] = min(df[col])
            summary[col]['max'] = max(df[col])
            summary[col]['mean'] = round(df[col].mean(),4)
        elif np.issubdtype(df[col].dtype, np.datetime64):
            summary[col] = '{} -> {}'.format(min(df[col]),max(df[col]))
    
    if mode == 'dict':
        print(dict)
    elif mode == 'table':
        # make beautiful table and print 
        print(dictToTable(summary))
    elif mode == 'html':
        # make html table and print 
        pass

    return summary
    

In [27]:
from sklearn import datasets

In [31]:
iris = datasets.load_iris()
iris_df = pd.DataFrame(data= np.c_[iris['data'], iris['target']],
                     columns= iris['feature_names'] + ['target'])
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0
5,5.4,3.9,1.7,0.4,0.0
6,4.6,3.4,1.4,0.3,0.0
7,5.0,3.4,1.5,0.2,0.0
8,4.4,2.9,1.4,0.2,0.0
9,4.9,3.1,1.5,0.1,0.0


In [93]:
r = summarize_df(iris_df,mode='table')

┌───────────────────┬──────────────────────────────────────────────────────────┐
│       shape       │                    ┌─────────┬─────┐                     │
│                   │                    │  rows   │ 150 │                     │
│                   │                    ├─────────┼─────┤                     │
│                   │                    │ columns │ 150 │                     │
│                   │                    └─────────┴─────┘                     │
├───────────────────┼──────────────────────────────────────────────────────────┤
│ sepal length (cm) │ ┌────────┬─────────────────────────────────────────────┐ │
│                   │ │ dtype  │                   float64                   │ │
│                   │ ├────────┼─────────────────────────────────────────────┤ │
│                   │ │ values │ 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.4, 4.8, ... │ │
│                   │ ├────────┼─────────────────────────────────────────────┤ │
│                   │ │  min

In [77]:
vals = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s']
if len(vals) > 11:
    output = ', '.join(vals[:8]) + ', ...'
else:
    output = ', '.join(vals[:11])
print(output)

a, b, c, d, e, f, g, h, ...


In [89]:
vals = [5.1,3.1,2.5,1.3,3.5,4.3,2.3,5.6,7.6,8.7,8.9,9.3,4.5,6.4]
output = (', '.join([str(x) for x in vals[:11]]), ', '.join([str(x) for x in vals[:8]]) + ', ...')[len(vals) > 11] 
print(output)

5.1, 3.1, 2.5, 1.3, 3.5, 4.3, 2.3, 5.6, ...
