In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
from datetime import datetime as dt
import os
import gc
import string
import warnings
warnings.filterwarnings('ignore')

In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
#set cell width as percentage of window
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:

def my_timer(orig_func):
    import time

    @wraps(orig_func)
    def wrapper(*args, **kwargs):
        t1 = time.time()
        result = orig_func(*args, **kwargs)
        t2 = time.time() - t1
        print('{} ran in: {} sec'.format(orig_func.__name__, t2))
        return result

    return wrapper

In [None]:
def my_logger(orig_func):
    import logging
    logging.basicConfig(filename='{}.log'.format(orig_func.__name__), level=logging.INFO)

    @wraps(orig_func)
    def wrapper(*args, **kwargs):
        logging.info(
            'Ran with args: {}, and kwargs: {}'.format(args, kwargs))
        return orig_func(*args, **kwargs)

    return wrapper



In [None]:
#function to parallel compute dataframe operations

def parallelize_dataframe(df, func, n_cores=4):
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

In [1]:
def try_div_itr(itr):
    for elem in itr:
        try:
             yield 1 / elem
        except ZeroDivisionError:
             pass

result = list(try_div_itr([-2, -1, 0, 1, 2]))
result

[-0.5, -1.0, 1.0, 0.5]

In [None]:
def try_itr(func, itr, *exceptions, **kwargs):
    for elem in itr:
        try:
            yield func(elem, **kwargs)
        except exceptions:
            pass

In [None]:
eggs = (1,3,0,3,2)
[1/egg for egg in eggs]#cause division by 0 error
def catch(func, handle=lambda e : e, *args, **kwargs):
    try:
        return func(*args, **kwargs)
    except Exception as e:
        return handle(e)
[catch(lambda : 1/egg) for egg in eggs]
[1, 0, ('integer division or modulo by zero'), 0, 0]

In [3]:
eggs = (1,3,0,3,2)
#[1/egg for egg in eggs]#cause division by 0 error
def catch(func, handle=lambda e : e, *args, **kwargs):
    try:
        return func(*args, **kwargs)
    except Exception as e:
        return handle(e)
[catch(lambda : 1/egg) for egg in eggs]
[1, 0, ('integer division or modulo by zero'), 0, 0]

[1, 0, 'integer division or modulo by zero', 0, 0]

In [None]:
# fastest method to flatten list of arrays
chain = itertools.chain.from_iterable([[1,2],[3],[5,89],[],[6]])
print(list(chain))
[1, 2, 3, 5, 89, 6]

In [None]:
#expand to any dtype
def check_int(value):
    try:
        int(value)
        return np.NaN
    except ValueError:
        return value
#use in list comprehension to convert dytype == True to nan
#dropna on resulting dataframe to find all errors in dtype

In [None]:
def reduce_memory_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df.loc[:,col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df.loc[:,col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df.loc[:,col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df.loc[:,col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df.loc[:,col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df.loc[:,col] = df[col].astype(np.float32)
                else:
                    df.loc[:,col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
def strp_rename_df(df):
    df=df.apply(lambda x: x.strip() if type(x)==str else x).rename(columns=lambda x: x.strip())
    return df

In [None]:
def strp_by_col(df,col):#col is with string notation
    df[col]=df[col].apply(lambda x: x.strip() if type(x)==str else x)
    return df

In [None]:
def unique_values(series):
    print(series.nunique())
    print(series.value_counts())
    print(series.value_counts().sum())
    print(len(series))
    return

In [None]:
ltr_chars = string.ascii_letters
punct_chars = string.punctuation
punct_chars_dollars = punct_chars[:13]+punct_chars[15:]
def clean_ints(entry,ltr_chars=ltr_chars,punct_chars=punct_chars):
    proxies = ['!' for ltr in ltr_chars]
    proxy_chars = ''.join(map(str,proxies))
    puncts = ['!' for p in punct_chars]
    proxy_puncts = ''.join(map(str,puncts))
    ltr_cleaner = str.maketrans(ltr_chars,proxy_chars)
    entry_ltr = entry.translate(ltr_cleaner)
    punct_cleaner = str.maketrans(punct_chars,proxy_puncts)
    entry_clean = entry_ltr.translate(punct_cleaner)
    entry_clean = entry_clean.replace('!','')
    return entry_clean

In [None]:
def str_to_int(db,col):#col is WITH string notation
    db[col]=db[col].apply(lambda x: x.strip() if type(x)==str else x)
    db[col]=db[col].apply(lambda x: x.lstrip('0') if type(x)==str else x)
    db[col]=db[col].apply(lambda x: x.rstrip() if type(x)==str else x)
    db[col]=db[col].apply(lambda x: clean_ints(x) if type(x)==str else x)
    db[col]=db[col].apply(lambda x: x if type(x)==int else int(x) if x.isdigit() else 0)
    #db.dropna(inplace=True)
    return db

In [None]:
ltr_chars = string.ascii_letters
punct_chars = string.punctuation
punct_chars_dollars = punct_chars[:13]+punct_chars[15:]
def clean_money(entry,ltr_chars=ltr_chars,punct_chars=punct_chars_dollars):
    entry=str(entry)
    proxies = ['!' for ltr in ltr_chars]
    proxy_chars = ''.join(map(str,proxies))
    puncts = ['!' for p in punct_chars]
    proxy_puncts = ''.join(map(str,puncts))
    ltr_cleaner = str.maketrans(ltr_chars,proxy_chars)
    entry_ltr = entry.translate(ltr_cleaner)
    punct_cleaner = str.maketrans(punct_chars,proxy_puncts)
    entry_clean = entry_ltr.translate(punct_cleaner)
    entry_clean = entry_clean.replace('!','')
    return entry_clean

In [None]:
def str_to_money(db,col):#col is with string notation
    db[col]=db[col].apply(lambda x: x.strip() if type(x)==str else x)
    db[col]=db[col].apply(lambda x: x.lstrip('0') if type(x)==str else x)
    db[col]=db[col].apply(lambda x: x.rstrip() if type(x)==str else x)
    db[col]=db[col].apply(lambda x: clean_money(x))
    db[col]=db[col].apply(lambda x: pd.to_numeric(x,downcast='float'))
    return db

In [None]:
ltr_chars = string.ascii_letters
punct_chars = string.punctuation
punct_chars_date = punct_chars[:6]+punct_chars[7:12]+punct_chars[15:]
def clean_date(entry,ltr_chars=ltr_chars,punct_chars=punct_chars_date):
    proxies = ['!' for ltr in ltr_chars]
    proxy_chars = ''.join(map(str,proxies))
    puncts = ['!' for p in punct_chars]
    proxy_puncts = ''.join(map(str,puncts))
    ltr_cleaner = entry.maketrans(ltr_chars,proxy_chars)
    entry_ltr = entry.translate(ltr_cleaner)
    punct_cleaner = entry_ltr.maketrans(punct_chars,proxy_puncts)
    entry_clean = entry_ltr.translate(punct_cleaner)
    entry_clean = entry_clean.replace('!','')
    return entry_clean
    

In [None]:
def date_format_MN(db,name):
    db[name]=db[name].apply(lambda x: x.replace('.0 AM',''))
    db[name]=db[name].apply(lambda x: x.replace('.0 PM',''))
    db[name]=db[name].apply(lambda x: x.rstrip())
    db[name]=db[name].apply(lambda x: x.lstrip())
    db=db[db[name]!='?']
    db[name]=db[name].apply(lambda x: dt.strptime(str(x),'%b %d, %Y %I:%M:%S'))
    db[name]=db[name].apply(lambda x: x.strftime('%m/%d/%Y'))
    return db

In [None]:
def date_format_CRM(db,name):
    db[name]=db[name].apply(lambda x: x.rstrip() if x != '?' else x)
    db[name]=db[name].apply(lambda x: x.lstrip() if x != '?' else x)
    db[name]=db[name].apply(lambda x: dt.strptime(str(x),'%m/%d/%Y') if x != '?' else x)
    db[name]=db[name].apply(lambda x: x.strftime('%m/%d/%Y') if x != '?' else x)
    return db

In [None]:
def date_format_CRM_date(db,name):
    db[name]=db[name].apply(lambda x: x.rstrip() if x != '?' else x)
    db[name]=db[name].apply(lambda x: x.lstrip() if x != '?' else x)
    db[name]=db[name].apply(lambda x: dt.strptime(str(x),'%b %d, %Y') if x != '?' else x)
    db[name]=db[name].apply(lambda x: x.strftime('%m/%d/%Y') if x != '?' else x)
    return db

In [None]:
def date_format_line(db,name):
    db[name]=db[name].apply(lambda x: x.rstrip())
    db[name]=db[name].apply(lambda x: x.lstrip())
    db[name]=db[name].apply(lambda x: clean_date(x))
    db[name]=db[name].apply(lambda x: dt.strptime(str(x),'%Y%m%d').strftime('%m/%d/%Y') if x != '' else x)
    db[name]=db[name].apply(lambda x: pd.to_datetime(str(x), format='%m/%d/%Y') if x != '' else x)
    return db