In [35]:
import tenzing
from tenzing.core.models import model_relation
from tenzing.core.typesets import tenzing_typeset, tenzingTypeset
from tenzing.core.model_implementations import *
from collections import Counter
import pandas as pd
import numpy as np
from shapely import wkt
import datetime
from IPython.core.display import display, HTML
import networkx as nx
import itertools
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [36]:
def build_relation_graph(root_nodes, derivative_nodes):
    relation_graph = nx.DiGraph()
    relation_graph.add_node('root')
    relation_graph.add_nodes_from(root_nodes)
    relation_graph.add_edges_from(itertools.product(['root'], root_nodes))
    relation_graph.add_nodes_from(derivative_nodes)
    relation_graph.add_edges_from(node.edge for s_node in root_nodes for to_node, node in s_node.relations.items())
    relation_graph.add_edges_from(node.edge for s_node in derivative_nodes for to_node, node in s_node.relations.items())

    cycles = list(nx.simple_cycles(relation_graph))
    assert len(cycles) == 0, f'Cyclical relations between types {cycles} detected'
    return relation_graph

root_nodes = [tenzing_bool, tenzing_float, tenzing_object,
              tenzing_complex, tenzing_categorical, tenzing_timestamp,
              tenzing_integer]
derivative_types = [tenzing_string]

G = build_relation_graph(root_types, derivative_types)

def traverse_relation_graph(series, G, node='root'):
    successors = list(G.successors(node))
    if not successors:
        return node

    for tenz_type in successors:
        if series in tenz_type:
            return traverse_relation_graph(series, G, tenz_type)

series = pd.Series([2.0 * i for i in range(3)])
traverse_relation_graph(series, G)

tenzing_integer

In [37]:
series

0    0.0
1    2.0
2    4.0
dtype: float64

In [38]:
from tenzing.core.model_implementations.typesets import tenzing_standard

df = pd.DataFrame({'Col 1': range(3), 
                   'Col 2': [2 * i for i in range(3)], 
                   'Col 3': ['howdy', 'howdy', 'doody'],
                   'times': pd.to_datetime([datetime.date(2011,1,1), datetime.date(2012, 1, 1), datetime.date(2013,1,1)])})
x = tenzing_standard()
x.prep(df)
summer = x.summarize(df)


In [40]:
x = tenzing_standard()
x.prep(df)
y = x.summary_report(df)
y

0,1
Number of Observations,3
Number of Variables,4

0,1
tenzing_integer,2
tenzing_object,1
tenzing_timestamp,1

0,1
nunique,3.0
mean,1.0
std,1.0
max,2.0
min,0.0
median,1.0
n_records,3.0
n_zeros,1.0
perc_zeros,0.33
na_count,0.0

0,1
nunique,3.0
mean,2.0
std,2.0
max,4.0
min,0.0
median,2.0
n_records,3.0
n_zeros,1.0
perc_zeros,0.33
na_count,0.0

0,1
nunique,2
frequencies,"{'howdy': 2, 'doody': 1}"
n_records,3
na_count,0
perc_na,0.0

0,1
nunique,3
min,2011-01-01 00:00:00
max,2013-01-01 00:00:00
n_records,3
perc_unique,1.0
range,731 days 00:00:00
na_count,0
perc_na,0.0


In [None]:

from pandas_profiling import ProfileReport
ProfileReport(df)


In [None]:
types = tenzing_standard().types
series = df['Col 3']
[tenzing_type for tenzing_type in types if series in tenzing_type]

In [None]:
x.relation_map

In [None]:
from tenzing.core.models import tenzing_model

type(tenzing_model)

In [None]:
tenzing_integer

In [None]:
import pandas as pd
import numpy as np
x = pd.Series([1,2,3, np.nan])

print(x in tenzing_integer)
print(x in tenzing_float)

In [None]:
int_float_relation = model_relation(tenzing_float, lambda x: False, lambda x: False)
tenzing_float.register_relation(int_float_relation)

In [3]:
import numpy as np

x = pd.Series(list(range(10000)))
%timeit x.astype(np.float)
%timeit x.astype(float)

103 µs ± 9.1 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
99.5 µs ± 4.77 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [4]:
x.astype(int).nbytes

80000

In [5]:
test_series = pd.Series([pd.datetime(2010, 1, 1), pd.datetime(2010, 8, 2), pd.datetime(2011, 2, 1), np.nan])
tenzing_timestamp.summarize(test_series)

{'nunique': 3,
 'min': Timestamp('2010-01-01 00:00:00'),
 'max': Timestamp('2011-02-01 00:00:00'),
 'n_records': 4,
 'perc_unique': 1.0,
 'range': Timedelta('396 days 00:00:00'),
 'na_count': 1,
 'perc_na': 0.25}

In [6]:
import networkx as nx


In [9]:
class type_map:
    def __init__(self, name, to_node=False):
        self.name = name
        self.to_node = to_node

A = type_map('A', 'B')
B = type_map('B', 'C')
C = type_map('C', 'D')
D = type_map('D')
E = type_map('E', 'A')

types = [A, B, C, D, E]

my_graph = nx.DiGraph()
my_graph.add_node('root')
for typ in types:
    my_graph.add_node(typ.name)

for typ in types:
    if typ.to_node:
        my_graph.add_edge(typ.name, typ.to_node)

my_graph.add_edge('root', 'A')
my_graph.add_edge('root', 'E') 
    
list(my_graph.successors('root'))

['A', 'E']

In [10]:
my_graph = nx.DiGraph()
my_graph.add_nodes_from(types)
my_graph.add_edes_from

AttributeError: 'DiGraph' object has no attribute 'add_edes_from'

In [11]:
import itertools
list(itertools.product(types, 'A'))

[(<__main__.type_map at 0xa1609a2b0>, 'A'),
 (<__main__.type_map at 0xa1609a320>, 'A'),
 (<__main__.type_map at 0xa1609a358>, 'A'),
 (<__main__.type_map at 0xa1609a278>, 'A'),
 (<__main__.type_map at 0xa1609a390>, 'A')]

In [12]:
list(nx.simple_cycles(my_graph))

[]

In [13]:
root_types = [tenzing_bool, tenzing_float, tenzing_object,
                      tenzing_complex, tenzing_categorical, tenzing_timestamp,
                      tenzing_integer]

In [14]:
[node.edge for s_node in root_types for to, node in s_node.relations.items()]

[(tenzing_float, tenzing_integer)]

In [15]:
root_types[-1].relations.items()

dict_items([(tenzing_float, <tenzing.core.models.model_relation object at 0x113fdbd30>)])

In [16]:
root_types

[tenzing_bool,
 tenzing_float,
 tenzing_object,
 tenzing_complex,
 tenzing_categorical,
 tenzing_timestamp,
 tenzing_integer]