# Known issues with type detection

https://pandas.pydata.org/docs/user_guide/basics.html#basics-dtypes
https://pandas.pydata.org/docs/user_guide/integer_na.html#integer-na

For now, we do something incredibly naive: we use the inferred pandas dtype directly. This has some obvious quirks:

- By default, if there are missing values in an int column, it will be cast to float64 instead of a nullable integer type such as `Int32Dtype`. This means that inferring the type directly from the column dtype is unreliable when values are missing. See https://stackoverflow.com/questions/21287624/convert-pandas-column-containing-nans-to-dtype-int

- The object dtype is used both for pure string columns and for mixed-type columns. This means one cannot reliably detect string types. We do something to try to work around this for small datasets.



In [1]:
import pandas as pd
import numpy as np

In [2]:
# Shape of the synthetic dataset: how many columns to create for each dtype
# and how many rows every column holds.
N_cols_per_type = 4
N_per_col = 1000


def _column_names(prefix):
    """Return the labels ``<prefix>_0`` .. ``<prefix>_{N_cols_per_type - 1}``."""
    return [f"{prefix}_{idx}" for idx in range(N_cols_per_type)]


# Random integers drawn from [0, 100).
_int_values = np.random.randint(0, 100, size=(N_per_col, N_cols_per_type))
ints_df = pd.DataFrame(_int_values, columns=_column_names("int"))

# Random floats drawn from [0, 1).
_float_values = np.random.random(size=(N_per_col, N_cols_per_type))
floats_df = pd.DataFrame(_float_values, columns=_column_names("float"))

# Random words sampled with replacement from a fixed four-word vocabulary.
_str_values = np.random.choice(
    ["eeny", "meeny", "miny", "moe"],
    size=(N_per_col, N_cols_per_type),
)
strings_df = pd.DataFrame(_str_values, columns=_column_names("str"))

# Put the three frames side by side; they all share the default row index.
full_df = pd.concat([ints_df, floats_df, strings_df], axis="columns")

In [3]:
# Preview the first rows of the combined frame (ints, floats and strings side by side).
full_df.head()

Unnamed: 0,int_0,int_1,int_2,int_3,float_0,float_1,float_2,float_3,str_0,str_1,str_2,str_3
0,94,66,45,73,0.879347,0.771386,0.670628,0.824132,miny,miny,eeny,miny
1,57,58,75,32,0.84925,0.218199,0.576388,0.154711,moe,meeny,eeny,moe
2,31,69,82,15,0.920094,0.470284,0.61351,0.801468,meeny,miny,miny,moe
3,56,27,71,52,0.431131,0.806534,0.340942,0.209477,eeny,eeny,miny,miny
4,50,4,3,1,0.685177,0.69236,0.926932,0.409046,eeny,moe,meeny,miny


In [4]:
# Fraction of rows to blank out in one representative column of each dtype.
null_targets = {"int_0": 0.1, "float_0": 0.2, "str_0": 0.3}

for column, fraction in null_targets.items():
    n_missing = int(N_per_col * fraction)
    # Sample distinct row labels so that exactly n_missing cells become null.
    missing_rows = np.random.choice(
        range(N_per_col),
        size=n_missing,
        replace=False,
    )
    full_df.loc[missing_rows, column] = None

In [5]:
# Display the frame again: cells assigned None now show up as missing values.
full_df

Unnamed: 0,int_0,int_1,int_2,int_3,float_0,float_1,float_2,float_3,str_0,str_1,str_2,str_3
0,94.0,66,45,73,0.879347,0.771386,0.670628,0.824132,,miny,eeny,miny
1,,58,75,32,0.849250,0.218199,0.576388,0.154711,,meeny,eeny,moe
2,31.0,69,82,15,0.920094,0.470284,0.613510,0.801468,,miny,miny,moe
3,56.0,27,71,52,0.431131,0.806534,0.340942,0.209477,,eeny,miny,miny
4,50.0,4,3,1,0.685177,0.692360,0.926932,0.409046,eeny,moe,meeny,miny
...,...,...,...,...,...,...,...,...,...,...,...,...
995,,21,0,76,0.117591,0.482974,0.581569,0.770461,eeny,miny,moe,miny
996,16.0,5,79,43,0.670093,0.212384,0.665407,0.601058,,eeny,moe,meeny
997,84.0,94,75,26,0.389107,0.939397,0.391561,0.996783,miny,eeny,miny,moe
998,80.0,62,51,3,0.900197,0.259002,0.297772,0.595164,eeny,moe,moe,miny


In [6]:
# Inspect the dtype pandas reports for each column; compare int_0 (which now
# contains missing values) against the untouched int_1..int_3 columns.
full_df.dtypes

int_0      float64
int_1        int64
int_2        int64
int_3        int64
float_0    float64
float_1    float64
float_2    float64
float_3    float64
str_0       object
str_1       object
str_2       object
str_3       object
dtype: object

In [7]:
from monitor.generators import gen_constraints, gen_statistics

In [9]:
# Generate constraints for the frame; check the inferred_type reported for
# int_0 versus the other int columns in the output.
gen_constraints(full_df)

Constraints(features=[Feature(name='int_0', inferred_type=<FeatureType.FRACTIONAL: 'fractional'>, completeness=0.9, num_constraints=NumericalConstraints(is_non_negative=True), string_constraints=None, monitoringConfigOverrides=None), Feature(name='int_1', inferred_type=<FeatureType.INTEGRAL: 'integral'>, completeness=1.0, num_constraints=NumericalConstraints(is_non_negative=True), string_constraints=None, monitoringConfigOverrides=None), Feature(name='int_2', inferred_type=<FeatureType.INTEGRAL: 'integral'>, completeness=1.0, num_constraints=NumericalConstraints(is_non_negative=True), string_constraints=None, monitoringConfigOverrides=None), Feature(name='int_3', inferred_type=<FeatureType.INTEGRAL: 'integral'>, completeness=1.0, num_constraints=NumericalConstraints(is_non_negative=True), string_constraints=None, monitoringConfigOverrides=None), Feature(name='float_0', inferred_type=<FeatureType.FRACTIONAL: 'fractional'>, completeness=0.8, num_constraints=NumericalConstraints(is_non_ne

In [10]:
# Generate statistics for the same frame; the per-feature inferred_type and
# the num_present / num_missing counts appear in the output.
gen_statistics(full_df)

Statistics(dataset=Dataset(item_count=1000), version=0, features=[Feature(name='int_0', inferred_type=<FeatureType.FRACTIONAL: 'fractional'>, numerical_statistics=NumericalStatistics(common=CommonStatistics(num_present=900, num_missing=100), mean=47.80444444444444, sum=43024.0, std_dev=30.10348451595715, min=0.0, max=99.0, distribution=None), string_statistics=None), Feature(name='int_1', inferred_type=<FeatureType.INTEGRAL: 'integral'>, numerical_statistics=NumericalStatistics(common=CommonStatistics(num_present=1000, num_missing=0), mean=48.73, sum=48730, std_dev=28.693176861379406, min=0, max=99, distribution=None), string_statistics=None), Feature(name='int_2', inferred_type=<FeatureType.INTEGRAL: 'integral'>, numerical_statistics=NumericalStatistics(common=CommonStatistics(num_present=1000, num_missing=0), mean=51.777, sum=51777, std_dev=28.563703474097267, min=0, max=99, distribution=None), string_statistics=None), Feature(name='int_3', inferred_type=<FeatureType.INTEGRAL: 'integ