Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add custom sample limit. #72

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/infer/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ func main() {
}
fmt.Println("## Raw Table ##")
fmt.Println(tab)
sch, err := schema.Infer(tab)
sch, err := schema.Infer(tab, schema.SampleLimit(schema.SampleAllRows))
if err != nil {
panic(err)
}
Expand Down
61 changes: 52 additions & 9 deletions schema/infer.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,29 +43,51 @@ var (
noConstraints = Constraints{}
)

// Maximum number of rows used to infer schema.
const maxNumRowsInfer = 100
const (
// SampleAllRows is the sample limit that makes inference read every row of
// the table instead of stopping at a maximum. Use it as
// schema.SampleLimit(schema.SampleAllRows), an optional argument to
// schema.Infer(table.Table, ...InferOpts).
SampleAllRows = -1
// defaultMaxNumRowsInfer is the maximum number of rows sampled to infer a
// schema when no sample limit is configured; a configured limit of 0 is
// treated as "not configured" and falls back to this value.
// Override it by passing schema.SampleLimit(int) to
// schema.Infer(table.Table, ...InferOpts).
defaultMaxNumRowsInfer = 100
)

// Infer infers a schema from a slice of the tabular data. For columns that contain
// cells that can be inferred as different types, the most popular type is set as the field
// type. For instance, a column with values 10.1, 10, 10 will be inferred as being of type
// "integer".
russmack marked this conversation as resolved.
Show resolved Hide resolved
func Infer(tab table.Table) (*Schema, error) {
s, err := sample(tab)
func Infer(tab table.Table, opts ...InferOpts) (*Schema, error) {
	// Start from the zero-value configuration and let each option adjust it
	// in the order the caller supplied them; the first failing option aborts.
	var settings inferConfig
	for i := range opts {
		if err := opts[i](&settings); err != nil {
			return nil, err
		}
	}

	// Collect the rows that inference will look at, bounded by the
	// configured sample limit.
	rows, err := sample(tab, &settings)
	if err != nil {
		return nil, err
	}

	// Derive the schema from the table headers and the sampled rows.
	return infer(tab.Headers(), rows)
}

func sample(tab table.Table) ([][]string, error) {
func sample(tab table.Table, cfg *inferConfig) ([][]string, error) {
limit := defaultMaxNumRowsInfer
if cfg.sampleLimit != 0 {
limit = cfg.sampleLimit
}
iter, err := tab.Iter()
if err != nil {
return nil, err
}
var t [][]string
for count := 0; count < maxNumRowsInfer && iter.Next(); count++ {
for count := 0; iter.Next(); count++ {
t = append(t, iter.Row())
// A negative limit will continue to sample the entire table.
if limit > 0 && count == limit-1 {
break
}
}
if iter.Err() != nil {
return nil, iter.Err()
Expand All @@ -86,7 +108,7 @@ func infer(headers []string, table [][]string) (*Schema, error) {
if inferredTypes[cellIndex] == nil {
inferredTypes[cellIndex] = make(map[string]int)
}
// The list bellow must be ordered by the narrower field type.
// The list below must be ordered by the narrower field type.
t := findType(cell, orderedTypes)
inferredTypes[cellIndex][t]++
}
Expand Down Expand Up @@ -116,8 +138,14 @@ func infer(headers []string, table [][]string) (*Schema, error) {
// will be inferred as being of type "number" ("integer" can be implicitly cast to "number").
//
// For medium to big tables, this method is faster than Infer.
func InferImplicitCasting(tab table.Table) (*Schema, error) {
s, err := sample(tab)
func InferImplicitCasting(tab table.Table, opts ...InferOpts) (*Schema, error) {
cfg := &inferConfig{}
for _, opt := range opts {
if err := opt(cfg); err != nil {
return nil, err
}
}
s, err := sample(tab, cfg)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -209,3 +237,18 @@ func findType(value string, checkOrder []string) string {
}
return StringType
}

// InferOpts is a functional option that adjusts how a schema is inferred.
// Options are applied to the inference configuration in the order they are
// passed; an option that returns a non-nil error aborts inference.
type InferOpts func(c *inferConfig) error

// inferConfig collects the settings that InferOpts values can modify.
type inferConfig struct {
	// sampleLimit is the requested maximum number of rows to sample.
	// Zero means "use the default limit"; a negative value means "sample
	// every row".
	sampleLimit int
}

// SampleLimit returns an option that sets the maximum number of rows sampled
// for inference. A limit of 0 keeps the default maximum, and a negative limit
// (for example SampleAllRows) samples the entire table. The returned option
// never fails.
func SampleLimit(limit int) InferOpts {
	apply := func(cfg *inferConfig) error {
		cfg.sampleLimit = limit
		return nil
	}
	return apply
}
50 changes: 50 additions & 0 deletions schema/infer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,56 @@ func ExampleInferImplicitCasting() {
// {Name:Height Type:number Format:default}
}

func TestInferSampleLimit(t *testing.T) {
data := []struct {
desc string
sampleLimit int
headers []string
table [][]string
want int
}{
{"SampleZero", 0, []string{"Age"}, [][]string{[]string{"1"}, []string{"2"}, []string{"3"}}, 3},
{"SampleOne", 1, []string{"Age"}, [][]string{[]string{"1"}, []string{"2"}, []string{"3"}}, 1},
{"SampleTwo", 2, []string{"Age"}, [][]string{[]string{"1"}, []string{"2"}, []string{"3"}}, 2},
{"SampleThree", 3, []string{"Age"}, [][]string{[]string{"1"}, []string{"2"}, []string{"3"}}, 3},
{"SampleTen", 10, []string{"Age"}, [][]string{[]string{"1"}, []string{"2"}, []string{"3"}}, 3},
{"SampleMinusOne", -1, []string{"Age"}, [][]string{[]string{"1"}, []string{"2"}, []string{"3"}}, 3},
{"SampleMinusTen", -10, []string{"Age"}, [][]string{[]string{"1"}, []string{"2"}, []string{"3"}}, 3},
{"SampleEmptyZero", 0, []string{"Age"}, [][]string{}, 0},
{"SampleEmptyOne", 1, []string{"Age"}, [][]string{}, 0},
{"SampleEmptyMinusTen", -10, []string{"Age"}, [][]string{}, 0},
}
for _, d := range data {
t.Run(d.desc, func(t *testing.T) {
is := is.New(t)
s, err := sample(table.FromSlices(d.headers, d.table), &inferConfig{sampleLimit: d.sampleLimit})
is.NoErr(err)

is.Equal(len(s), d.want)
})
}
t.Run("LimitNotSpecified", func(t *testing.T) {
data := []struct {
desc string
headers []string
table [][]string
want int
}{
{"SampleDefault", []string{"Age"}, [][]string{[]string{"1"}, []string{"2"}, []string{"3"}}, 3},
{"SampleDefault", []string{"Age"}, [][]string{}, 0},
}
for _, d := range data {
t.Run(d.desc, func(t *testing.T) {
is := is.New(t)
s, err := sample(table.FromSlices(d.headers, d.table), &inferConfig{})
is.NoErr(err)

is.Equal(len(s), d.want)
})
}
})
}

func TestInfer(t *testing.T) {
data := []struct {
desc string
Expand Down