# Tablextract insights

## Available features

In [56]:
from tablextract import PROPERTY_KINDS

# One group per feature kind; names containing '-variability-' are left out of the listing.
for kind_name, feature_names in PROPERTY_KINDS.items():
    listed = [name for name in feature_names if '-variability-' not in name]
    print('%s:\n\t- %s' % (kind_name, '\n\t- '.join(listed)))

style:
	- background-color-b
	- background-color-g
	- background-color-r
	- border-bottom-color-b
	- border-bottom-color-g
	- border-bottom-color-r
	- border-bottom-width
	- border-left-color-b
	- border-left-color-g
	- border-left-color-r
	- border-left-width
	- border-right-color-b
	- border-right-color-g
	- border-right-color-r
	- border-right-width
	- border-top-color-b
	- border-top-color-g
	- border-top-color-r
	- border-top-width
	- color-b
	- color-g
	- color-r
	- display
	- font-family
	- font-size
	- font-weight
	- outline-color-b
	- outline-color-g
	- outline-color-r
	- padding-bottom
	- padding-left
	- padding-right
	- padding-top
	- text-align
	- text-decoration
	- text-transform
	- vertical-align
syntax:
	- density-alphanumeric
	- density-digit
	- density-lowercase
	- density-stopwords
	- density-symbol
	- density-token
	- density-uppercase
	- density-whitespace
	- match-allcaps
	- match-amount
	- match-capitalised
	- match-date
	- match-empty
	- match-money
	- match-rang

## Normalisation ranges for unbounded features

In [57]:
from tablextract import NUMERIC_STYLE_PROPERTIES

# Each entry maps a property name to a (minimum, range) pair; the interval
# printed for it is [minimum, minimum + range].
for prop_name, bounds in NUMERIC_STYLE_PROPERTIES.items():
    lower, span = bounds
    upper = lower + span
    print('%s: [%d, %d]' % (prop_name.rjust(20), lower, upper))

 border-bottom-width: [-40, 40]
   border-left-width: [-40, 40]
  border-right-width: [-40, 40]
    border-top-width: [-40, 40]
           font-size: [0, 60]
      padding-bottom: [-40, 40]
        padding-left: [-40, 40]
       padding-right: [-40, 40]
         padding-top: [-40, 40]
         font-weight: [400, 700]


## Existing functions

In [58]:
from tablextract import FUNCTIONS

# Bullet list with one entry per value stored in FUNCTIONS.
function_bullets = '\n- '.join(FUNCTIONS.values())
print('- ' + function_bullets)

- empty
- data
- metadata
- context
- decorator
- total
- indexer
- factorised


## Orientations

In [59]:
from tablextract import ORIENTATIONS

# Bullet list with one entry per orientation name.
orientation_bullets = '\n- '.join(ORIENTATIONS)
print('- ' + orientation_bullets)

- row
- col
- tab


## Text of a table

In [60]:
from tablextract import tables

# tables() is given the article URL; only the first table it returns is kept.
# `table` is reused by the cells that follow.
table = tables('https://en.wikipedia.org/wiki/Albedo')[0]

print('ORIGINAL TABLE')
for row in table.texts:
    # Cells are padded (never truncated) to 20 characters and separated by a space.
    print(*[text.ljust(20) for text in row])
print()

ORIGINAL TABLE
Surface              Typical albedo      
Fresh asphalt        0.04                
Open ocean           0.06                
Worn asphalt         0.12                
Conifer forest (Summer) 0.08, 0.09 to 0.15  
Deciduous trees      0.15 to 0.18        
Bare soil            0.17                
Green grass          0.25                
Desert sand          0.40                
New concrete         0.55                
Ocean ice            0.5–0.7             
Fresh snow           0.80                



## Features of a table

In [61]:
# Print one matrix per feature: the feature name in upper case, then one line
# per table row holding that feature's value for every cell of the row.
# The feature names are read from the first cell (row 0, column 0).
for k in table.features[0][0].keys():
    print(k.upper())
    for row in table.features:
        # String-valued features are printed verbatim; anything else is
        # formatted as a number with four decimals. isinstance() is used
        # instead of an exact type comparison so str subclasses are also
        # treated as text.
        print(' '.join(
            cell[k] if isinstance(cell[k], str) else '%.4f' % cell[k]
            for cell in row
        ))
    print()

COLOR-R
0.1333 0.1333
0.1333 0.1333
0.1333 0.1333
0.1333 0.1333
0.1333 0.1333
0.0750 0.1333
0.1333 0.1333
0.1333 0.1333
0.1333 0.1333
0.1333 0.1333
0.1333 0.1333
0.1333 0.1333

COLOR-G
0.1333 0.1333
0.1333 0.1333
0.1333 0.1333
0.1333 0.1333
0.1333 0.1333
0.2062 0.1333
0.1333 0.1333
0.1333 0.1333
0.1333 0.1333
0.1333 0.1333
0.1333 0.1333
0.1333 0.1333

COLOR-B
0.1333 0.1333
0.1333 0.1333
0.1333 0.1333
0.1333 0.1333
0.1333 0.1333
0.4227 0.1333
0.1333 0.1333
0.1333 0.1333
0.1333 0.1333
0.1333 0.1333
0.1333 0.1333
0.1333 0.1333

BACKGROUND-COLOR-R
0.9176 0.9176
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000

BACKGROUND-COLOR-G
0.9255 0.9255
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000

BACKGROUND-COLOR-B
0.9412 0.9412
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.00

0.5000 0.5000
0.5000 0.5000

ROW-VARIABILITY-LAST-CHAR-UPPERCASE
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000

ROW-VARIABILITY-FIRST-CHAR-ALPHANUMERIC
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000

ROW-VARIABILITY-LAST-CHAR-ALPHANUMERIC
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.5000 0.5000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000

ROW-VARIABILITY-FIRST-CHAR-DIGIT
0.0000 0.0000
0.5000 0.5000
0.5000 0.5000
0.5000 0.5000
0.5000 0.5000
0.5000 0.5000
0.5000 0.5000
0.5000 0.5000
0.5000 0.5000
0.5000 0.5000
0.5000 0.5000
0.5000 0.5000

ROW-VARIABILITY-LAST-CHAR-DIGIT
0.0000 0.0000
0.5000 0.5000
0.5000 0.5000
0.5000 0.5000
0.5000 0.5000
0.5000 0.5000
0.5000 0.5000
0.5000 0.5000
0.5

0.0000 0.0000

TAB-VARIABILITY-LENGTH
0.0521 0.1771
0.1771 0.3229
0.1771 0.3229
0.1771 0.3229
0.1771 0.1771
0.1771 0.1771
0.1771 0.3229
0.1771 0.3229
0.1771 0.3229
0.1771 0.3229
0.1771 0.0521
0.1771 0.3229

TAB-VARIABILITY-FIRST-CHAR-LOWERCASE
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000

TAB-VARIABILITY-LAST-CHAR-LOWERCASE
0.5000 0.5000
0.5000 0.5000
0.5000 0.5000
0.5000 0.5000
0.5000 0.5000
0.5000 0.5000
0.5000 0.5000
0.5000 0.5000
0.5000 0.5000
0.5000 0.5000
0.5000 0.5000
0.5000 0.5000

TAB-VARIABILITY-FIRST-CHAR-UPPERCASE
0.4583 0.4583
0.4583 0.5417
0.4583 0.5417
0.4583 0.5417
0.4583 0.5417
0.4583 0.5417
0.4583 0.5417
0.4583 0.5417
0.4583 0.5417
0.4583 0.5417
0.4583 0.5417
0.4583 0.5417

TAB-VARIABILITY-LAST-CHAR-UPPERCASE
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000
0.0000 0.0000


## Functions of a table

In [62]:
print('ORIGINAL TABLE')
# Each cell of table.functions is a key into FUNCTIONS; the looked-up name is
# centred in an 8-character column.
for function_row in table.functions:
    labels = []
    for function_key in function_row:
        labels.append(FUNCTIONS[function_key].center(8))
    print(' '.join(labels))
print()

ORIGINAL TABLE
metadata metadata
  data     data  
  data     data  
  data     data  
  data     data  
  data     data  
  data     data  
  data     data  
  data     data  
  data     data  
  data     data  
  data     data  



## Kind of a table

In [63]:
# `kind` attribute of the extracted table; as the cell's last expression, its value is shown as the cell output.
table.kind

'horizontal listing'

## Context of a table

In [64]:
# One line per context entry: the key padded to 12 characters, then its value.
for context_key, context_value in table.context.items():
    print(context_key.ljust(12), context_value)

h_2          Terrestrial albedo[edit]
h_1          Albedo
text_before  Terrestrial albedo[edit]
text_after   Sample albedos


## Variabilities of a table

In [65]:
# One line per variability entry: a label padded to 16 characters followed by
# the value with four decimals.
for k, v in table.variabilities.items():
    # Tuple keys are shown as a dotted path; any other key is used as is.
    # isinstance() replaces the exact type comparison, and the label gets its
    # own name instead of rebinding the loop variable.
    label = '.'.join(k) if isinstance(k, tuple) else k
    print('%s %.4f' % (label.ljust(16), v))

row              1.4742
row.style        0.0957
row.syntax       0.4013
row.structural   0.2561
row.semantic     0.3996
col              0.8709
col.style        0.4536
col.syntax       0.1619
col.structural   0.3163
col.semantic     0.1693
tab              1.7294
tab.style        0.4508
tab.syntax       0.4367
tab.structural   0.4276
tab.semantic     0.4311


## Record of a table

In [66]:
from pprint import pprint

# Pretty-print the table's `record` attribute.
pprint(table.record)

[{'Surface': 'Fresh asphalt', 'Typical albedo': '0.04'},
 {'Surface': 'Open ocean', 'Typical albedo': '0.06'},
 {'Surface': 'Worn asphalt', 'Typical albedo': '0.12'},
 {'Surface': 'Conifer forest (Summer)', 'Typical albedo': '0.08, 0.09 to 0.15'},
 {'Surface': 'Deciduous trees', 'Typical albedo': '0.15 to 0.18'},
 {'Surface': 'Bare soil', 'Typical albedo': '0.17'},
 {'Surface': 'Green grass', 'Typical albedo': '0.25'},
 {'Surface': 'Desert sand', 'Typical albedo': '0.40'},
 {'Surface': 'New concrete', 'Typical albedo': '0.55'},
 {'Surface': 'Ocean ice', 'Typical albedo': '0.5–0.7'},
 {'Surface': 'Fresh snow', 'Typical albedo': '0.80'}]


## Score of the table extraction

In [67]:
# table.score is scaled by 100 and shown as a percentage with two decimals.
score_percent = 100 * table.score
print('%.2f %%' % score_percent)

93.06 %
