In [1]:
#Import library
import ipywidgets as widgets
from IPython.display import HTML
from  data_highlight.highlighter import  HighlightedFile

from re import finditer

## Find a sample file

This mock file contains two different types of data. One is a set of time-stamped measurements of direction, speed and engine temperature:

| date | time | vehicle | color | direction | speed | temperature |
|------|------|---------|-------|-----------|-------|-------------|
|951212|050000|MONDEO_44|  @C   |   269.7   |  2.0  |    10       |

Interspersed in these measurements are some time-stamped events:

| marker | type | date | time | event name |
|--------|------|------|------|------------|
|//      |EVENT |951212|050300|  BRAVO     |

Let's load such a file, then look at the contents:

In [3]:
# Read the sample file inside a context manager so the handle is closed
# even if reading raises, then show the raw contents.
with open("data_highlight/file.txt") as text_file:
    file_content = text_file.read()
print(file_content)

951212 050000.000 MONDEO_44   @C   269.7   2.0      10
// EVENT 951212 050300.000 BRAVO
// EVENT 951212 050300.000 CHARLIE
951212 050300.000 FORD_11   @C   354.7   2.1      14
951212 050200.000 COROLLA_44   @C   177.9   3.1      15
// EVENT 951212 050300.000 DELTA
951212 050300.000 COROLLA_44   @C   200   3.1      15



## Try out a simple event importer

We're going to parse this datafile, so it can be inserted into a database.  But, we're also going to keep track of how the file was tokenized, how those tokens were parsed, and how the parsed data was interpreted.

**Note:** the code to actually insert data into a database isn't present in this file, since that's a capability that is independent of this library.

So, this library is going to provide us with tools to loop through the lines and tokens, and then to record the fact that data was extracted from a token.

To start with, we'll look at those time-stamped events.

In [17]:
# create the self-highlighter
dataFile = HighlightedFile('data_highlight/file.txt')

# name under which every usage in this cell is recorded; it is the same
# for every line, so it is set once up front
eventImporter = "Simple Event importer"

# get the set of self-describing lines
lines = dataFile.lines()

# process the lines
for thisLine in lines:
    tokens = thisLine.tokens()

    # event lines are identified by a leading "//" marker token
    firstToken = tokens[0]
    if firstToken.text == "//":
        # layout of an event line: // EVENT <date> <time> <event name>
        dateToken = tokens[2]
        dateToken.record(eventImporter, "Date", dateToken.text)
        timeToken = tokens[3]
        timeToken.record(eventImporter, "Time", timeToken.text)
        eventToken = tokens[4]
        # record the event name itself (previously the time text was stored here)
        eventToken.record(eventImporter, "Event", eventToken.text)

        # and the whole-line record
        thisLine.record(eventImporter, "Whole line")

# output to file, display
dataFile.export("out4.html")
HTML(filename="./out4.html")


You can see that the three event rows have been highlighted, as triggered from the `thisLine.record()` method call.  In addition to that, the three fields that got parsed were also highlighted.  Hover over the text to view a tooltip explaining how that element was utilised.

## Try a State import tool
State data is a little more complex. In this importer we're going to indicate that the date and time tokens were combined to give a single timestamp.  We're also going to indicate that the speed token was exploited in two ways.

In [22]:
# create the self-highlighter
dataFile = HighlightedFile('data_highlight/file.txt')

# name under which every usage in this cell is recorded; it is the same
# for every line, so it is set once rather than on each loop iteration
myName = "State importer"

# thresholds (m/s) that trigger the additional speed annotations
# NOTE(review): the earlier comment said "below 3 m/s" while the code compared
# against 30; the code's values are kept here - confirm which was intended
LOW_SPEED_LIMIT = 30
HIGH_SPEED_LIMIT = 60

# get the set of self-describing lines
lines = dataFile.lines()

# process the lines
for thisLine in lines:
    tokens = thisLine.tokens()

    # state lines are the ones that do not start with the "//" event marker
    firstToken = tokens[0]
    if firstToken.text != "//":
        # layout of a state line:
        # <date> <time> <vehicle> <color> <direction> <speed> <temperature>
        dateToken = tokens[0]
        dateToken.record(myName, "Date", dateToken.text, "n/a")

        timeToken = tokens[1]
        timeToken.record(myName, "Time", timeToken.text, "n/a")

        vehicleToken = tokens[2]
        vehicleVal = vehicleToken.text
        vehicleToken.record(myName, "Vehicle", vehicleVal, "n/a")

        # tokens[3] (the color field) is not used by this importer
        directionToken = tokens[4]
        directionVal = float(directionToken.text)
        directionToken.record(myName, "Direction", directionVal, "degs")

        speedToken = tokens[5]
        speedVal = float(speedToken.text)
        speedToken.record(myName, "Speed", speedVal, "m/s")

        # special processing: record the speed token a second time when the
        # value falls outside the limits, so one token carries several usages
        if speedVal < LOW_SPEED_LIMIT:
            speedToken.record(myName, "LOW SPEED", speedVal, "m/s")
        if speedVal > HIGH_SPEED_LIMIT:
            speedToken.record(myName, "HIGH SPEED", speedVal, "m/s")

        temperatureToken = tokens[6]
        temperatureVal = float(temperatureToken.text)
        temperatureToken.record(myName, "Temperature", temperatureVal, "degs")

# output to file, display
dataFile.export("out4.html")
HTML(filename="./out4.html")

Note in the output above that the tokens in the speed column (the last but one column)
have tooltips indicating how many times each token was used.  Their background highlight colors
also change to match which tools utilised them.

## Event and state at the same time
Next we'll process event and state in the same file.

In [23]:
# create the self-highlighter
dataFile = HighlightedFile('data_highlight/file.txt')

# names under which usages are recorded; both are defined here so the cell
# runs on a fresh kernel without relying on variables left by earlier cells
eventImporter = "Simple Event importer"
myName = "State importer"

# get the set of self-describing lines
lines = dataFile.lines()

for thisLine in lines:
    tokens = thisLine.tokens()

    # a leading "//" marker token distinguishes event lines from state lines
    firstToken = tokens[0]
    if firstToken.text == "//":
        # layout of an event line: // EVENT <date> <time> <event name>
        dateToken = tokens[2]
        dateToken.record(eventImporter, "Date", dateToken.text)
        timeToken = tokens[3]
        timeToken.record(eventImporter, "Time", timeToken.text)
        eventToken = tokens[4]
        # record the event name itself (previously the time text was stored here)
        eventToken.record(eventImporter, "Event", eventToken.text)

        # and the whole-line record
        thisLine.record(eventImporter, "Whole line")
    else:
        # layout of a state line:
        # <date> <time> <vehicle> <color> <direction> <speed> <temperature>
        dateToken = tokens[0]
        dateToken.record(myName, "Date", dateToken.text, "n/a")

        timeToken = tokens[1]
        timeToken.record(myName, "Time", timeToken.text, "n/a")

        vehicleToken = tokens[2]
        vehicleVal = vehicleToken.text
        vehicleToken.record(myName, "Vehicle", vehicleVal, "n/a")

        # tokens[3] (the color field) is not used by this importer
        directionToken = tokens[4]
        directionVal = float(directionToken.text)
        directionToken.record(myName, "Direction", directionVal, "degs")

        # the speed token is deliberately recorded twice, under two field
        # names, to show a single token being used more than once
        speedToken = tokens[5]
        speedVal = float(speedToken.text)
        speedToken.record(myName, "Speed", speedVal, "m/s")
        speedToken.record(myName, "Other Speed", speedVal, "m/s")

        temperatureToken = tokens[6]
        temperatureVal = float(temperatureToken.text)
        temperatureToken.record(myName, "Temperature", temperatureVal, "degs")


# output to file, display
dataFile.export("out4.html")
HTML(filename="./out4.html")

## Find a comma-delimited file

In [7]:
# Read the comma-delimited sample inside a context manager so the handle is
# closed even if reading raises, then show the raw contents.
with open("data_highlight/file_comma.txt") as text_file:
    file_content = text_file.read()
print(file_content)

951212, 050000.000, MONDEO_44, @C, 269.7, 2.0, 10
//, EVENT, 951212, 050300.000, BRAVO
//, EVENT, 951212, 050300.000, CHARLIE
951212, 050300.000, FORD_11, @C, 354.7, 2.1, 14
951212, 050200.000, COROLLA_44, @C, 177.9, 3.1, 15
//, EVENT, 951212, 050300.000, DELTA
951212, 050300.000, COROLLA_44, @C, 200, 3.1, 15



#### Process a comma-delimited file

In [24]:
# create the self-highlighter
dataFile = HighlightedFile('data_highlight/file_comma.txt')

# get the set of self-describing lines
lines = dataFile.lines()

# Pattern handed to tokens() to split a comma-delimited line. The alternatives
# capture: a double-quoted field, an unquoted field, or a line break.
# Written as a raw string so the regex escapes (\w, \W, \r, \n) reach the
# regex engine unchanged instead of being treated as string escapes.
CSV_DELIM = r'(?:,"|^")(""|[\w\W]*?)(?=",|"$)|(?:,(?!")|^(?!"))([^,]*?)(?=$|,)|(\r\n|\n)'

# name under which every usage in this cell is recorded
eventImporter = "Simple CSV Event importer"

for thisLine in lines:
    # note we specify the token pattern and the delimiter explicitly
    tokens = thisLine.tokens(CSV_DELIM, ",")

    # event lines are identified by a leading "//" marker token
    firstToken = tokens[0]

    if firstToken.text == "//":
        # layout of an event line: //, EVENT, <date>, <time>, <event name>
        dateToken = tokens[2]
        dateToken.record(eventImporter, "Date", dateToken.text, "n/a")
        timeToken = tokens[3]
        timeToken.record(eventImporter, "Time", timeToken.text, "n/a")
        eventToken = tokens[4]
        # record the event name itself (previously the time text was stored here)
        eventToken.record(eventImporter, "Event", eventToken.text, "n/a")

        # and the whole-line record
        thisLine.record(eventImporter, "Whole line")

# output to file, display
dataFile.export("out5.html")
HTML(filename="./out5.html")