In [1]:
# Pull the whole raw export into memory, one string per physical line.
with open("part2_alt.csv") as infile:
    lines = list(infile)

# The first physical line carries the column names; split it on commas and
# remove it from `lines` so that only data rows remain.
headers = lines[0].strip().split(",")
del lines[0]

In [2]:
import csv # since csv was used to write data, it is useful for reading it
import re

> Due to the structure of the data (a list of lists of elements), neither list comprehensions nor piping seems to be a good fit for processing. Even with multiple comprehensions/pipes, inserting data as its own column and switching items around is complicated to implement. Hence an iterative approach is used instead, which processes an entire row of data at a time rather than trying to apply a process to the same chunk of each line. A function/lambda expression could be defined for each step and used with comprehension/piping, but this just boils down to semantic differences (e.g. "for l in lines: f(l)" vs "\[f(l) for l in lines\]" vs "lines >> map(f)").

> Updates to the headers are based on my best guess of what the columns are supposed to represent.

In [3]:
# NOTE: every column index below refers to the row as it looks at that point
# in the pipeline - earlier steps insert columns and shift later ones right.
# NOTE: this cell mutates `headers` in place, so re-running it without first
# re-running the cell that loads the file will insert the new headers twice.

# Splits the fused "ask + two dates + id" field. Compiled once here instead of
# on every iteration of the row loop.
data_extract = re.compile(r'(\d+\.\d+)(\d{2}/\d{2})(\d{2}/\d{2})(.+)')


def _split_price_bid(field):
    """Split a fused price+bid string into a ``(price, bid)`` tuple.

    The bid is assumed to start one character before the second decimal
    point (i.e. it has a single digit before its decimal point).

    Returns ``None`` when ``field`` contains fewer than two decimal points,
    because then there is no second number to split off.
    """
    # Locate the first decimal point explicitly rather than assuming it sits
    # before index 2, so prices with a longer whole part are split correctly.
    second_dot = field.find(".", field.find(".") + 1)
    if second_dot == -1:
        return None
    bid_start = second_dot - 1
    return field[:bid_start], field[bid_start:]


def _clean_row(raw_row):
    """Return a repaired copy of one row produced by ``csv.reader``.

    Steps, in order: strip thousands separators from Qty, split a fused
    price/bid field, split the fused ask/date/date/id field, split the fused
    time/label field, undo the transposed column groups, split the
    ``number#STA=value`` field, and pad a missing trailing value with "0".

    Raises:
        ValueError: if the ask/date/id field does not match ``data_extract``,
            or if the number/STA field does not contain exactly one "#".
        IndexError: if the row has fewer columns than the steps index into,
            or if the STA part has no "=".
    """
    row = list(raw_row)

    # handle commas in Qty (e.g. 2,756) - csv library dealt with issue of reading it in
    row[3] = row[3].replace(",", "")

    # handle joined price and bid
    # The price field contains extra characters
    if len(row[4]) > 8:
        price_and_bid = _split_price_bid(row[4])
        # A long field with a single decimal point is left untouched: it cannot
        # hold two numbers, so it is treated like the unfused case.
        if price_and_bid is not None:
            row[4], bid = price_and_bid
            row.insert(5, bid)

    # handle glob of ask, 2 dates, and an id
    match = data_extract.search(row[6])
    if match is None:
        raise ValueError(
            "cannot split ask/date/id field {!r} in row {!r}".format(row[6], row)
        )
    # One field becomes four: ask stays at 6, the dates land at 7 and 8, the id at 9.
    row[6:7] = match.groups()

    # handle time and label
    # NOTE(review): if there is no ":" at or after index 4, find() returns -1
    # and the slices below misbehave silently - confirm the data always has one.
    second_colon = row[10].find(":", 4)
    label = row[10][second_colon + 1:]
    row[10] = row[10][:second_colon]
    row.insert(11, label)

    # handle transposed data
    if row[12] == r"/17":
        # this order makes more sense according to how I understand the headers:
        # the six values at 16-21 move in front of the four values at 12-15.
        row[12:22] = row[16:22] + row[12:16]

    # handle number and STA=
    gtl_num, gtl_sta = row[17].split("#")
    row[17] = gtl_num
    row.insert(18, gtl_sta.split("=")[1])

    # sometimes last element is empty - fill with 0 for consistent row length.
    # I think it's a categorical value, anyway, if it is meaningful at all.
    if len(row) == 23:
        row.append("0")

    return row


line_reader = csv.reader(lines) # used to handle the cases of values containing commas themselves, which are quote-wrapped

processed_lines = []

for line in line_reader:
    # csv.reader yields an empty list for a blank line; there is nothing to clean.
    if not line:
        continue
    processed_lines.append(_clean_row(line))


# One-time header work - don't want to do this for every iteration of the loop
headers[7] = "Date1"
headers.insert(8, "Date2")
headers.insert(9, "TradeID")
headers.insert(11, "Label")
headers.insert(18, "GTL-STA")
headers.insert(22, "Val1") # no idea what this is supposed to be
headers.insert(23, "Val2") # same

In [4]:
# Write the repaired table: the header row first, then every cleaned data row.
# newline="" hands line-ending control to the csv module; without it, platforms
# that translate "\n" on write end up with a blank line after every row.
# Plain "w" is enough because the file is never read back here.
with open("final.csv", "w", newline="") as outfile:
    writer = csv.writer(outfile)
    writer.writerow(headers)
    writer.writerows(processed_lines)