# 1. Read Data from S3 Bucket

In [1]:
from sagemaker import get_execution_role
role = get_execution_role()

#Bucket and Folder
bucket = 'project-b-data'
subfolder = 'proj-b/dataset-b01'

In [2]:
import boto3
conn = boto3.client('s3')
contents = conn.list_objects(Bucket=bucket, Prefix=subfolder)['Contents']
for i in range(5):
    print(contents[i]['Key'])

proj-b/dataset-b01/README.txt
proj-b/dataset-b01/Tst2022-01-04LOBs.txt
proj-b/dataset-b01/Tst2022-01-04tapes.csv
proj-b/dataset-b01/Tst2022-01-05LOBs.txt
proj-b/dataset-b01/Tst2022-01-05tapes.csv


## 1.1 Read all the csv data

In [3]:
#Generate String Date
import pandas as pd
date_list = pd.date_range(start = "2022-01-04",end = "2022-04-29").tolist()
exact_date = [str(i)[0:10] for i in date_list]   #2022-01-04 format"
print(exact_date[:5])

['2022-01-04', '2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08']


In [4]:
#Read the dataset of tapes.csv file

from time import time
tape_csv = []

total_time = 0
for the_date in exact_date:
    t1 = time()
    tape_from_s3 = 's3://{}/{}'.format(bucket,subfolder+'/Tst'+the_date+'tapes.csv')

    try: 
        tape_csv.append(pd.DataFrame(pd.read_csv(tape_from_s3,header = None)))
    except:
        continue
    t2 = time()
    total_time += t2 - t1
    print('The {} is finished. Elapsed time is {} seconds.'.format(the_date,t2-t1))

print("Total .csv file Read time: {}".format(total_time))





The 2022-01-04 is finished. Elapsed time is 13.629462480545044 seconds.
The 2022-01-05 is finished. Elapsed time is 0.4644284248352051 seconds.
The 2022-01-06 is finished. Elapsed time is 0.4622969627380371 seconds.
The 2022-01-07 is finished. Elapsed time is 0.7449972629547119 seconds.
The 2022-01-10 is finished. Elapsed time is 0.5672264099121094 seconds.
The 2022-01-11 is finished. Elapsed time is 0.9384832382202148 seconds.
The 2022-01-12 is finished. Elapsed time is 0.6380221843719482 seconds.
The 2022-01-13 is finished. Elapsed time is 0.5366456508636475 seconds.
The 2022-01-14 is finished. Elapsed time is 0.8418676853179932 seconds.
The 2022-01-17 is finished. Elapsed time is 0.9818775653839111 seconds.
The 2022-01-18 is finished. Elapsed time is 0.9363741874694824 seconds.
The 2022-01-19 is finished. Elapsed time is 0.973344087600708 seconds.
The 2022-01-20 is finished. Elapsed time is 0.8428158760070801 seconds.
The 2022-01-21 is finished. Elapsed time is 0.8278946876525879 se

In [5]:
full_csv_frame = pd.concat(tape_csv)
print("There are {} rows and {} columns of the full csv frame.".format(full_csv_frame.shape[0],full_csv_frame.shape[1]))
print(full_csv_frame[:5])

There are 5230364 rows and 11 columns of the full csv frame.
              0          1       2    3                        4   \
0  Tst2022-01-04   Exch0Lit   8.192  203   {'pool_id': 'Exch0Lit'   
1  Tst2022-01-04   Exch0Lit   8.192  199   {'pool_id': 'Exch0Lit'   
2  Tst2022-01-04   Exch0Lit  10.336  196   {'pool_id': 'Exch0Lit'   
3  Tst2022-01-04   Exch0Lit  10.672  196   {'pool_id': 'Exch0Lit'   
4  Tst2022-01-04   Exch0Lit  11.040  201   {'pool_id': 'Exch0Lit'   

                 5                6              7          8   \
0   'type': 'Trade'    'time': 8.192   'price': 203   'qty': 2   
1   'type': 'Trade'    'time': 8.192   'price': 199   'qty': 3   
2   'type': 'Trade'   'time': 10.336   'price': 196   'qty': 3   
3   'type': 'Trade'   'time': 10.672   'price': 196   'qty': 4   
4   'type': 'Trade'    'time': 11.04   'price': 201   'qty': 1   

                 9                  10  
0   'party1': 'S26'   'party2': 'B24'}  
1   'party1': 'S26'   'party2': 'B08'}  
2   

## 1.2 Read all the LOB data


In [6]:
!pip install smart_open
!pip install ijson
from smart_open import smart_open
import ijson

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting smart_open
  Downloading smart_open-5.2.1-py3-none-any.whl (58 kB)
     |████████████████████████████████| 58 kB 8.7 MB/s             
[?25hInstalling collected packages: smart-open
Successfully installed smart-open-5.2.1
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting ijson
  Downloading ijson-3.1.4-cp36-cp36m-manylinux2010_x86_64.whl (124 kB)
     |████████████████████████████████| 124 kB 27.6 MB/s            
[?25hInstalling collected packages: ijson
Successfully installed ijson-3.1.4


In [7]:
%%time
import json
import re
#'s3://project-b-data/proj-b/dataset-b01/Tst2022-01-04LOBs.txt'
with smart_open('Example.txt',encoding='utf-8') as f:
    data = f.read()
data = data.replace(',',':')
data=data.replace(']\n[',',')
data = data.replace(']:','],')

CPU times: user 832 µs, sys: 196 µs, total: 1.03 ms
Wall time: 1.02 ms


In [8]:
print(data)

[
    "time":
    0.0:
    [
        "bid":
        []
    ],
    [
        "ask":
        []
    ]
,
    "time":
    0.016:
    [
        "bid":
        []
    ],
    [
        "ask":
        []
    ]
]


In [9]:
ijson.parse??

In [10]:
def parse_json(json_filename):
    with smart_open(json_filename, 'rb') as input_file:
        # load json iteratively
        parser = ijson.parse(input_file,multiple_values=True)
        for prefix, event, value in parser:
            print(value)
            

In [12]:
#parse_json(data)

In [13]:
time = []
bid = []
ask = []
for i in range(int(df.shape[0]/4)):
    time.append(df.iloc[i*4+1].values[0])
    bid.append(df.iloc[i*4+2].values[0][1])
    ask.append(df.iloc[i*4+3].values[0][1])
    
print(time)
print("-------")
print(bid)
print(ask)

print(pd.DataFrame([time,bid,ask]).transpose())

NameError: name 'df' is not defined

# 2. Exploratory Data Analysis

In [89]:
print(full_csv_frame[7])

0         'price': 203
1         'price': 199
2         'price': 196
3         'price': 196
4         'price': 201
             ...      
64888     'price': 134
64889     'price': 134
64890     'price': 134
64891     'price': 134
64892     'price': 133
Name: 7, Length: 5230364, dtype: object


In [84]:
import matplotlib.pyplot as plt
plt.plot(df[7])

Matplotlib is building the font cache; this may take a moment.


KeyError: 7