## Author: Tianheng Zhou

# 1. Read Data from S3 Bucket

In [1]:
from sagemaker import get_execution_role
role = get_execution_role()

#Bucket and Folder
bucket = 'project-b-data'
subfolder = 'proj-b/dataset-b01'

In [2]:
import boto3
conn = boto3.client('s3')
contents = conn.list_objects(Bucket=bucket, Prefix=subfolder)['Contents']
#Data Set Name
for i in range(5):
    print(contents[i]['Key'])

proj-b/dataset-b01/README.txt
proj-b/dataset-b01/Tst2022-01-04LOBs.txt
proj-b/dataset-b01/Tst2022-01-04tapes.csv
proj-b/dataset-b01/Tst2022-01-05LOBs.txt
proj-b/dataset-b01/Tst2022-01-05tapes.csv


## 1.1 Read all the csv data

In [3]:
#Generate String Date
import pandas as pd
date_list = pd.date_range(start = "2022-01-04",end = "2022-04-29").tolist()
exact_date = [str(i)[0:10] for i in date_list]   #2022-01-04 format"
print(exact_date[:5])

['2022-01-04', '2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08']


In [4]:
#Read the dataset of tapes.csv file

import time
tape_csv = []

total_time = 0
for the_date in exact_date:
    t1 = time.time()
    tape_from_s3 = 's3://{}/{}'.format(bucket,subfolder+'/Tst'+the_date+'tapes.csv')

    try: 
        tape_csv.append(pd.DataFrame(pd.read_csv(tape_from_s3,header = None)))
    except:
        continue
    t2 = time.time()
    total_time += t2 - t1
    print('The {} is finished. Elapsed time is {} seconds.'.format(the_date,t2-t1))

print("Total .csv file Read time: {}".format(total_time))





The 2022-01-04 is finished. Elapsed time is 1.6339831352233887 seconds.
The 2022-01-05 is finished. Elapsed time is 0.5036125183105469 seconds.
The 2022-01-06 is finished. Elapsed time is 0.6255755424499512 seconds.
The 2022-01-07 is finished. Elapsed time is 0.48229074478149414 seconds.
The 2022-01-10 is finished. Elapsed time is 0.5079731941223145 seconds.
The 2022-01-11 is finished. Elapsed time is 0.5332159996032715 seconds.
The 2022-01-12 is finished. Elapsed time is 0.5182175636291504 seconds.
The 2022-01-13 is finished. Elapsed time is 0.5416808128356934 seconds.
The 2022-01-14 is finished. Elapsed time is 0.5518534183502197 seconds.
The 2022-01-17 is finished. Elapsed time is 0.48590850830078125 seconds.
The 2022-01-18 is finished. Elapsed time is 0.48256516456604004 seconds.
The 2022-01-19 is finished. Elapsed time is 0.5489239692687988 seconds.
The 2022-01-20 is finished. Elapsed time is 0.6964015960693359 seconds.
The 2022-01-21 is finished. Elapsed time is 0.581975221633911

In [5]:
#Concatenate all the csv file into one data frame 
full_csv_frame = pd.concat(tape_csv)
print("There are {} rows and {} columns of the full csv frame.".format(full_csv_frame.shape[0],full_csv_frame.shape[1]))
print(full_csv_frame[:5])

There are 5230364 rows and 11 columns of the full csv frame.
              0          1       2    3                        4   \
0  Tst2022-01-04   Exch0Lit   8.192  203   {'pool_id': 'Exch0Lit'   
1  Tst2022-01-04   Exch0Lit   8.192  199   {'pool_id': 'Exch0Lit'   
2  Tst2022-01-04   Exch0Lit  10.336  196   {'pool_id': 'Exch0Lit'   
3  Tst2022-01-04   Exch0Lit  10.672  196   {'pool_id': 'Exch0Lit'   
4  Tst2022-01-04   Exch0Lit  11.040  201   {'pool_id': 'Exch0Lit'   

                 5                6              7          8   \
0   'type': 'Trade'    'time': 8.192   'price': 203   'qty': 2   
1   'type': 'Trade'    'time': 8.192   'price': 199   'qty': 3   
2   'type': 'Trade'   'time': 10.336   'price': 196   'qty': 3   
3   'type': 'Trade'   'time': 10.672   'price': 196   'qty': 4   
4   'type': 'Trade'    'time': 11.04   'price': 201   'qty': 1   

                 9                  10  
0   'party1': 'S26'   'party2': 'B24'}  
1   'party1': 'S26'   'party2': 'B08'}  
2   

## 1.2 Read all the LOB data


In [6]:
!pip install smart_open
!pip install ijson

import ijson
import json
import re

import smart_open as sm
#help(sm)

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


### 1.2.1 Read the Example LOB data
#### 1.2.1.1 Read the Example LOB set to see the data format.
Below is some methods that **I didn't adopt.**

In [7]:
with sm.open('Example.txt',"r",encoding='utf-8') as f:
    example_data = f.read()
print(example_data)

[
    "time",
    0.0,
    [
        "bid",
        []
    ],
    [
        "ask",
        []
    ]
]
[
    "time",
    0.016,
    [
        "bid",
        []
    ],
    [
        "ask",
        []
    ]
]
[
    "time",
    0.528,
    [
        "bid",
        [
            [
                27,
                3
            ]
        ]
    ],
    [
        "ask",
        []
    ]
]


1) Here is the way that I try to use regular expression to get the data frame of the LOB file. But the re.sub() function would have **Memory Error** if we use the full LOB file, so this way is not plausible.

In [8]:
#1. Change the file into a readable list
import ast
example_data1 = example_data.replace(']\n[',',')
example_data2 = re.sub(" ","",example_data1)
example_data3 = re.sub("\n","",example_data2)
example_data4 = re.sub(r"\s+","",example_data3)
example_data5 = re.sub('"time",','',example_data4)
example_data6 = re.sub('"bid",','',example_data5)
example_data7 = re.sub('"ask",','',example_data6)

#safely convert the nested list in string to list
example_data8 = ast.literal_eval(example_data7)  
print("List of LOB: ",example_data8)
print("Type of the Number: ",type(example_data8[0]))
print("----------------------------------------------")


#2. Extract the features in the list
example_time = []
example_bid = []
example_ask = []

for i,item in enumerate(example_data8):
    #time
    if i % 3 == 0:
        example_time.append(item)
    #bid
    elif i % 3 == 1:
        try:
            example_bid.append((item[0][0][0],item[0][0][1]))
        except:
            example_bid.append(float('nan'))
    elif i % 3 == 2 :
        try:
            example_ask.append((item[0][0][0],item[0][0][1]))
        except:
            example_ask.append(float('nan'))

example_df = pd.DataFrame([example_time,example_bid,example_ask]).T
example_df = example_df.rename(columns={0: 'Time', 1: 'Bid',2: 'Ask'})
print(example_df)
            
    

List of LOB:  [0.0, [[]], [[]], 0.016, [[]], [[]], 0.528, [[[27, 3]]], [[]]]
Type of the Number:  <class 'float'>
----------------------------------------------
    Time      Bid  Ask
0      0      NaN  NaN
1  0.016      NaN  NaN
2  0.528  (27, 3)  NaN


In [9]:
def Read_LOB_file(source):
    """
    source: S3 Bucket file address
    
    """
    t1 = time.time()
    with sm.smart_open(source,encoding='utf-8',buffering = 10000000) as f:
        LOB_data = f.read()
    f.close()
        
    LOB_data1 = LOB_data.replace(']\n[',',')
    LOB_data2 = re.sub(" ","",LOB_data1)
    LOB_data3 = re.sub("\n","",LOB_data2)
    LOB_data4 = re.sub(r"\s+","",LOB_data3)
    LOB_data5 = re.sub('"time",','',LOB_data4)
    LOB_data6 = re.sub('"bid",','',LOB_data5)
    LOB_data7 = re.sub('"ask",','',LOB_data6)

    #safely convert the nested list in string to list
    LOB_data8 = ast.literal_eval(LOB_data7)  
    t2 = time.time()
    
    print(f"Read LOB file {source} finish, the elapsed time is {t2-t1} seconds.")
    return LOB_data8

def Create_LOB_Dataframe(LOB_data,file_date):
    """
    LOB_data: The return value(list) from Read_LOB_file
    file_date: The date of the LOB file
    
    """
    t1 = time.time()
    the_time = []
    bid = []
    ask = []
    date = []
    
    for i,item in enumerate(LOB_data):
        #time
        if i % 3 == 0:
            the_time.append(item)
        #bid
        elif i % 3 == 1:
            try:
                bid.append((item[0][0][0],item[0][0][1]))
            except:
                bid.append(float('nan'))
        #ask
        elif i % 3 == 2 :
            try:
                ask.append((item[0][0][0],item[0][0][1]))
            except:
                ask.append(float('nan'))
                
    date.append(file_date)
    date = date * len(the_time)
    
    df = pd.DataFrame([date,the_time,bid,ask]).T
    df = df.rename(columns={0: 'Date',1: 'Time', 2: 'Bid',3: 'Ask'})
    t2 = time.time()
    
    print(f"Create {file_date} LOB dataframe finish, the elapsed time is {t2-t1}.")
    return df

print(Create_LOB_Dataframe(Read_LOB_file(
    'Example.txt'),'2022-01-04'))

Read LOB file Example.txt finish, the elapsed time is 0.0006747245788574219 seconds.
Create 2022-01-04 LOB dataframe finish, the elapsed time is 0.002728700637817383.
         Date   Time      Bid  Ask
0  2022-01-04      0      NaN  NaN
1  2022-01-04  0.016      NaN  NaN
2  2022-01-04  0.528  (27, 3)  NaN


2) Here is the previous try of reading the LOB data using ijson package, but I failed because changing the large LOB file into a standard json format will also cause **Memory Error**. .

In [10]:
# ijson.parse??
# def parse_json(json_filename):
#     with smart_open(json_filename, 'rb') as input_file:
#         # load json iteratively
#         parser = ijson.parse(input_file,multiple_values=True)
#         for prefix, event, value in parser:
#             print(value)
# parse_json(example_data4)   

3) Here is the way I think I can directly use regular expressions to extract all the features, but because it needs to operate the full data, there will also be **Memory Error**

In [11]:
#regrex time
print("Time: ",re.findall(r'"time",\n\s*[\d]*[\.]?[\d]*',example_data))
#unfinished, regrex bid
print("Bid: ",re.findall(r'"bid",\n[\s]*\[[\]|\n\s*\[\n\s*]',example_data))

Time:  ['"time",\n    0.0', '"time",\n    0.016', '"time",\n    0.528']
Bid:  ['"bid",\n        []', '"bid",\n        []', '"bid",\n        [\n']


In [12]:
#Test regular expression
a = "123.4,0.5555566,7.85"
print(re.findall(r"[\d]+[\.]?[\d]*",a))

['123.4', '0.5555566', '7.85']


#### 1.2.1.2 Read a 1000000+ lines LOB file 
Here I would like to create a 1000000+ lines LOB subset to validate the reading method.

In [13]:
#!pip install json-stream

LOB_subset_01_04 = ""
with sm.open('s3://project-b-data/proj-b/dataset-b01/Tst2022-01-04LOBs.txt',
             "r",encoding='utf-8') as f:
    for i,line in enumerate(f):        
        LOB_subset_01_04 += line
        
        #56861
        if i == 1000021:
            break
            
    f.close()

#print(LOB_subset_01_04[-1000:-1])


Try to read in meta data chunk

In [14]:
%%time
LOB_subset_time = []
LOB_subset_bid = []
LOB_subset_ask = []
time_i = -10000
bid_i = -10000
ask_i = -10000
time_change_sign = False
bid_change_sign = False
ask_change_sign = False


for i,line in enumerate(LOB_subset_01_04.splitlines()):
    if i% 100000 == 0:
        print(f"This is {i//100000}th 100000 row")
    
    #Time
    if line.strip() == '"time",':
        time_i = i 
        time_change_sign = True
        continue        
    if i == (time_i +1) and time_change_sign == True:
        time_number = re.findall(r"[\d]+[\.]?[\d]*",line)[0]
        #print(f"Time Number {time_number} Appended")
        time_change_sign = False
        LOB_subset_time.append(time_number)
        continue
        
    
    
    #Bid
    if line.strip() == '"bid",':
        bid_i = i
        bid_change_sign = True
        bid_string = ""
        continue
    if i == (bid_i + 1) and line.strip() == "[]" and bid_change_sign == True:
        #print("Bid None Appended")
        bid_change_sign = False
        LOB_subset_bid.append(None)
        continue          
    if i == (bid_i + 1) and line.strip() == "[" and bid_change_sign == True:
        bid_string += line.strip()
        continue
    elif line.strip() != "]" and bid_change_sign == True:
        bid_string += line.strip()
        continue
    elif line.strip() == "]" and bid_change_sign == True:
        bid_string += "]]"
        LOB_subset_bid.append(ast.literal_eval(bid_string))
        #print(f"Bid {bid_string} Appended")
        bid_change_sign = False
        continue
            
    #Ask
    if line.strip() == '"ask",':
        ask_i = i
        ask_change_sign = True
        ask_string = ""
        continue       
    if i == (ask_i + 1) and line.strip() == "[]" and ask_change_sign == True:
        LOB_subset_ask.append(None)
        #print("Ask None Appended")
        ask_change_sign = False
        continue       
    if i == (ask_i + 1) and line.strip() == "[" and ask_change_sign == True:
        ask_string += line.strip()
        continue
    elif line.strip() != "]" and ask_change_sign == True:
        ask_string += line.strip()
        continue
    elif line.strip() == "]" and ask_change_sign == True:
        ask_string += "]]"
        LOB_subset_ask.append(ast.literal_eval(ask_string))
        #print(f"Ask {ask_string} Appended")
        ask_change_sign = False
        continue
        
        
        

        
#     if line == "]":
#         print("-------------------Next Meta Data--------------------")

    
LOB_subset_df = pd.DataFrame([LOB_subset_time,LOB_subset_bid,LOB_subset_ask]).T
LOB_subset_df = LOB_subset_df.rename(columns={0: 'Time', 1: 'Bid',2: 'Ask'})
LOB_subset_df.to_csv("1000021_LOB_01_04.csv")
print(LOB_subset_df)

This is 0th 100000 row
This is 1th 100000 row
This is 2th 100000 row
This is 3th 100000 row
This is 4th 100000 row
This is 5th 100000 row
This is 6th 100000 row
This is 7th 100000 row
This is 8th 100000 row
This is 9th 100000 row
This is 10th 100000 row
          Time                                                Bid  \
0          0.0                                               None   
1        0.016                                               None   
2        0.032                                               None   
3        0.048                                               None   
4        0.064                                               None   
...        ...                                                ...   
11184  178.944  [[177, 5], [157, 1], [140, 5], [112, 1], [36, 1]]   
11185   178.96  [[177, 5], [157, 1], [140, 5], [112, 1], [36, 1]]   
11186  178.976  [[177, 5], [157, 1], [140, 5], [112, 1], [36, 1]]   
11187  178.992  [[177, 5], [157, 1], [140, 5], [112, 1],

### 1.2.2 Read the True LOB data
The format is like 's3://project-b-data/proj-b/dataset-b01/Tst2022-01-04LOBs.txt'

In [18]:
%%time
#160037467 lines
print(sum(1 for line in sm.open('s3://project-b-data/proj-b/dataset-b01/Tst2022-01-04LOBs.txt',"r",encoding='utf-8')))

160037467
CPU times: user 57.1 s, sys: 3.19 s, total: 1min
Wall time: 1min 4s


In [15]:
def Read_LOB_Improved(source):
    LOB_subset_time = []
    LOB_subset_bid = []
    LOB_subset_ask = []
    time_i = -10000
    bid_i = -10000
    ask_i = -10000
    time_change_sign = False
    bid_change_sign = False
    ask_change_sign = False
    
    with sm.open(source,"r",encoding='utf-8') as f:
        for i,line in enumerate(f):
            if i% 1000000 == 0:
                print(f"This is {i//1000000}th 1000000 row")
    
            #Time
            if line.strip() == '"time",':
                time_i = i 
                time_change_sign = True
                continue        
                
            if i == (time_i +1) and time_change_sign == True:
                time_number = re.findall(r"[\d]+[\.]?[\d]*",line)[0]
                #print(f"Time Number {time_number} Appended")
                time_change_sign = False
                LOB_subset_time.append(time_number)
                continue



            #Bid
            if line.strip() == '"bid",':
                bid_i = i
                bid_change_sign = True
                bid_string = ""
                continue
            if i == (bid_i + 1) and line.strip() == "[]" and bid_change_sign == True:
                #print("Bid None Appended")
                bid_change_sign = False
                LOB_subset_bid.append(None)
                continue          
            if i == (bid_i + 1) and line.strip() == "[" and bid_change_sign == True:
                bid_string += line.strip()
                continue
            elif line.strip() != "]" and bid_change_sign == True:
                bid_string += line.strip()
                continue
            elif line.strip() == "]" and bid_change_sign == True:
                bid_string += "]]"
                LOB_subset_bid.append(ast.literal_eval(bid_string))
                #print(f"Bid {bid_string} Appended")
                bid_change_sign = False
                continue

            #Ask
            if line.strip() == '"ask",':
                ask_i = i
                ask_change_sign = True
                ask_string = ""
                continue       
            if i == (ask_i + 1) and line.strip() == "[]" and ask_change_sign == True:
                LOB_subset_ask.append(None)
                #print("Ask None Appended")
                ask_change_sign = False
                continue       
            if i == (ask_i + 1) and line.strip() == "[" and ask_change_sign == True:
                ask_string += line.strip()
                continue
            elif line.strip() != "]" and ask_change_sign == True:
                ask_string += line.strip()
                continue
            elif line.strip() == "]" and ask_change_sign == True:
                ask_string += "]]"
                LOB_subset_ask.append(ast.literal_eval(ask_string))
                #print(f"Ask {ask_string} Appended")
                ask_change_sign = False
                continue
        f.close()
        
        return [LOB_subset_time,LOB_subset_bid,LOB_subset_ask]
        
def Create_LOB_df_Improved(df_element,the_date):
    
    
    LOB_df = pd.DataFrame(df_element).T
    LOB_df['Date'] = the_date
    
    LOB_df = LOB_df.rename(columns={0: 'Time', 1: 'Bid',2: 'Ask'})
    LOB_df.to_csv(f"LOB_csv/{the_date}-LOB.csv")
    
    return LOB_df
        
        
    

Now let's test the 's3://project-b-data/proj-b/dataset-b01/Tst2022-01-04LOBs.txt'

In [21]:
%%time
df_2022_01_04 = Create_LOB_df_Improved(Read_LOB_Improved(
    "s3://project-b-data/proj-b/dataset-b01/Tst2022-01-04LOBs.txt"),"2022-01-04")
    

This is 0th 1000000 row
This is 1th 1000000 row
This is 2th 1000000 row
This is 3th 1000000 row
This is 4th 1000000 row
This is 5th 1000000 row
This is 6th 1000000 row
This is 7th 1000000 row
This is 8th 1000000 row
This is 9th 1000000 row
This is 10th 1000000 row
This is 11th 1000000 row
This is 12th 1000000 row
This is 13th 1000000 row
This is 14th 1000000 row
This is 15th 1000000 row
This is 16th 1000000 row
This is 17th 1000000 row
This is 18th 1000000 row
This is 19th 1000000 row
This is 20th 1000000 row
This is 21th 1000000 row
This is 22th 1000000 row
This is 23th 1000000 row
This is 24th 1000000 row
This is 25th 1000000 row
This is 26th 1000000 row
This is 27th 1000000 row
This is 28th 1000000 row
This is 29th 1000000 row
This is 30th 1000000 row
This is 31th 1000000 row
This is 32th 1000000 row
This is 33th 1000000 row
This is 34th 1000000 row
This is 35th 1000000 row
This is 36th 1000000 row
This is 37th 1000000 row
This is 38th 1000000 row
This is 39th 1000000 row
This is 40

In [28]:
%%time
pd.read_csv("LOB_csv/2022-01-04-LOB.csv")

CPU times: user 2.26 s, sys: 195 ms, total: 2.46 s
Wall time: 2.46 s


Unnamed: 0.1,Unnamed: 0,Time,Bid,Ask,Date
0,0,0.000,,,2022-01-04
1,1,0.016,,,2022-01-04
2,2,0.032,,,2022-01-04
3,3,0.048,,,2022-01-04
4,4,0.064,,,2022-01-04
...,...,...,...,...,...
1912495,1912495,30599.920,"[[220, 4], [211, 1], [175, 4], [170, 5], [161,...","[[236, 2], [411, 5], [436, 1], [659, 1], [725,...",2022-01-04
1912496,1912496,30599.936,"[[220, 4], [211, 1], [175, 4], [170, 5], [161,...","[[236, 2], [411, 5], [436, 1], [659, 1], [725,...",2022-01-04
1912497,1912497,30599.952,"[[220, 4], [211, 1], [175, 4], [170, 5], [161,...","[[236, 2], [411, 5], [436, 1], [659, 1], [725,...",2022-01-04
1912498,1912498,30599.968,"[[220, 4], [211, 1], [175, 4], [170, 5], [161,...","[[236, 2], [411, 5], [436, 1], [611, 1], [725,...",2022-01-04


In [16]:
exact_date.remove('2022-01-04','2022-01-05','2022-01-06','2022-01-07')
exact_date

['2022-01-05',
 '2022-01-06',
 '2022-01-07',
 '2022-01-08',
 '2022-01-09',
 '2022-01-10',
 '2022-01-11',
 '2022-01-12',
 '2022-01-13',
 '2022-01-14',
 '2022-01-15',
 '2022-01-16',
 '2022-01-17',
 '2022-01-18',
 '2022-01-19',
 '2022-01-20',
 '2022-01-21',
 '2022-01-22',
 '2022-01-23',
 '2022-01-24',
 '2022-01-25',
 '2022-01-26',
 '2022-01-27',
 '2022-01-28',
 '2022-01-29',
 '2022-01-30',
 '2022-01-31',
 '2022-02-01',
 '2022-02-02',
 '2022-02-03',
 '2022-02-04',
 '2022-02-05',
 '2022-02-06',
 '2022-02-07',
 '2022-02-08',
 '2022-02-09',
 '2022-02-10',
 '2022-02-11',
 '2022-02-12',
 '2022-02-13',
 '2022-02-14',
 '2022-02-15',
 '2022-02-16',
 '2022-02-17',
 '2022-02-18',
 '2022-02-19',
 '2022-02-20',
 '2022-02-21',
 '2022-02-22',
 '2022-02-23',
 '2022-02-24',
 '2022-02-25',
 '2022-02-26',
 '2022-02-27',
 '2022-02-28',
 '2022-03-01',
 '2022-03-02',
 '2022-03-03',
 '2022-03-04',
 '2022-03-05',
 '2022-03-06',
 '2022-03-07',
 '2022-03-08',
 '2022-03-09',
 '2022-03-10',
 '2022-03-11',
 '2022-03-

In [None]:
%%time
#"s3://project-b-data/proj-b/dataset-b01/Tst2022-01-04LOBs.txt" format
for the_date in exact_date:
    t1 = time.time()
    LOB_from_s3 = 's3://{}/{}'.format(bucket,subfolder+'/Tst'+the_date+'LOBs.txt')
    

    try:
        print(f"Start Read {the_date} LOB file.")
        Create_LOB_df_Improved(Read_LOB_Improved(LOB_from_s3),the_date)
    except:
        print(f"There is no {the_date} LOB file.")
        continue
        
    t2 = time.time()
    print('The {} is finished. Elapsed time is {} seconds.'.format(the_date,t2-t1))



Start Read 2022-01-05 LOB file.
This is 0th 1000000 row
This is 1th 1000000 row
This is 2th 1000000 row
This is 3th 1000000 row
This is 4th 1000000 row
This is 5th 1000000 row
This is 6th 1000000 row
This is 7th 1000000 row
This is 8th 1000000 row
This is 9th 1000000 row
This is 10th 1000000 row
This is 11th 1000000 row
This is 12th 1000000 row
This is 13th 1000000 row
This is 14th 1000000 row
This is 15th 1000000 row
This is 16th 1000000 row
This is 17th 1000000 row
This is 18th 1000000 row
This is 19th 1000000 row
This is 20th 1000000 row
This is 21th 1000000 row
This is 22th 1000000 row
This is 23th 1000000 row
This is 24th 1000000 row
This is 25th 1000000 row
This is 26th 1000000 row
This is 27th 1000000 row
This is 28th 1000000 row
This is 29th 1000000 row
This is 30th 1000000 row
This is 31th 1000000 row
This is 32th 1000000 row
This is 33th 1000000 row
This is 34th 1000000 row
This is 35th 1000000 row
This is 36th 1000000 row
This is 37th 1000000 row
This is 38th 1000000 row
Thi

# 2. Exploratory Data Analysis

In [89]:
print(full_csv_frame[7])

0         'price': 203
1         'price': 199
2         'price': 196
3         'price': 196
4         'price': 201
             ...      
64888     'price': 134
64889     'price': 134
64890     'price': 134
64891     'price': 134
64892     'price': 133
Name: 7, Length: 5230364, dtype: object
