# Input/Output Operations

Pickle can be used to serialize the majority of python objects

Serialization refers to the conversion of an object(hierarchy) to a byte stream.

Deserialization is the opposite operation.

# When to use 'wb' or 'rb'
## Use 'wb'
    - Purpose: Opens a file for writing in binary mode. If the file does not exist, it will be created. If it does exist, its contents will be truncated (i.e., overwritten).
    - Usage: You use 'wb' when you want to save or serialize data to a file using the pickle.dump() function.

## Use 'rb'
    - Purpose: Opens a file for reading in binary mode. The file must already exist; otherwise, an error will occur.
    - Usage: You use 'rb' when you want to load or deserialize data from a file using the pickle.load() function.


In [1]:
from pylab import  plt 
plt.style.use('ggplot')
%matplotlib inline

In [2]:
import pickle
import numpy as np
from random import gauss

In [7]:
a = [gauss(1.5, 2) for i in range(1000000)]
path = '/Users/davidkassin/python/python4finance/'
pkl_file = open(path + 'data.pkl', 'wb')

In [8]:
%time pickle.dump(a, pkl_file)
pkl_file.close()


CPU times: user 15.1 ms, sys: 9.33 ms, total: 24.4 ms
Wall time: 25.9 ms


In [17]:
pkl_file = open(path + 'data.pkl', 'rb')
%time b = pickle.load(pkl_file)
print(b[:3])
print("Type of b:", type(b))
np.allclose(np.array(a), np.array(b))

CPU times: user 19.3 ms, sys: 22.2 ms, total: 41.6 ms
Wall time: 68.5 ms
[0.8285310526943263, 0.37223315919704025, -0.4878676963268038]
Type of b: <class 'list'>


True

Pickle stores objects according to a first in, first out principle. Heads up, there is no meta information available to the user to know beforehand what is stored in a pickle file. 

In [24]:
pkl_file = open(path + 'data.pkl', 'wb')
%time pickle.dump(np.array(a), pkl_file)
%time pickle.dump(np.array(a) ** 2, pkl_file)
pkl_file.close()

CPU times: user 26.6 ms, sys: 23.2 ms, total: 49.8 ms
Wall time: 61.2 ms
CPU times: user 24.1 ms, sys: 3.28 ms, total: 27.4 ms
Wall time: 28.8 ms


In [28]:
pkl_file = open(path + 'data.pkl', 'rb')
x = pickle.load(pkl_file)
x[:4]

y = pickle.load(pkl_file)
y[:4]

pkl_file.close()

In [29]:
pkl_file = open(path + 'data.pkl', 'wb')
pickle.dump({'x':x, 'y':y}, pkl_file)
pkl_file.close()

pkl_file = open(path + 'data.pkl', 'rb')
data = pickle.load(pkl_file)
pkl_file.close()
for key in data.keys():
    print(key, data[key][:4])

x [ 0.82853105  0.37223316 -0.4878677   3.54319414]
y [ 0.68646371  0.13855752  0.23801489 12.5542247 ]


# Reading and Writing Text Files

In [34]:
import pandas as pd
rows = 5000
a = np.random.standard_normal((rows, 5)).round(4)
a
t = pd.date_range(start='2019/1/1', periods=rows, freq='H')
t

csv_file = open(path + 'data.csv', 'w')
header = 'date,no1,no2,no3,no4,no5\n'
csv_file.write(header)

for t_, (no1,no2,no3,no4,no5) in zip(t, a):
    s = '{},{},{},{},{}\n'.format(t_, no1,no2,no3,no4,no5)
    csv_file.write(s)

csv_file.close()

  t = pd.date_range(start='2019/1/1', periods=rows, freq='H')


In [39]:
csv_file = open(path + 'data.csv', 'r')
for i in range(5):
    print(csv_file.readline(), end='')
csv_file.close()
csv_file = open(path + 'data.csv', 'r')
content = csv_file.readlines()
content[:5]
csv_file.close()

date,no1,no2,no3,no4,no5
2019-01-01 00:00:00,-0.0564,-0.7167,0.5548,1.2196
2019-01-01 01:00:00,-0.1468,-1.2097,-0.0862,0.433
2019-01-01 02:00:00,1.0664,0.0166,-0.1262,1.8399
2019-01-01 03:00:00,0.361,0.2379,0.608,-0.3203


In [41]:
import csv

with open(path + 'data.csv', 'r') as f:
    csv_reader = csv.reader(f)
    lines = [line for line in csv_reader]

lines[:5]



In [43]:
with open(path + 'data.csv', 'r') as f:
    csv_reader = csv.DictReader(f)
    lines = [line for line in csv_reader]

lines[:3]

[{'date': '2019-01-01 00:00:00',
  'no1': '-0.0564',
  'no2': '-0.7167',
  'no3': '0.5548',
  'no4': '1.2196',
  'no5': None},
 {'date': '2019-01-01 01:00:00',
  'no1': '-0.1468',
  'no2': '-1.2097',
  'no3': '-0.0862',
  'no4': '0.433',
  'no5': None},
 {'date': '2019-01-01 02:00:00',
  'no1': '1.0664',
  'no2': '0.0166',
  'no3': '-0.1262',
  'no4': '1.8399',
  'no5': None}]

# Working with SQL DBs

In [45]:
import sqlite3 as sq3

con = sq3.connect(path + 'numbs.db')
query = 'CREATE TABLE numbs (Date date, No1 real, No2 real)'

con.execute(query)



OperationalError: table numbs already exists

In [47]:
con.commit()

q = con.execute
q('SELECt * FROM sqlite_master').fetchall()

[('table',
  'numbs',
  'numbs',
  2,
  'CREATE TABLE numbs (Date date, No1 real, No2 real)')]

In [48]:
import datetime
now = datetime.datetime.now()
q('INSERT INTO numbs VALUES(?, ?, ?)', (now, 0.12, 7.3))

<sqlite3.Cursor at 0x10d8c4c00>

In [51]:
np.random.seed(100)
data = np.random.standard_normal((10000, 2)).round(4)

In [52]:
%%time
for row in data:
    now = datetime.datetime.now()
    q('INSERT INTO numbs VALUES(?, ?,?)', (now, row[0], row[1]))
con.commit()


CPU times: user 44.8 ms, sys: 3.76 ms, total: 48.6 ms
Wall time: 50.1 ms


In [53]:
q('SELECt * FROM numbs').fetchmany(4)

[('2024-08-19 11:19:23.068499', 0.12, 7.3),
 ('2024-08-19 11:21:29.471758', -1.7498, 0.3427),
 ('2024-08-19 11:21:29.472364', 1.153, -0.2524),
 ('2024-08-19 11:21:29.472448', 0.9813, 0.5142)]