In [32]:
import pathlib
import yaml
import json
import os
import shutil
import io

# display the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# pathlib  
* pathlib largely replaces both os.path and glob (has wrappers for a lot of their methods)
* should be used for path related code in most cases (except shutil high level file operations, os.walk, os.PathLike, yaml, json)

In [3]:
# home directory of the current user, returned as a Path object
pathlib.Path.home()

# current working directory, returned as a Path object
pathlib.Path.cwd()

PosixPath('/home/vlad')

PosixPath('/home/vlad/projects/notebooks')

In [5]:
# the / operator joins path segments; .parent drops the last component
sample_path = pathlib.Path.home() / 'dir' / 'filename.txt'
sample_path.parent

PosixPath('/home/vlad/dir')

In [6]:
# Derive the projects directory from the working directory instead of
# hardcoding an absolute, machine-specific path: the notebook is run from
# <projects>/notebooks, so the parent of the cwd is the projects directory.
project_path = pathlib.Path.cwd().parent
project_path

PosixPath('/home/vlad/projects')

In [7]:
# two equivalent ways to join path segments: the / operator and .joinpath()
project_path / 'notebooks' / 'src'
project_path.joinpath('notebooks', 'src')

PosixPath('/home/vlad/projects/notebooks/src')

PosixPath('/home/vlad/projects/notebooks/src')

In [9]:
# Path.open() can be used in place of the built-in open() function

with project_path.joinpath('notebooks', 'pyproject.toml').open(mode='r') as fid:
    # fid is the open file handle; read from it here
    pass

In [10]:
# one-shot helpers: .read_text(), .read_bytes(), .write_text(), .write_bytes()
project_path.joinpath('notebooks', 'pyproject.toml').read_text()

'[tool.poetry]\nname = "notebooks"\nversion = "0.1.0"\ndescription = ""\nauthors = ["Your Name <you@example.com>"]\n\n[tool.poetry.dependencies]\npython = "^3.8"\nnumpy = "*"\npyyaml = "^5.3.1"\nrequests = "^2.24.0"\n\n[tool.poetry.dev-dependencies]\nipython = "*"\njupyter = "*"\nmatplotlib = "*"\npytest = "*" \nblack = {version = "^19.10b0", allow-prereleases = true}\nrope = "^0.17.0"\n\n[build-system]\nrequires = ["poetry>=0.12"]\nbuild-backend = "poetry.masonry.api"\n'

In [11]:
# turn a relative path into an absolute one
pathlib.Path('pathlib_src.py').resolve()

PosixPath('/home/vlad/projects/notebooks/pathlib_src.py')

In [12]:
# path components

file_path = project_path.joinpath('notebooks', 'pyproject.toml')
file_path.name  # final component, including the extension
file_path.parent  # containing directory
file_path.stem  # final component without its suffix
file_path.suffix  # file extension, including the leading dot

file_path.parents[2]  # ancestor three levels up (parents[0] is the same as .parent)

'pyproject.toml'

PosixPath('/home/vlad/projects/notebooks')

'pyproject'

'.toml'

PosixPath('/home/vlad')

In [13]:
# check whether the path exists on disk
file_path.exists()

True

In [19]:
# start from a clean state so that the cell can be re-run
tmp_dir = pathlib.Path.cwd().joinpath('tmp_dir')
if tmp_dir.exists():
    shutil.rmtree(tmp_dir)

tmp_dir.mkdir()
try:
    # a second mkdir() on an existing directory raises FileExistsError
    tmp_dir.mkdir()
except FileExistsError:
    # catch only the expected error (not a bare except) so that unrelated
    # failures, e.g. permission problems, are not silently swallowed
    'Folder already exists, need exist_ok=True'

'Folder already exists, need exist_ok=True'

In [20]:
# make sure the scratch directory is there, then point at a file inside it
tmp_dir.mkdir(exist_ok=True)
file_path = tmp_dir / 'tmp.txt'

# write through an explicit handle; write() returns the number of characters written
with file_path.open('w') as f:
    f.write('some line')

# read the content back in a single call
file_path.read_text()


9

'some line'

In [21]:

# file_path now points at tmp.txt inside tmp_dir; confirm it is on disk
file_path.exists()


True

In [22]:
# Safer way to move a file: create the destination in exclusive mode ('xb'),
# which raises FileExistsError instead of overwriting an existing file.
file_copy_path = file_path.parent / 'tmp2.txt'

with file_copy_path.open(mode='xb') as fid:
    fid.write(file_path.read_bytes())
# This only copied the bytes: the original file_path is still on disk and has
# to be removed (file_path.unlink()) to complete the move.

file_copy_path.read_text()

# Less safe move: another process may create the destination between the
# exists() check and replace(), and replace() would then overwrite it.
# NOTE(review): tmp2.txt was created just above, so this branch is skipped here.
if not file_copy_path.exists():
    file_path.replace(file_copy_path)


9

'some line'

In [23]:

# build new Path objects derived from an existing one
file_path.with_name('another_file.txt')  # same dir but another file
file_path.with_suffix('.py')  # same file name but another suffix


PosixPath('/home/vlad/projects/notebooks/tmp_dir/another_file.txt')

PosixPath('/home/vlad/projects/notebooks/tmp_dir/tmp.py')

In [24]:
# list every entry of the working directory with .iterdir()
cwd = pathlib.Path.cwd()
for entry in cwd.iterdir():
    print(entry)

# same listing, keeping only the entries that are directories
print('\n Only directories')
for entry in (child for child in cwd.iterdir() if child.is_dir()):
    print(entry)

# .glob() matches names in the directory against a shell-style pattern
print('\n File type pattern')
for entry in cwd.glob('*.to*'):
    print(entry)

# .rglob() applies the pattern to the directory and all of its subdirectories
for entry in cwd.rglob("*.ipynb"):
    print(entry)


/home/vlad/projects/notebooks/pyproject.toml
/home/vlad/projects/notebooks/src
/home/vlad/projects/notebooks/.vscode
/home/vlad/projects/notebooks/notebooks
/home/vlad/projects/notebooks/.env
/home/vlad/projects/notebooks/README.md
/home/vlad/projects/notebooks/.gitignore
/home/vlad/projects/notebooks/tmp_dir
/home/vlad/projects/notebooks/poetry.lock
/home/vlad/projects/notebooks/.git

 Only directories
/home/vlad/projects/notebooks/src
/home/vlad/projects/notebooks/.vscode
/home/vlad/projects/notebooks/notebooks
/home/vlad/projects/notebooks/.env
/home/vlad/projects/notebooks/tmp_dir
/home/vlad/projects/notebooks/.git

 File type pattern
/home/vlad/projects/notebooks/pyproject.toml
/home/vlad/projects/notebooks/notebooks/pathlib.ipynb
/home/vlad/projects/notebooks/notebooks/files.ipynb
/home/vlad/projects/notebooks/.env/lib/python3.8/site-packages/matplotlib/backends/web_backend/nbagg_uat.ipynb
/home/vlad/projects/notebooks/.env/lib/python3.8/site-packages/nbconvert/preprocessors/test

In [25]:
# remove the file, then confirm that it is gone
file_path.unlink()
file_path.exists()


False

In [41]:
# resource directory used by the file examples; defined here so that this cell
# does not depend on a cell that appears further down in the notebook
resource_dir = pathlib.Path.cwd() / 'src' / 'pathlib_rsrc'

# .glob("*") lists the direct children of the directory
for f in resource_dir.glob("*"):
    print(f)

# .rglob() searches the directory and all of its subdirectories
for f in resource_dir.rglob('text_*.txt'):
    print(f)

/home/vlad/projects/notebooks/src/pathlib_rsrc/test_yaml.yml
/home/vlad/projects/notebooks/src/pathlib_rsrc/binary_file2
/home/vlad/projects/notebooks/src/pathlib_rsrc/new.zip
/home/vlad/projects/notebooks/src/pathlib_rsrc/text_file.txt
/home/vlad/projects/notebooks/src/pathlib_rsrc/sample.json
/home/vlad/projects/notebooks/src/pathlib_rsrc/binary_file
/home/vlad/projects/notebooks/src/pathlib_rsrc/nested_dir
/home/vlad/projects/notebooks/src/pathlib_rsrc/zip_extract
/home/vlad/projects/notebooks/src/pathlib_rsrc/level1
/home/vlad/projects/notebooks/src/pathlib_rsrc/text_file.txt
/home/vlad/projects/notebooks/src/pathlib_rsrc/nested_dir/text_file_copy.txt
/home/vlad/projects/notebooks/src/pathlib_rsrc/zip_extract/text_file.txt
/home/vlad/projects/notebooks/src/pathlib_rsrc/zip_extract/home/vlad/projects/notebooks/src/pathlib_rsrc/text_file.txt


In [42]:
# file statistics (called on a directory here); the result is an os.stat_result
resource_dir.stat()

os.stat_result(st_mode=16893, st_ino=955704, st_dev=2049, st_nlink=5, st_uid=1000, st_gid=1000, st_size=4096, st_atime=1595968645, st_mtime=1595968441, st_ctime=1595968441)

# useful path tools from os
(for most cases pathlib should be used instead of os)

In [48]:
import os

# os.walk yields one (dirpath, dirnames, filenames) tuple per visited directory
for dirpath, dirnames, filenames in os.walk(tmp_dir):
    print((dirpath, dirnames, filenames))

# pathlib paths implement the os.PathLike protocol
isinstance(file_copy_path, os.PathLike)

('/home/vlad/projects/notebooks/tmp_dir', [], ['tmp2.txt'])


True

# python file buffer support

* below python standard file read/write functionality
* similar to C++ : can read/write bytes in binary files or symbols and lines in text files
* file handler has a pointer at some place of the file (can be at the start, middle, end)
* .seek(..) will change the pointer position

 ## Binary and text files
 * python handles 2 types of files: binary and text
 * most of the files that require specific handling are binary : like .doc, .xls, .pdf, .png, .sqlite
 * a text file contains only encoded characters, with no format-specific structure (it can be opened with a text editor)
 * open(filename, mode) function (context manager):
   modes: 
   - 'r' 
   - 'w' (erases the existing file) 
   - 'x' (creating a file - will fail if already exists) 
   - 'a' (append, pointer at the end)  
   - 'r+' (read and change, pointer at the start)
 * fileobj.read(size) - size in bytes, default - full file
 * fileobj.name
 * for text files: fileobj.readline(), fileobj.readlines()
 * for line in fileobj:
       print(line)

* both text and binary files are sequences of bytes; the bytes of a text file are simply encoded characters (e.g. UTF-8), while the bytes of a binary file follow format-specific rules.

* in Python a bytes literal is written by putting ```b``` in front of a string literal, e.g. ```b'some bytes'```; use ```str.encode()``` / ```bytes.decode()``` to convert between str and bytes

In [30]:
resource_dir = pathlib.Path.cwd() / 'src' / 'pathlib_rsrc'
# create the resource directory when it is missing, so that the cell also runs
# from a fresh checkout (open(..., mode='x') does not create parent directories)
resource_dir.mkdir(parents=True, exist_ok=True)
text_file = resource_dir.joinpath('text_file.txt')

# remove a leftover file from a previous run; missing_ok avoids the
# check-then-delete race of `if exists(): unlink()`
text_file.unlink(missing_ok=True)

# creating text file (python built-in tools); mode 'x' fails if the file already exists
with open(text_file, mode='x') as f:
    f.write(
        """When a file operation fails for an I/O-related reason, 
    the exception IOError is raised. This includes situations where
     the operation is not defined for some reason, like seek() on a
      tty device or writing a file opened for reading."""
    )

# the with-block is the same as running f = open(...); ...; f.close()


# read the whole file; plain 'r' is enough because nothing is written here
with open(text_file, 'r') as f:
    f.read()

# read line by line: readline() and next() both advance the file pointer
with open(text_file, 'r') as f:
    f.readline()
    next(f)

# read all lines into a list
with open(text_file, 'r') as f:
    f.readlines()

# a file object is an iterator over its lines
with open(text_file, 'r') as f:
    for i, line in enumerate(f):
        print(f'line number {i}', line)

# 'r+' allows reading and writing through the same handle
with open(text_file, 'r+') as f:
    f.seek(10)  # move the file pointer to offset 10
    print(f.read())  # reads from offset 10 up to the end of the file
    f.tell()  # the pointer is now at the end of the file ...
    f.write(" <extra string> ")  # ... so this write appends

# show the final content, including the appended string
with open(text_file, 'r') as f:
    print(f.read())

246

'When a file operation fails for an I/O-related reason, \n    the exception IOError is raised. This includes situations where\n     the operation is not defined for some reason, like seek() on a\n      tty device or writing a file opened for reading.'

'When a file operation fails for an I/O-related reason, \n'

'    the exception IOError is raised. This includes situations where\n'

['When a file operation fails for an I/O-related reason, \n',
 '    the exception IOError is raised. This includes situations where\n',
 '     the operation is not defined for some reason, like seek() on a\n',
 '      tty device or writing a file opened for reading.']

line number 0 When a file operation fails for an I/O-related reason, 

line number 1     the exception IOError is raised. This includes situations where

line number 2      the operation is not defined for some reason, like seek() on a

line number 3       tty device or writing a file opened for reading.


10

e operation fails for an I/O-related reason, 
    the exception IOError is raised. This includes situations where
     the operation is not defined for some reason, like seek() on a
      tty device or writing a file opened for reading.


246

16

When a file operation fails for an I/O-related reason, 
    the exception IOError is raised. This includes situations where
     the operation is not defined for some reason, like seek() on a
      tty device or writing a file opened for reading. <extra string> 


pathlib has aliases for standard python file operations (like .open() etc)

* using open() is preferred when we need to check the file existence and avoid a race condition
* when we use ```if file.exists()``` and then do something, another thread or process may delete or write the file in between the *if* and the action
* when we use open(.., mode='x') (or 'xb') the call fails with FileExistsError if the file already exists; if it doesn't exist, the file is created atomically as part of the same operation (so another process can't slip in between the check and the creation, and a process using the same technique will see that the file already exists and won't overwrite it with a new one)

## python io library

* open(..) used with a file path argument returns a file handle that has a .read(..) method (i.e. it is a file-like object)
* file handles are not the only file-like objects
* the io library contains StringIO, BytesIO, TextIOWrapper - those objects are file-like and can be passed to many readers instead of a file

In [33]:
# read bytes from binary file
# NOTE(review): 'binary_file' has to exist already; it is written by the
# "modes for binary files" cell further down in the notebook
with open(resource_dir / 'binary_file', 'rb') as f:
    isinstance(f, io.BufferedIOBase)
    bytes_ = f.read()

# create an in-memory binary buffer initialised with the bytes read above
binary_buffer = io.BytesIO(bytes_)
# .. can perform r/w operations on the buffer here

# write it to another file; getvalue() returns the whole buffer content as bytes
with open(resource_dir / 'binary_file2', 'wb') as f:
    f.write(binary_buffer.getvalue())

binary_buffer.close()  # discard buffer memory


# check the copy by reading it back
resource_dir.joinpath('binary_file2').read_bytes()

# using StringIO: an in-memory text buffer with the same interface as a text file
output = io.StringIO()
output.write("sample string")
output.seek(0)  # rewind to the start, otherwise read() would return ''
output.read()
output.close()

# BytesIO holding the utf-8 encoded bytes of a non-ASCII string
buff = io.BytesIO('àéè'.encode('utf-8'))

buff.getvalue()

# TextIOWrapper decodes the underlying binary buffer into str on the fly
text_wrapper = io.TextIOWrapper(buff, encoding='utf-8')
text_wrapper.seek(0)
text_wrapper.read()

True

24

b'some binary stringf\x04\x0cf\x04\x0c'

13

0

'sample string'

b'\xc3\xa0\xc3\xa9\xc3\xa8'

0

'àéè'

In [34]:
# python encode/decode between str and bytes (using utf-8)
'àéè'.encode('utf-8')  # str -> bytes
b'\xc3\xa0\xc3\xa9\xc3\xa8'.decode('utf-8')  # bytes -> str

b'\xc3\xa0\xc3\xa9\xc3\xa8'

'àéè'

In [36]:
# binary counterparts of the text modes: 'xb', 'rb', 'wb'

binary_file = resource_dir / 'binary_file'

# binary mode takes bytes; write() returns the number of bytes written
with open(binary_file, mode='wb') as f:
    f.write(b"some binary string")


18

In [37]:
# read the whole file back as bytes
binary_file.read_bytes()

b'some binary string'

 full description here https://docs.python.org/3/library/io.html

In [38]:
# 'rb+' opens the binary file for both reading and writing
with open(binary_file, 'rb+') as f:
    print(f.readline())
    # to write array to a binary file - use bytearray
    byte_array = bytearray([102, 4, 12])
    # the read above left the file position at the end, so this write appends
    f.write(byte_array)

# 102 is ASCII 'f'; the other two bytes are shown as \x04 and \x0c
binary_file.read_bytes()

b'some binary string'


3

b'some binary stringf\x04\x0c'

# shutil
use shutil (https://docs.python.org/3/library/shutil.html) for high level file operations (copy, remove, copytree, archive etc) - no support for this in pathlib for now

 **shutil methods**:
 * .copy(source, dest)
 * .copy2(..)  - preserve file metadata (like creation time)
 * .move(..)
 * copytree
 * rename (provided by ```os.rename``` / ```Path.rename```, not by shutil)
 * copyfileobj - copy file objects - can be used together with two ```open``` calls for overwrite safety
 (need to check if the destination file exists, since shutil will overwrite it)
 * copy / copy2 have the arg ```follow_symlinks``` to decide whether to copy the file a symlink points to (if true) or the symlink itself
 * path.open(..) with mode 'x' is very useful to claim the destination file atomically and avoid a race condition
 (because the race condition can happen between any existence check and the next line)

In [None]:
import shutil

# copy the text file into the nested directory; copy() returns the destination path
copy_source = resource_dir / 'text_file.txt'
copy_target = resource_dir / 'nested_dir/text_file_copy.txt'
shutil.copy(copy_source, copy_target)


# zipfile

In [47]:

import zipfile
import io

zip_path = resource_dir / 'new.zip'

# start from scratch so that repeated runs do not pile up duplicate members
if zip_path.exists():
    zip_path.unlink()

# create zip archive from a list of files
# NOTE: write() on a directory ('level1/') stores only the directory entry,
# its contents are not added recursively
file_list = ['text_file.txt', 'level1/']
with zipfile.ZipFile(zip_path, 'w') as new_zip:
    for name in file_list:
        # second arg - arcname: the name stored in the archive (the full path won't be stored)
        new_zip.write(resource_dir / name, name)

# append one more file to the existing archive (mode 'a'), then list its members
with zipfile.ZipFile(zip_path, 'a') as new_zip:
    new_zip.write(resource_dir / 'binary_file', 'binary_file')
    # infolist() is the documented accessor for the ZipInfo entries
    [info.filename for info in new_zip.infolist()]
    new_zip.namelist()

# extract zip
with zipfile.ZipFile(zip_path, 'r') as new_zip:
    new_zip.extractall(path=resource_dir / 'zip_extract/')

# read from file inside a zip archive (w/o extraction)
with zipfile.ZipFile(zip_path, 'r') as new_zip:
    with new_zip.open('binary_file') as f:
        f.read()
        isinstance(f, io.IOBase)


['text_file.txt', 'level1/', 'binary_file']

['text_file.txt', 'level1/', 'binary_file']

b'some binary stringf\x04\x0c'

True

# YAML files
mostly used offline for config files

In [2]:
# yaml file example (combinations of dicts and lists);
# the trailing ';' keeps the notebook from echoing the string as cell output

"""
-  martin:
    name: Martin D'vloper
    job: Developer
    skills:
      - python
      - perl
      - pascal
-  tabitha:
    name: Tabitha Bitumen
    job: Developer
    skills:
      - lisp
      - fortran
      - erlang
""";

In [43]:
# working with yaml files
import yaml

dct = {'first': [{'a': 5, 'b': 4}, {'x': 10, 'y': 12}], 'second': 5}

# serialize the dict to a YAML file
with open(resource_dir / 'test_yaml.yml', 'w') as f:
    yaml.dump(dct, f)


# safe_load only builds plain Python objects (dict, list, str, int, ...) and
# never instantiates arbitrary classes, so it is the right default for
# config-style files; the data written above round-trips unchanged
with open(resource_dir / 'test_yaml.yml', 'r') as f:
    data = yaml.safe_load(f)
    type(data)
    print(data)

dict

{'first': [{'a': 5, 'b': 4}, {'x': 10, 'y': 12}], 'second': 5}


# JSON files
used for data transfer, e.g. by http APIs, cf requests

In [45]:
# working with JSON
import json

dct = {
    "firstName": "Jane",
    "lastName": "Doe",
    "hobbies": ["running", "sky diving", "singing"],
    "age": 35,
    "children": [
        {"firstName": "Alice", "age": 6},
        {"firstName": "Bob", "age": 8},
    ],
}

# dumps() serializes to a str; dump() below writes to a file object instead
json.dumps(dct)

# serialization - the process of encoding an object as json
# (python pickle uses a python-specific format instead, which is not json)
json_path = resource_dir / 'sample.json'

with json_path.open('w') as f:
    json.dump(dct, f)

# load() parses the file content back into python objects
with json_path.open('r') as f:
    output = json.load(f)
    type(output)
    print(output)


'{"firstName": "Jane", "lastName": "Doe", "hobbies": ["running", "sky diving", "singing"], "age": 35, "children": [{"firstName": "Alice", "age": 6}, {"firstName": "Bob", "age": 8}]}'

dict

{'firstName': 'Jane', 'lastName': 'Doe', 'hobbies': ['running', 'sky diving', 'singing'], 'age': 35, 'children': [{'firstName': 'Alice', 'age': 6}, {'firstName': 'Bob', 'age': 8}]}


In [46]:
import requests


# always pass a timeout: without one, requests can wait forever on a stalled server
response = requests.get("https://jsonplaceholder.typicode.com/todos", timeout=10)
# fail loudly on 4xx/5xx responses instead of trying to parse an error page as JSON
response.raise_for_status()
todos = json.loads(response.text)

todos = response.json()  # same result


# some simple python objects are naturally json-serializable (like e.g. tuple, list, dict)
json.dumps((5, 4))


# set
set_ = {1, 2, 3}  # not serializable

# custom json encoder
def set_json_encoder(set_):
    """Fallback for ``json.dumps(..., default=...)`` that serializes sets as lists.

    Args:
        set_: the object that the json module could not serialize on its own.

    Returns:
        list: the elements of ``set_`` when it is a ``set`` or ``frozenset``.

    Raises:
        TypeError: for any other type, so that unsupported objects are reported
            instead of being silently converted (``list(b'ab')`` would
            otherwise be encoded as ``[97, 98]``).
    """
    if isinstance(set_, (set, frozenset)):
        return list(set_)
    raise TypeError(f'Object of type {type(set_).__name__} is not JSON serializable')


# default= is the hook json calls for objects it cannot serialize by itself
json.dumps(set_, default=set_json_encoder)


'[5, 4]'

'[1, 2, 3]'