# Files

# os module
- many low level operating system operations, including file status and manipulation
- [doc](https://docs.python.org/3/library/os.html#module-os)

# tempfile module
- will create a valid unique temporary pathname on any OS
- [doc](https://docs.python.org/3/library/tempfile.html)
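- `NamedTemporaryFile` actually creates a file, then deletes it when it is closed; the code below keeps only the name, so no file exists at that path afterwards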

In [1]:
import sys
import os
import tempfile
from pathlib import Path

# anaconda install dir: under ~/opt on macOS ('darwin'), directly under ~ elsewhere
anaconda = Path('~/opt/anaconda3') if sys.platform == 'darwin' else Path('~/anaconda3')
anaconda = anaconda.expanduser()

# NamedTemporaryFile creates a real file and deletes it on close.
# leaving the 'with' block closes it, so only the unique
# pathname survives - no file exists at tp or tp2 afterwards
with tempfile.NamedTemporaryFile() as tf:
    tp = Path(tf.name)
with tempfile.NamedTemporaryFile() as tf2:
    tp2 = Path(tf2.name)

# Path.exists() - True if the path exists
# (os.path.exists(path) is the string-based equivalent)

tp, tp2, tp.exists(), tp2.exists(), anaconda

(PosixPath('/var/folders/2z/vj69b89s1xxfb51stm_z4fnr0000gr/T/tmpv3_2gr5b'),
 PosixPath('/var/folders/2z/vj69b89s1xxfb51stm_z4fnr0000gr/T/tmp60sualdu'),
 False,
 False,
 PosixPath('/Users/dbenson30/opt/anaconda3'))

# Getting file status

In [2]:
# os.path.exists and os.access report
# file status without throwing errors
# os.stat throws an error if the path doesn't exist.

# this is similar to linux 'touch' command - 
# make an empty file

tp.touch()

# get info about file

def ac(p):
    """Report [exists, readable, writeable, executable] for path p.

    Each entry is the bool returned by os.access for one mode flag.
    """
    modes = (os.F_OK, os.R_OK, os.W_OK, os.X_OK)
    flags = []
    for mode in modes:
        flags.append(os.access(p, mode))
    return flags

ac(tp)


[True, True, True, False]

In [3]:
# gets several pieces of info in one call
# returns a named tuple

sr = os.stat(tp)
sr

os.stat_result(st_mode=33188, st_ino=9365791, st_dev=16777220, st_nlink=1, st_uid=504, st_gid=20, st_size=0, st_atime=1583266699, st_mtime=1583266699, st_ctime=1583266699)

In [4]:
# get attributes

sr.st_mode, sr.st_atime

(33188, 1583266699.7943497)

In [5]:
# does a path refer to a file or a directory?

tp.is_file(), tp.is_dir()

(True, False)

In [6]:
# removes a file, but raises error if it doesn't exist

tp.unlink()
ac(tp)

[False, False, False, False]

In [7]:
# file is gone

tp.exists()

False

In [8]:
# stat raises an error if the file doesn't exist

tp.stat()

FileNotFoundError: [Errno 2] No such file or directory: '/var/folders/2z/vj69b89s1xxfb51stm_z4fnr0000gr/T/tmpv3_2gr5b'

In [9]:
# Yields the files and dirs in a directory
# can use is_file and is_dir to figure out
# which is which

[(t.is_file(), t.is_dir(), t) for t in anaconda.iterdir() ]

[(False, True, PosixPath('/Users/dbenson30/opt/anaconda3/man')),
 (False, True, PosixPath('/Users/dbenson30/opt/anaconda3/conda-meta')),
 (False, True, PosixPath('/Users/dbenson30/opt/anaconda3/condabin')),
 (False, True, PosixPath('/Users/dbenson30/opt/anaconda3/ssl')),
 (True, False, PosixPath('/Users/dbenson30/opt/anaconda3/.DS_Store')),
 (False, True, PosixPath('/Users/dbenson30/opt/anaconda3/qml')),
 (False, True, PosixPath('/Users/dbenson30/opt/anaconda3/bin')),
 (False, True, PosixPath('/Users/dbenson30/opt/anaconda3/plugins')),
 (False, True, PosixPath('/Users/dbenson30/opt/anaconda3/shell')),
 (False, True, PosixPath('/Users/dbenson30/opt/anaconda3/libexec')),
 (False, True, PosixPath('/Users/dbenson30/opt/anaconda3/include')),
 (False, True, PosixPath('/Users/dbenson30/opt/anaconda3/resources')),
 (False, True, PosixPath('/Users/dbenson30/opt/anaconda3/sbin')),
 (False, True, PosixPath('/Users/dbenson30/opt/anaconda3/etc')),
 (False, True, PosixPath('/Users/dbenson30/opt/anac

In [10]:
# string version

fds = os.listdir(anaconda)
fds

['man',
 'conda-meta',
 'condabin',
 'ssl',
 '.DS_Store',
 'qml',
 'bin',
 'plugins',
 'shell',
 'libexec',
 'include',
 'resources',
 'sbin',
 'etc',
 'python.app',
 'org.freedesktop.dbus-session.plist',
 'pkgs',
 'lib',
 'doc',
 'mkspecs',
 'phrasebooks',
 'translations',
 'Anaconda-Navigator.app',
 'share']

# 'walk' - gets all the files and dirs under a start dir
- is a generator
- very easy to use
- each 'next' call yields a (dirpath, dirnames, filenames) tuple: a directory, plus the names of the directories and files inside it

In [11]:
# deeply nested list - use pretty printer

import pprint

e = anaconda / 'bin'
print(e)
g = os.walk(e)
ld = list(g)

pprint.pprint(ld, depth=2)

/Users/dbenson30/opt/anaconda3/bin
[('/Users/dbenson30/opt/anaconda3/bin', [...], [...]),
 ('/Users/dbenson30/opt/anaconda3/bin/Assistant.app', [...], []),
 ('/Users/dbenson30/opt/anaconda3/bin/Assistant.app/Contents', [...], [...]),
 ('/Users/dbenson30/opt/anaconda3/bin/Assistant.app/Contents/MacOS', [], [...]),
 ('/Users/dbenson30/opt/anaconda3/bin/Assistant.app/Contents/Resources',
  [],
  [...]),
 ('/Users/dbenson30/opt/anaconda3/bin/Designer.app', [...], []),
 ('/Users/dbenson30/opt/anaconda3/bin/Designer.app/Contents', [...], [...]),
 ('/Users/dbenson30/opt/anaconda3/bin/Designer.app/Contents/MacOS', [], [...]),
 ('/Users/dbenson30/opt/anaconda3/bin/Designer.app/Contents/Resources',
  [],
  [...]),
 ('/Users/dbenson30/opt/anaconda3/bin/__pycache__', [], [...]),
 ('/Users/dbenson30/opt/anaconda3/bin/pixeltool.app', [...], []),
 ('/Users/dbenson30/opt/anaconda3/bin/pixeltool.app/Contents', [...], [...]),
 ('/Users/dbenson30/opt/anaconda3/bin/pixeltool.app/Contents/MacOS', [], [...]

# open function
- used to open files for reading and writing
- returns a file descriptor that should be closed when IO is complete
- in the builtin namespace

# Writing files 
- no automatic newlines - you must write them out explicitly if you want them

In [12]:
# open file, write to file descriptor, close file descriptor
# can be error prone - easy to forget to close. also, if there
# is an error, the close call could be skipped
# not closing file descriptors can cause a server to crash
# 'w' is the 'open mode' - tells 'open' to 
# open the file for writing

fd = open(tp, 'w')
for e in ['one', 'two', 'three', 'four']:
    fd.write(e + '\n')
fd.close()

# with 
- 'with' is a 'context manager'
- binds return value from open to 'fd'
- ':' and indenting defines a statement block over which 'fd' will be bound
- 'with' will automatically close the file descriptor when the 'with' block is exited, even if by error

In [13]:
with open(tp, 'w') as fd:
    for e in ['one', 'two', 'three', 'four']:
        fd.write(e + '\n')

In [14]:
# could do one write with join
# why the '' ? 

with open(tp, 'w') as fd:
    fd.write('\n'.join(['one', 'two', \
                        'three', 'four','']))

In [15]:
# or write out the string with explicit newlines

with open(tp, 'w') as fd:
    fd.write("one\ntwo\nthree\nfour\n")

In [16]:
# before append

os.stat(tp)

os.stat_result(st_mode=33188, st_ino=9367097, st_dev=16777220, st_nlink=1, st_uid=504, st_gid=20, st_size=19, st_atime=1583270034, st_mtime=1583270033, st_ctime=1583270033)

In [17]:
# can append(open mode 'a') to an existing file

with open(tp, 'a') as f:
    for l in ['five', 'six']:
        f.write(l + '\n')

In [18]:
# file is longer now

os.stat(tp)

os.stat_result(st_mode=33188, st_ino=9367097, st_dev=16777220, st_nlink=1, st_uid=504, st_gid=20, st_size=28, st_atime=1583270065, st_mtime=1583270064, st_ctime=1583270064)

# special behavior on windows


In [19]:
with open(tp, 'w') as fd:
    for e in ['one', 'two', 'three', 'four']:
        fd.write(e + '\n')

In [20]:
# on windows, a \r will be inserted before each \n,
# because 'newline' in windows is '\r\n', two chars.
# so the windows file may be longer than you might expect

# reading in binary shows what is actually in the file
# on mac/linux there is no \r

with open(tp, 'rb') as fd:
    d = fd.read()
    
d

b'one\ntwo\nthree\nfour\n'

In [21]:
# when reading on windows, the \r chars are deleted

with open(tp, 'r') as fd:
    d = fd.read()
    
d

'one\ntwo\nthree\nfour\n'

# print function output can go to a file

In [22]:

with open(tp2, "w") as f:
    print(1,2,3,4, sep='\n', file=f)

# 'r' means open file for reading

with open(tp2, 'r') as f:
    print(f.read())

1
2
3
4



# Reading files - eager
- read the entire file immediately

In [23]:
# eager read - read the entire file into one string

with open(tp, 'r') as fd:    
    print(fd.read())

one
two
three
four



In [24]:
# eager read - get a list of all the lines 

with open(tp,'r') as fd:
    print(fd.readlines())

['one\n', 'two\n', 'three\n', 'four\n']


# Reading files - lazy
- suppose you are looking for a substring in a huge unsorted file of text lines
    - lazy read probably wins
    - don't have to read in entire file before you can start searching
    - don't have to allocate memory to hold the whole file
    - once you find the substring, you don't have to read the rest of the file

In [25]:
# read one line at a time

with open(tp, 'r') as fd:
    # readline returns an empty string when the file is finished;
    # iter(callable, sentinel) keeps calling readline until it does
    for line in iter(fd.readline, ''):
        print(line)

one

two

three

four



In [26]:
# note double spacing above:
# each line in the file already ends with a newline,
# and print adds another one
# the keyword arg 'end' turns off the print newline

with open(tp, 'r') as fd:
    # readline returns an empty string when the file is finished;
    # iter(callable, sentinel) keeps calling readline until it does
    for line in iter(fd.readline, ''):
        print(line, end='')

one
two
three
four


In [27]:
fd = open(tp, 'r')
fd

<_io.TextIOWrapper name='/var/folders/2z/vj69b89s1xxfb51stm_z4fnr0000gr/T/tmpv3_2gr5b' mode='r' encoding='UTF-8'>

In [28]:
# a file descriptor is an iterator 
# over the file lines

[fd, iter(fd), fd is iter(fd)]

[<_io.TextIOWrapper name='/var/folders/2z/vj69b89s1xxfb51stm_z4fnr0000gr/T/tmpv3_2gr5b' mode='r' encoding='UTF-8'>,
 <_io.TextIOWrapper name='/var/folders/2z/vj69b89s1xxfb51stm_z4fnr0000gr/T/tmpv3_2gr5b' mode='r' encoding='UTF-8'>,
 True]

In [29]:
next(fd)

'one\n'

In [30]:
# don't have to finish iterator...

next(fd)

'two\n'

In [31]:
# note with readline and readlines 
# each line has a trailing '\n', 
# which you usually don't want
# use strip() to remove
# can this cause a problem?

'one\n'.strip()

'one'

In [32]:
# read N chars at a time
# final read might be shorter

def readn(n):
    """Read the file at tp in pieces of at most n chars.

    Returns the pieces in order; the last element is always the
    empty string that read() gave back once the file was exhausted.
    """
    with open(tp, 'r') as f:
        # iter(callable, sentinel) stops as soon as read() returns ''
        pieces = list(iter(lambda: f.read(n), ''))
    # keep the terminating '' so the end-of-file read is visible
    pieces.append('')
    return pieces

In [33]:
readn(4)

['one\n', 'two\n', 'thre', 'e\nfo', 'ur\n', '']

In [34]:
readn(2)

['on', 'e\n', 'tw', 'o\n', 'th', 're', 'e\n', 'fo', 'ur', '\n', '']

In [35]:
readn(1)

['o',
 'n',
 'e',
 '\n',
 't',
 'w',
 'o',
 '\n',
 't',
 'h',
 'r',
 'e',
 'e',
 '\n',
 'f',
 'o',
 'u',
 'r',
 '\n',
 '']

In [36]:
# ... or can finish iterator later on

next(fd), next(fd), next(fd), next(fd)

StopIteration: 

# Can do I/O in unicode or binary
- 'open' defaults to 'str' (unicode)
- pass 'b' flag to 'open' for 'bytes'(binary)


In [37]:
uni = '\U00002119\u01b4\u2602\u210c\xf8\u1f24'

# the same text encoded three different ways
utf8 = uni.encode('utf-8')
utf16 = uni.encode('utf-16')
utf32 = uni.encode('utf-32')

uni, utf8, utf16, utf32

('ℙƴ☂ℌøἤ',
 b'\xe2\x84\x99\xc6\xb4\xe2\x98\x82\xe2\x84\x8c\xc3\xb8\xe1\xbc\xa4',
 b'\xff\xfe\x19!\xb4\x01\x02&\x0c!\xf8\x00$\x1f',
 b'\xff\xfe\x00\x00\x19!\x00\x00\xb4\x01\x00\x00\x02&\x00\x00\x0c!\x00\x00\xf8\x00\x00\x00$\x1f\x00\x00')

In [38]:
# won't work - file stream expects a
# 'str' by default, but utf32 is type 'bytes'

import tempfile

path = tempfile.NamedTemporaryFile().name

with open(path, "w") as f:
    f.write(utf32)

TypeError: write() argument must be str, not bytes

In [39]:
# make a binary stream by adding 'b' flag to 'open'

with open(path, 'bw') as f:
    f.write(utf32)

In [40]:
# reading in 'str' mode defaults to the platform's
# preferred encoding (utf-8 here),
# but the file we wrote is utf-32
# so, this read fails

# but, sometimes if you give open the
# wrong encoding, it will read
# w/o error and give you garbage!

with open(path, "r") as f:
    print(f.read())

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

In [41]:
# tell 'open' the right unicode encoding

with open(path, "r" , encoding='utf-32') as f:
    print(f.read())

ℙƴ☂ℌøἤ


In [42]:
# can read file bytes

with open(path, "rb") as f:
    b = f.read()
b

b'\xff\xfe\x00\x00\x19!\x00\x00\xb4\x01\x00\x00\x02&\x00\x00\x0c!\x00\x00\xf8\x00\x00\x00$\x1f\x00\x00'

In [43]:
utf32

b'\xff\xfe\x00\x00\x19!\x00\x00\xb4\x01\x00\x00\x02&\x00\x00\x0c!\x00\x00\xf8\x00\x00\x00$\x1f\x00\x00'

# Memory mapping
- advanced technique, but easy to use
- more efficient way to deal with large files
- avoids copying and system call overhead
- uses the virtual memory system
    - HIGHLY optimized with hardware support
- [doc](https://docs.python.org/3/library/mmap.html)

In [44]:
# write out some stuff 

tp = tempfile.NamedTemporaryFile().name
with open(tp, 'w') as fd:
    for e in ['one', 'two', 'three', 'four']:
        fd.write(e + '\n')

In [45]:
# read it with mmap

import mmap

# "r+" opens the existing file for both reading and writing
fp = open(tp, "r+")
# mmap works on the OS-level file number, not the Python file object
fn = fp.fileno()
# length 0 maps the whole file
# NOTE: the name 'map' shadows the builtin map() from here on
map = mmap.mmap(fn, 0)

In [46]:
map[:]

b'one\ntwo\nthree\nfour\n'

In [47]:
map.seek(0)
map.find(b'two')


4

# In memory "files"
- occasionally very useful 
- [doc](https://docs.python.org/3.7/library/io.html#io.StringIO)

In [48]:
import io

# an in-memory text buffer that can be written to like a file
ios = io.StringIO()

# print appends its usual trailing newline; write adds nothing extra
print('one', file=ios)
ios.write('two')

# everything written so far, as one string
ios.getvalue()

'one\ntwo'

In [49]:
ios = io.StringIO('one\ntwo\nthree\n')

ios.readlines()

['one\n', 'two\n', 'three\n']


# shutil module 
- move, copy, delete file trees
- [doc](https://docs.python.org/3.7/library/shutil.html)

# glob - linux style filename matching
- [doc](https://docs.python.org/3.7/library/glob.html)

# modules that read/write archive formats, like zip and tar
- [doc](https://docs.python.org/3.7/library/archiving.html)
