# Reading and Writing Text Files

In [None]:
from IPython.display import YouTubeVideo

# Embed the YouTube video with this id; as the last expression of the cell
# the returned object is rendered as the cell's output.
YouTubeVideo("5YVmUUrI04g", height=600)

In [None]:
import os


In [None]:
# Location of the sample data: the DATA/Misc folder under the user's home.
home_dir = os.path.expanduser("~")
DATADIR = os.path.join(home_dir, "DATA", "Misc")
# Last expression of the cell: reports whether that path exists.
os.path.exists(DATADIR)

# File Input/Output

* Ultimately you need to be able to 
    * get something into your program from a "file" system
    * get something out of your program into a file system
* File Input/Output is very easy in Python
    * First create a file object with the **[open()](https://docs.python.org/3/library/functions.html#open)** function:
	* **fileObject = open('./thisfile.txt','r')**
        * First argument: what file to open
        * Second argument: what mode to open the file with 
            * 'r'---read (default)
			* 'a'---append
			* 'w'---write
			* 'rb'---read binary, etc.

### Use `os` library to list contents of current directory

In [None]:
# Collect the names of every entry in the current working directory.
cwd = os.getcwd()
directoryFiles = os.listdir(cwd)
# Last expression of the cell, so the list is shown as the cell output.
directoryFiles

### Create a file object to write to

In [None]:
fileName = "mytestFile4.txt"
# Opening in "w" mode creates (or truncates) the file right away, so it
# already shows up in the directory listing taken inside the block.
# The with-statement closes the handle even if the listing raises.
with open(fileName, "w") as file0:
    directoryFiles = os.listdir(os.getcwd())
print(directoryFiles)

## How to use file objects
* What do you do with the object once you've opened it?
    * **read([NUMBYTES])**---read() takes as an optional argument the number of bytes you want to read from the file; the default is to read the entire file
	* **readline()**---read the next line 
        * *NOTE*: relies on correctly identifying linebreaks
	* **readlines()**---read all the lines into a list
        * **readline()** and **readlines()** often fail for me.
	* **write()**---write a string to file
	* **writelines()** ---write an iterable object to file.
    
    * "For reading lines from a file, you can loop over the file object."
    

## Example

## Open a file and read the content
### In Python 3.x all strings are unicode


In [None]:
# Read the whole file into one string. The with-statement closes the file
# even if read() or print() raises; f stays bound to the closed file object.
with open(os.path.join(DATADIR, "icd9-short.txt"), "r") as f:
    data = f.read()
print(len(data), type(data))
print(data)

### We can specify encodings when we open the file

In [None]:
# Open the same file again, this time naming the text encoding explicitly.
# f is left open here on purpose: the cells that follow keep reading from it.
icd9_path = os.path.join(DATADIR, "icd9-short.txt")
f = open(icd9_path, mode="r", encoding="ISO-8859-1")
data = f.read()
print(len(data), type(data))
print(data)


## Now let's read in the data by lines

In [None]:
# f has already been read to the end by the preceding f.read(), so
# readlines() starts from the current position in the file.
lines = f.readlines()
# Show how many lines were returned and the type of the result.
print(len(lines),type(lines))

## What Happened?
As we read data, our file object (``f``) moves through the file. After we read all the data, ``f`` is sitting at the end of the file (that is how it knew to stop).

### We can move our file object back and forth in the file with ``seek``

In [None]:

# Rewind to the start of the file so the next read sees every line again.
f.seek(0)
lines = f.readlines()
print(type(lines))
print(len(lines))




#### Sometimes Unix/Linux machines have a hard time recognizing line breaks from Windows

In [None]:
# Split the first element of lines on carriage returns ("\r").
# NOTE: this rebinds `lines` from its own previous value, so running the
# cell a second time operates on the already-split result.
lines = lines[0].split("\r")
print (len(lines))

In [None]:
# Rewind to the beginning and read the entire file into a single value.
f.seek(0)
data = f.read()
# Last expression of the cell: shows the type returned by read().
type(data)

In [None]:
# Done with the manually opened file object: release it explicitly.
f.close()

## Using the [**with**](https://docs.python.org/3/reference/compound_stmts.html#the-with-statement) statement

* With the **with** statement I can
    * open a file
    * iterate over it
    * automatically close it when done

In [None]:
icd9_path = os.path.join(DATADIR, "icd9-short.txt")
# The file is open only inside the with-block.
with open(icd9_path, mode="r") as f0:
    read_data = f0.readlines()
print(len(read_data))
# Closed state of the with-managed file, then of the earlier file object f.
print(f0.closed)
print(f.closed)

## Python comes with a csv file format reader
* There is a drop-in replacement, `unicodecsv` (`pip install unicodecsv`)

#### Example: Read in icd9-short.txt. Keep line if it is in the top 100 diagnoses

1. Use the ``with open(...) as ...`` syntax
1. Read the first twenty lines as header information
1. The third column indicates whether it is a top 100 diagnosis
1. Just for show, we will use the [tabulate](https://pypi.python.org/pypi/tabulate) package to format our data into an HTML table and use the IPython display functionality to render the HTML

In [None]:
!pip install tabulate

In [None]:
from tabulate import tabulate
from IPython.display import display, HTML

In [None]:
# A small tab-separated excerpt: one record per line, split into its fields.
# The final field ends with an escaped quote (\") so that the closing triple
# quote does not swallow the field's own closing double quote.
demo = [row.split("\t") for row in """I. Infectious & Parasitic Diseases			995.91	Sepsis
I. Infectious & Parasitic Diseases			038.9	"Septicemia, NOS"
I. Infectious & Parasitic Diseases			005.0	Staphylococcal food poisoning
I. Infectious & Parasitic Diseases		Yes	034.0	Strep throat
I. Infectious & Parasitic Diseases			097.9	"Syphilis, unspec."
I. Infectious & Parasitic Diseases			131.9	"Trichomoniasis, unspec."
I. Infectious & Parasitic Diseases			011.90	"Tuberculosis, pulmonary, NOS"
I. Infectious & Parasitic Diseases			099.9	"Venereal disease, unspec."
I. Infectious & Parasitic Diseases		Yes	079.99	"Viral infection, unspec."
I. Infectious & Parasitic Diseases			078.11	"Warts, condyloma"
I. Infectious & Parasitic Diseases		Yes	078.10	"Warts, viral, unspec.\"""".split("\n")]

# Render the rows as an HTML table in the cell output.
display(HTML(tabulate(demo, tablefmt="html")))

In [None]:
import csv
from itertools import islice

# newline="" hands line-ending handling to the csv module, as the csv
# documentation recommends for files passed to csv.reader.
with open(os.path.join(DATADIR, "icd9-short.txt"), "r", newline="") as f:
    reader = csv.reader(f, delimiter="\t")
    # The first twenty rows are header information. islice simply stops if
    # the file is shorter, instead of raising StopIteration.
    header = list(islice(reader, 20))
    # Example of list comprehension: keep rows whose third column is "Yes".
    # The length check skips blank or short rows that have no third column.
    data = [row for row in reader if len(row) > 2 and row[2] == "Yes"]

print(len(data))
# Show the non-empty header rows, then the selected rows, as HTML tables.
display(HTML(tabulate([h for h in header if h], tablefmt="html")))
display(HTML(tabulate(data, tablefmt="html")))
data[:3]

# Writing Data to Files
* file objects write out *STRINGS*. 
* Thus any other object that I want to write to file must first be converted to a string
* Note that **write()** does NOT add line breaks
* **writelines()** also does NOT add line breaks


In [None]:
# write() only accepts strings, so the number 5 is formatted into text first;
# passing the bare integer (fout.write(5)) is not possible.
pieces = [
    "This is the first line in my file.",
    " This will also be on my first line of the file because I didn't add a linebreak" + "\n",
    "%d\n" % 5,
    "%s\n" % 5,
]

with open("myTestFile.txt", "w") as fout:
    # write() adds no line breaks of its own: only the "\n" characters
    # contained in the pieces end a line.
    for piece in pieces:
        fout.write(piece)

# Read the file back in one go to see exactly what was written.
with open("myTestFile.txt", "r") as f:
    msg = f.read()
print(msg)


In [None]:
# Write each selected row as one tab-separated line. The with-statement
# closes the file even if a join or write fails part-way through.
with open("top100-icd9.txt", "w") as f:
    for d in data:
        f.write("\t".join(d) + "\n")
         

In [None]:
# Read the file back; iterating over a file object yields its lines,
# so list() collects them all (line endings included).
with open("top100-icd9.txt", mode="r") as f0:
    read_top_100 = list(f0)
print(len(read_top_100))

In [None]:
# Last expression of the cell: display the list of lines read back from the file.
read_top_100