# Python Popular Utilities Modules

### 1 - File System - os, os.path, shutil, commands
* 1.1 **os**
  * filenames = os.listdir(dir) -- list of filenames in that directory path (not including . and ..). The filenames are just the names in the directory, not their absolute paths.
  * os.path.join(dir, filename) -- given a filename from the above list, use this to put the dir and filename together to make a path
  * os.path.abspath(path) -- given a path, return an absolute form, e.g. /home/nick/foo/bar.html
  * os.path.dirname(path), os.path.basename(path) -- given dir/foo/bar.html, return the dirname "dir/foo" and basename "bar.html"
  * os.path.exists(path) -- true if it exists
  * os.mkdir(dir_path) -- makes one dir, os.makedirs(dir_path) makes all the needed dirs in this path
* 1.2 **shutil.copy**(source-path, dest-path) -- copy a file (dest path directories should exist)
* 1.3 **commands** -- runs shell commands
* 1.4 **glob**
* 1.5 **sys**
* 1.6 **pickle**

### 2 -  HTTP - urlib, urlparse
* ufile = urllib.urlopen(url) -- returns a file like object for that url
* text = ufile.read() -- can read from it, like a file (readlines() etc. also work)
* info = ufile.info() -- the meta info for that request. info.gettype() is the mime time, e.g. 'text/html'
* baseurl = ufile.geturl() -- gets the "base" url for the request, which may be different from the original because of redirects
* urllib.urlretrieve(url, filename) -- downloads the url data to the given file path
* urlparse.urljoin(baseurl, url) -- given a url that may or may not be full, and the baseurl of the page it comes from, return a full url. Use geturl() above to provide the base url.

### 3 -  DATETIME - datetime module
* get system datetime

### 4 - argparse


----

# 1 - File System - Standard Library
> **`os`** module

> **`shutil`**: high-level file operations

> **`commands`** module

> **`glob`**: pattern matching on files

> **`sys`** module: system-specific information

> **`pickle`**: easy persistence

## 1.1 - `os` module

In [1]:
import os
# dir(os)

### # 1.1.1 directory and file manipulation

In [9]:
# list directory
os.listdir('../')

['.DS_Store', 'python_basic_notebook', 'python_data_analysis_notebook']

In [2]:
# current working directory
os.getcwd()

'/Users/DSinmotion/Dropbox/DataScience/Github/datascience/python/python_basics/python_basic_notebook'

In [3]:
# make a directory
os.mkdir('tmpdir')
'tmpdir' in os.listdir(os.curdir)

True

In [4]:
# rename the directory
os.rename('tmpdir', 'tmpdir1')
'tmpdir1' in os.listdir(os.curdir)

True

In [5]:
# delete a file/dir
os.rmdir('tmpdir1')
'tmpdir' in os.listdir(os.curdir)

False

In [7]:
# delete a file
fp = open('junk.txt', 'w')
fp.close()
print 'junk.txt' in os.listdir(os.curdir)

os.remove('junk.txt')
print 'junk.txt' in os.listdir(os.curdir)

True
False


In [15]:
# list directory as a list
import os

def List(dir):
    files = os.listdir(dir)
    print files
    
def main():
    List('../')
    
if __name__ == '__main__':
    main()

['.DS_Store', 'python_basic_notebook', 'python_data_analysis_notebook', 'training_material_original']


### # 1.1.2 `os.path` Path Manipulations

In [8]:
fp = open('junk.txt', 'w')
fp.close()

a = os.path.abspath('junk.txt')
a

'/Users/DSinmotion/Dropbox/DataScience/Github/datascience/python/python_basics/python_basic_notebook/junk.txt'

In [9]:
os.path.split(a)

('/Users/DSinmotion/Dropbox/DataScience/Github/datascience/python/python_basics/python_basic_notebook',
 'junk.txt')

In [10]:
os.path.dirname(a)

'/Users/DSinmotion/Dropbox/DataScience/Github/datascience/python/python_basics/python_basic_notebook'

In [11]:
os.path.basename(a)

'junk.txt'

In [12]:
os.path.splitext(os.path.basename(a))

('junk', '.txt')

In [13]:
os.path.exists('junk.txt')

True

In [14]:
os.path.isfile('junk.txt')

True

In [15]:
os.path.isdir('junk.txt')

False

In [17]:
os.path.join(os.path.dirname(a), 'ttt','kkk')

'/Users/DSinmotion/Dropbox/DataScience/Github/datascience/python/python_basics/python_basic_notebook/ttt/kkk'

#### filenames, absolute path, relative path 

In [18]:
## Example pulls filenames from a dir, prints their relative and absolute paths
def printdir(dir):
  filenames = os.listdir(dir)
  for filename in filenames:
    print filename  ## foo.txt
    print os.path.join(dir, filename) ## dir/foo.txt (relative to current dir)
    print os.path.abspath(os.path.join(dir, filename)) ## /home/nick/dir/foo.txt
    
def main():
    printdir('../')
    
if __name__ == '__main__':
    main()

.DS_Store
../.DS_Store
/Users/DSinmotion/Dropbox/DataScience/Github/datascience/python/python_basics/.DS_Store
python_basic_notebook
../python_basic_notebook
/Users/DSinmotion/Dropbox/DataScience/Github/datascience/python/python_basics/python_basic_notebook
python_data_analysis_notebook
../python_data_analysis_notebook
/Users/DSinmotion/Dropbox/DataScience/Github/datascience/python/python_basics/python_data_analysis_notebook
training_material_original
../training_material_original
/Users/DSinmotion/Dropbox/DataScience/Github/datascience/python/python_basics/training_material_original


####check file existence

In [2]:
import os
os.path.exists('../python_data_analysis_notebook')

True

#### Walking a directory

In [5]:
for dirpath, dirnames, filenames in os.walk(os.curdir):
    for fp in filenames:
        print os.path.abspath(fp)

/Users/szhang/Dropbox/datascience/Github/datascience/python/python_basics/python_basic_notebook/.project
/Users/szhang/Dropbox/datascience/Github/datascience/python/python_basics/python_basic_notebook/.pydevproject
/Users/szhang/Dropbox/datascience/Github/datascience/python/python_basics/python_basic_notebook/00_python_basics_ipython_tutorial.ipynb
/Users/szhang/Dropbox/datascience/Github/datascience/python/python_basics/python_basic_notebook/01_python_basics_intro.ipynb
/Users/szhang/Dropbox/datascience/Github/datascience/python/python_basics/python_basic_notebook/02_python_basics_data_structure.ipynb
/Users/szhang/Dropbox/datascience/Github/datascience/python/python_basics/python_basic_notebook/03_python_basics_exception.ipynb
/Users/szhang/Dropbox/datascience/Github/datascience/python/python_basics/python_basic_notebook/04_python_basics_files_io.ipynb
/Users/szhang/Dropbox/datascience/Github/datascience/python/python_basics/python_basic_notebook/05_python_basics_utilities.ipynb
/Use

---

## 1.2 `shutil`: high-level file operations

>The shutil provides useful file operations:
- `shutil.rmtree`: Recursively delete a directory tree.
- `shutil.move`: Recursively move a file or directory to another location.
- `shutil.copy`: Copy files or directories.

### #copy files

In [23]:
import shutil
shutil.copy('../python_data_analysis_noteboo', './python_data_analysis_notebook')

IOError: [Errno 2] No such file or directory: '../python_data_analysis_noteboo'

---

## 1.3 `commands` module

### #commands - running external processes

* (status, output) = commands.getstatusoutput(cmd) -- runs the command, waits for it to exit, and returns its status int and output text as a tuple. 
* The command is run with its standard output and standard error combined into the one output text. The status will be non-zero if the command failed. Since the standard-err of the command is captured, if it fails, we need to print some indication of what happened.
* output = commands.getoutput(cmd) -- as above, but without the status int.
* There is a commands.getstatus() but it does something else, so don't use it -- dumbest bit of method naming ever!
* If you want more control over the running of the sub-process, see the "popen2" module (http://docs.python.org/lib/module-popen2.html)
* There is also a simple os.system(cmd) which runs the command and dumps its output onto your output and returns its error code. This works if you want to run the command but do not need to capture its output into your python data structures.

In [25]:
import commands

## Given a dir path, run an external 'ls -l' on it --
## shows how to call an external program
def listdir(dir):
  cmd = 'ls -l ' + dir
  print "Command to run:", cmd   ## good to debug cmd before actually running it
  (status, output) = commands.getstatusoutput(cmd)
  if status:    ## Error case, print the command's output to stderr and exit
    sys.stderr.write(output)
    sys.exit(1)
  print output  ## Otherwise do something with the command's output

listdir('..')

Command to run: ls -l ..
total 0
drwxr-xr-x@ 24 DSinmotion  staff  816 16 Feb 18:54 python_basic_notebook
drwxr-xr-x@  4 DSinmotion  staff  136 15 Feb 22:47 python_data_analysis_notebook
drwxr-xr-x@  5 DSinmotion  staff  170 15 Feb 11:18 training_material_original


---

## 1.4 `glob`: pattern matching on files
> The glob module provides convenient file pattern matching.

In [20]:
# Find all files ending in .txt:
import glob
glob.glob('*.txt')

['junk.txt', 'test.txt']

---

## 1.5 `sys` module: system-specific info


In [21]:
import sys

### python versions

In [27]:
print sys.platform
print
print sys.version
print
print sys.prefix

darwin

2.7.8 |Anaconda 2.1.0 (x86_64)| (default, Aug 21 2014, 15:21:46) 
[GCC 4.2.1 (Apple Inc. build 5577)]

/Library/anaconda


### list of command line arguments passed to a python script

In [25]:
sys.argv

['-c',
 '-f',
 '/Users/DSinmotion/.ipython/profile_default/security/kernel-d2ce44d9-e262-4188-a28c-a3d8ac84a97a.json',
 '--pylab=inline',
 "--IPKernelApp.parent_appname='ipython-notebook'",
 '--profile-dir',
 '/Users/DSinmotion/.ipython/profile_default',
 '--parent=1']

### list of search path for modules
> initialized from PYTHONPATH

In [28]:
sys.path

['',
 '/Library/anaconda/lib/python27.zip',
 '/Library/anaconda/lib/python2.7',
 '/Library/anaconda/lib/python2.7/plat-darwin',
 '/Library/anaconda/lib/python2.7/plat-mac',
 '/Library/anaconda/lib/python2.7/plat-mac/lib-scriptpackages',
 '/Library/anaconda/lib/python2.7/lib-tk',
 '/Library/anaconda/lib/python2.7/lib-old',
 '/Library/anaconda/lib/python2.7/lib-dynload',
 '/Library/anaconda/lib/python2.7/site-packages',
 '/Library/anaconda/lib/python2.7/site-packages/PIL',
 '/Library/anaconda/lib/python2.7/site-packages/Sphinx-1.2.3-py2.7.egg',
 '/Library/anaconda/lib/python2.7/site-packages/runipy-0.1.1-py2.7.egg',
 '/Library/anaconda/lib/python2.7/site-packages/setuptools-5.8-py2.7.egg',
 '/Library/anaconda/lib/python2.7/site-packages/IPython/extensions']

## 1.6 `pickle`: easy persistence
> Useful to store arbitrary objects to a file. Not safe or fast!

In [32]:
import pickle
l = [1, None, 'Stan']
pickle.dump(l, file('data/test.pkl', 'w'))
pickle.load(file('data/test.pkl'))

[1, None, 'Stan']

----

# 2 - HTTP - urllib, urlparse

## HTTP - urlib, urlparse
* ufile = urllib.urlopen(url) -- returns a file like object for that url
* text = ufile.read() -- can read from it, like a file (readlines() etc. also work)
* info = ufile.info() -- the meta info for that request. info.gettype() is the mime time, e.g. 'text/html'
* baseurl = ufile.geturl() -- gets the "base" url for the request, which may be different from the original because of redirects
* urllib.urlretrieve(url, filename) -- downloads the url data to the given file path
* urlparse.urljoin(baseurl, url) -- given a url that may or may not be full, and the baseurl of the page it comes from, return a full url. Use geturl() above to provide the base url.

In [2]:
## Given a url, try to retrieve it. If it's text/html,
## print its base url and its text.
import urllib, urlparse

url = 'http://www.dmoz.org/Arts/Movies/'

ufile = urllib.urlopen(url)  ## get file-like object for url
print 'urlfile: ', ufile, '\n'

info = ufile.info()          ## meta-info about the url content
print info, '\n'

if info.gettype() == 'text/html':
    print 'base url: ' + ufile.geturl()
    
if info.gettype() == 'text/html':
    text = ufile.read()  ## read all its text
    #print text




urlfile:  <addinfourl at 4427116344 whose fp = <socket._fileobject object at 0x107fb9d50>> 

Date: Sun, 03 May 2015 04:36:55 GMT
Server: Apache
Set-Cookie: JSESSIONID=F3F9C93859BC7C93744DF8EE47ED5BA8; Path=/
Content-Language: en
Content-Length: 31936
Connection: close
Content-Type: text/html;charset=UTF-8
 

base url: http://www.dmoz.org/Arts/Movies/


In [4]:
## Given a url, try to retrieve it. If it's text/html,
## print its base url and its text.
import urllib, urlparse

def wget(url):
  ufile = urllib.urlopen(url)  ## get file-like object for url
  info = ufile.info()   ## meta-info about the url content
  print info
  if info.gettype() == 'text/html':
    print 'base url:' + ufile.geturl()
    text = ufile.read()  ## read all its text
    #print text

url = 'http://www.dmoz.org'
wget(url)

Date: Sun, 03 May 2015 04:38:17 GMT
Server: Apache
Set-Cookie: JSESSIONID=5577AD6FDADF91D06418790A23263145; Path=/
Content-Length: 17601
Connection: close
Content-Type: text/html;charset=UTF-8

base url:http://www.dmoz.org


In [6]:
## Version that uses try/except to print an error message if the
## urlopen() fails.
import urllib, urlparse


def wget2(url):
  try:
    ufile = urllib.urlopen(url)
    if ufile.info().gettype() == 'text/html':
      print ufile.read()
  except IOError:
    print 'problem reading url:', url

url = 'http://www.dmoz.org'
#wget2(url)
    

---

# 3 -  Datetime Module


In [3]:
import datetime
i = datetime.datetime.now()
 
print ("Current date & time = %s" % i)
print ("Date and time in ISO format = %s" % i.isoformat() )
print ("Current year = %s" % i.year)
print ("Current month = %s" % i.month)
print ("Current date (day) =  %s" % i.day)
print ("dd/mm/yyyy format =  %s/%s/%s" % (i.day, i.month, i.year) )
print ("Current hour = %s" %i.hour)
print ("Current minute = %s" %i.minute)
print ("Current second =  %s" %i.second)
print ("hh:mm:ss format = %s:%s:%s" % (i.hour, i.month, i.second) )

Current date & time = 2015-04-23 14:29:12.668797
Date and time in ISO format = 2015-04-23T14:29:12.668797
Current year = 2015
Current month = 4
Current date (day) =  23
dd/mm/yyyy format =  23/4/2015
Current hour = 14
Current minute = 29
Current second =  12
hh:mm:ss format = 14:4:12


------

# 4 - ArgParse 