In [6]:
# Import Necessary packages
import earthpy as et
from glob import glob
import os

### Download Files Using EarthPy

In [3]:
# Download data on average monthly temp for two California sites.
# get_data() fetches the archive from the URL, extracts it, and returns the
# path of the extracted folder (shown as this cell's output).
file_url = "https://ndownloader.figshare.com/files/21894528"
et.data.get_data(url=file_url)

Downloading from https://ndownloader.figshare.com/files/21894528
Extracted output to C:\Users\Gabriel\earth-analytics\data\earthpy-downloads\avg-monthly-temp-fahr


'C:\\Users\\Gabriel\\earth-analytics\\data\\earthpy-downloads\\avg-monthly-temp-fahr'

In [10]:
# Set working directory to earth-analytics (located under et.io.HOME)
os.chdir(os.path.join(et.io.HOME, "earth-analytics"))

# Create a path to the data folder.
# NOTE: this path is relative, so it only resolves correctly once the
# os.chdir() call above has run.
data_folder = os.path.join("data", "earthpy-downloads", "avg-monthly-temp-fahr")

### Glob in Python
- **os** helps manage and create specific paths
- **glob** is a powerful Python library that helps manage and filter through large datasets and pull out only what is of interest
- The glob() function uses Unix shell-style wildcard rules to match file and directory names
#### **Search for a specific folder or file**

In [11]:
# Without a wildcard, glob() matches only the path itself, so this returns a
# one-element list holding the folder path (not the folder's contents)
file_list = glob(data_folder)
file_list

['data\\earthpy-downloads\\avg-monthly-temp-fahr']

In [12]:
# glob() returns a list, even when there is only one match
type(file_list)

list

#### **Use glob() combined with os.path.join() to create lists of paths**

In [14]:
# Create a list containing the path to one specific file
glob(os.path.join(data_folder, "San-Diego", "San-Diego-1999-temp.csv"))

['data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-1999-temp.csv']

#### * Operator
* The * is a wildcard to search for items that have differences in their names
* Example: If you want every file in a directory to be returned to you, you can put a * at the end of a directory path

In [16]:
# Get the list of all files/dirs in data folder
glob(os.path.join(data_folder, "*"))

['data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma']

In [18]:
# Get list of all files/dirs in San-Diego folder
glob(os.path.join(data_folder, "San-Diego", "*"))

['data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-1999-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2000-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2001-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2002-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2003-temp.csv']

In [19]:
# Get only csv files: "*.csv" requires the ".csv" extension, whereas a bare
# "*csv" would also match any name that merely ends in the letters "csv"
glob(os.path.join(data_folder, "San-Diego", "*.csv"))

['data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-1999-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2000-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2001-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2002-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2003-temp.csv']

In [20]:
# Get only csv files with number 2 somewhere in the file name
glob(os.path.join(data_folder, "San-Diego", "*2*.csv"))

['data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2000-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2001-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2002-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2003-temp.csv']

In [21]:
# Get only csv files with number 1 in the file name
glob(os.path.join(data_folder, "San-Diego", "*1*.csv"))

['data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-1999-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2001-temp.csv']

Note that 2*.csv would only return files that start with the number 2

In [22]:
# Returns an empty list: the pattern is matched against the whole file name,
# and no file name in San-Diego begins with 2
glob(os.path.join(data_folder, "San-Diego", "2*.csv"))

[]

### **Recursive Searches**
* Search on files across multiple directories
- Use multiple * in a file path to indicate you want every file in all folders in a directory
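- The first * accesses all directories in the starting directory
- The second * matches the contents of each of those subdirectories
- Note: each * descends exactly one level; to search to any depth, use `**` together with `glob(..., recursive=True)`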

In [23]:
# Search through both site folders: the first * matches each site folder,
# the second * matches every item inside it
glob(os.path.join(data_folder, "*", "*"))

['data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-1999-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2000-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2001-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2002-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2003-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-1999-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-2000-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-2001-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-2002-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-2003-temp.csv']

### **Sorting glob Lists**
- The lists that are provided with glob() are not guaranteed to be sorted; the order depends on the operating system and file system


In [26]:
# Get the list of CSVs in Sonoma directory
sonoma_files = glob(os.path.join(data_folder, "Sonoma", "*.csv"))
sonoma_files

['data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-1999-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-2000-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-2001-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-2002-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-2003-temp.csv']

- If it's important for a list to have a certain order, sort the list returned by glob() with .sort() method for lists

In [28]:
# Sort the glob list in place (.sort() modifies the list and returns None)
sonoma_files.sort()
sonoma_files

['data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-1999-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-2000-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-2001-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-2002-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-2003-temp.csv']

In [30]:
# Another option for sorting lists: sorted() returns a new sorted list, so
# the glob() call can be wrapped directly
sonoma_files = sorted(glob(os.path.join(data_folder, "Sonoma", "*.csv")))
sonoma_files

['data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-1999-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-2000-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-2001-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-2002-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-2003-temp.csv']

In [32]:
# Practice: build the sorted list of CSV paths for the San-Diego site
san_diego_files = sorted(glob(os.path.join(data_folder, "San-Diego", "*.csv")))
san_diego_files

['data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-1999-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2000-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2001-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2002-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2003-temp.csv']

### Why Sort glob Lists? 

In [36]:
# Index into the unsorted glob result. The order glob() returns is not
# guaranteed, so which file sits at position 4 may differ between systems.
unsorted_sonoma = glob(os.path.join(data_folder, "Sonoma", "*"))
print(unsorted_sonoma[4])

data\earthpy-downloads\avg-monthly-temp-fahr\Sonoma\Sonoma-2003-temp.csv


In [37]:
# Sorting first makes the result deterministic: position 4 is always the
# fifth path in alphabetical order
sorted_sonoma = sorted(glob(os.path.join(data_folder, "Sonoma", "*")))
print(sorted_sonoma[4])

data\earthpy-downloads\avg-monthly-temp-fahr\Sonoma\Sonoma-2003-temp.csv


### Using Ranges
- [ ] to specify a range of characters to search for
- To search for all files with 2001 to 2003 in the name use *200[1-3]*

In [38]:
# Get files from 2001 to 2003
glob(os.path.join(data_folder, "*", "*200[1-3]*"))

['data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2001-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2002-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2003-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-2001-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-2002-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-2003-temp.csv']

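- It can also be used with letters, for example [d-q]
- You can use it with [2-7] but not [2-14], because each end of a range must be a single character (write the range without spaces, since a space inside the brackets is treated as one of the characters to match)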

In [40]:
# Incorrect attempt at the range 2001-2003: brackets hold a set of single
# characters, so [2001-2003] means "one of 0, 1, 2 or 3" and every file matches
glob(os.path.join(data_folder, "*", "*[2001-2003]*"))

['data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-1999-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2000-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2001-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2002-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2003-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-1999-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-2000-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-2001-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-2002-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-2003-temp.csv']

### ? Operator
- Works similarly to *, but matches exactly one character
- Useful when one character in the file name can vary and everything else stays the same
  

In [41]:
# Use ? operator for last value in year
glob(os.path.join(data_folder, "Sonoma", "*200?-temp.csv"))

['data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-2000-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-2001-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-2002-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-2003-temp.csv']

- **`?`** is not limited to one use. It's possible to replace more than one character

In [43]:
# Each ? stands for exactly one character, so 19?? matches "19" followed by
# any two characters (here: the 1999 files of both sites)
glob(os.path.join(data_folder, "*", "*19??-temp.csv"))

['data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-1999-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\Sonoma\\Sonoma-1999-temp.csv']

### Saving a glob Output to a Variable
- The list returned by **`glob`** can be saved to a variable name

In [45]:
# Keep the alphabetically ordered San-Diego paths under a name for later cells
sd_data = sorted(glob(os.path.join(data_folder, "San-Diego", "*")))

sd_data

['data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-1999-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2000-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2001-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2002-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2003-temp.csv']

### **os Advanced Functionality**
- `os` has a function **`os.path.normpath()`** that can be used to **clean up file paths**
- It takes out any unnecessary characters to make the path easier to read
- Make sure the path is properly formatted before using other `os` functions

In [46]:
# Example of normpath cleaning up paths: the doubled separators collapse to
# single ones and are converted to the operating system's separator
example_path = "home//user//example_dir"
os.path.normpath(example_path)

'home\\user\\example_dir'

- `os.path.commonpath()` takes a list of file paths and finds the longest directory path they all share
  


In [48]:
# Print the list of files
sd_data

['data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-1999-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2000-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2001-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2002-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2003-temp.csv']

In [49]:
# Get the shared directory from a list of files
os.path.commonpath(sd_data)

'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego'

- `os.path.basename()` finds the last section of a path and returns it. 

In [50]:
# Print normalized path
os.path.normpath(data_folder)

'data\\earthpy-downloads\\avg-monthly-temp-fahr'

In [51]:
# Get the last part of a file path with base name
os.path.basename(data_folder)

'avg-monthly-temp-fahr'

- `os.path.split()` splits a path into two parts and returns them as a `(rest, last part)` tuple:
1. the last part (index 1 of the result)
2. the rest (index 0 of the result)

In [53]:
# Get the last part of a file path and the rest of the path
os.path.split(os.path.normpath(data_folder))

('data\\earthpy-downloads', 'avg-monthly-temp-fahr')

- Use indexing on the result to get each piece of the split path

In [60]:
# Index 0 is the head: everything before the last component
os.path.split(os.path.normpath(data_folder))[0]

'data\\earthpy-downloads'

In [61]:
# Index 1 is the tail: the last component of the path
os.path.split(os.path.normpath(data_folder))[1]

'avg-monthly-temp-fahr'

### String Manipulation

In [62]:
# Separate a path into parts. Normalize first so that doubled or mixed
# separators cannot produce empty or unsplit pieces.
file_path_list = os.path.normpath(data_folder).split(os.sep)
file_path_list

['data', 'earthpy-downloads', 'avg-monthly-temp-fahr']

In [65]:
# Pick one component of the path by position (index 1 is the second folder)
file_path_list[1]

'earthpy-downloads'

In [66]:
# Print list of files
sd_data

['data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-1999-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2000-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2001-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2002-temp.csv',
 'data\\earthpy-downloads\\avg-monthly-temp-fahr\\San-Diego\\San-Diego-2003-temp.csv']

In [69]:
# Get the file name (without its directories) of the first path in the list
year_path = sd_data[0]
file_name = os.path.basename(year_path)
print(file_name)

San-Diego-1999-temp.csv


In [71]:
# Parse the year from the file name. Names follow <site>-<year>-temp.csv, so
# the year is the second-to-last "-"-separated field; a fixed slice such as
# [10:14] only works for the "San-Diego" prefix and breaks for "Sonoma".
year = file_name.split("-")[-2]
print(year)

1999
