In [None]:
import multiprocessing as mp

def my_func(x):
  """Print the worker process handling this call, then return ``x`` raised to the power ``x``."""
  worker = mp.current_process()
  print(worker)
  power = x**x
  return power

def main():
  """Map ``my_func`` over two batches of integers in a worker pool and print both result lists."""
  # The context manager shuts the workers down when the block exits.
  # Previously the pool was never closed, so every run of this cell left
  # one idle process per CPU behind.
  with mp.Pool(mp.cpu_count()) as pool:
    # pool.map blocks until all results are back, so both lists are
    # complete before the pool is torn down.
    result = pool.map(my_func, [4,2,3,5,3,2,1,2])
    result_set_2 = pool.map(my_func, [4,6,5,4,6,3,23,4,6])

  print(result)
  print(result_set_2)

if __name__ == "__main__":
  main()

In [None]:
# Computes the mean CHIRTS-max temperature (Tmax) for each urban polygon, using a raster with the polygon IDs burned in
# By Cascade Tuholske 2019-08-20

# Dependencies
import rasterio 
import numpy as np
import pandas as pd
import geopandas as gpd
from rasterstats import zonal_stats
from rasterio import features
import os
import xarray as xr
import fnmatch
import time
import multiprocessing as mp

In [None]:
# LOCAL TEST
# Hard-coded local paths used by the cells below. Every path ends in '/'
# because file names are appended to it with plain string concatenation.
CHIRT_DIR = '/Users/cascade/Github/UrbanHeat/data/test_in/' # <<--- path to loop through
# Directory holding the shapefile that is read with geopandas
SHP_DIR = '/Users/cascade/Github/PopRaster/data/raw/JRC/ghs-ucdb/'
# Directory holding the polygon-ID raster
POLY_RST_DIR = '/Users/cascade/Github/PopRaster/data/interim/'
# Directory the per-directory CSV results are written to
DATA_OUT = '/Users/cascade/Github/UrbanHeat/data/test_out/'

In [None]:
# Open the polygon raster
# NOTE(review): this dataset handle is never closed in the visible code.
polyRst_fn = 'GHS_UCDB_Raster_Raster_touched.tif'
polyRst = rasterio.open(POLY_RST_DIR+polyRst_fn)

# Read the polygon shapefile with geopandas read_file
shp_fn = 'GHS_STAT_UCDB2015MT_GLOBE_R2019A_V1_0.shp'
shps = gpd.read_file(SHP_DIR+shp_fn)

# Base name of the output files, change as needed
fn_out = 'GHS-CHIRT-MONTHLY'  

# Isolate the shapefile columns that the results are merged back onto later
df_ghs = gpd.GeoDataFrame()

df_ghs['ID_HDC_G0'] = shps.ID_HDC_G0
df_ghs['CTR_MN_NM'] = shps.CTR_MN_NM

# Wrap band 1 of the polygon raster in an xarray DataArray with dims (y, x)
polyRst_da = xr.DataArray(polyRst.read(1), dims = ['y', 'x'])




In [None]:
# Serial run: for every directory under CHIRT_DIR, compute the mean CHIRTS
# value per GHS polygon ID for each .tif file, merge the results onto the
# polygon table (one column per file date) and write one CSV per directory.
start = time.time()

# Loop through dirs 
for dirpath, dirnames, files in os.walk(CHIRT_DIR):
        # Set dir name for writing files (the part of the path after CHIRT_DIR)
        dir_year = dirpath.split(CHIRT_DIR)[1]

        # make a copy of the ghs polys, reset for each dir
        df_merge = df_ghs.copy()
        
        for fn in files:

                # find all the tif files
                if fn.endswith('.tif'):
                
                        # TODO: build a metadata check into the routine and raise an error on mismatch

                        # Get the date of each chirt file
                        date = (fn.split('CHIRTSmax.')[1].split('.tif')[0])
                        print(dir_year)
                        print(date)
                        
                        # Open CHIRT data and read band 1 into memory. The context
                        # manager closes the file once the array has been read;
                        # previously one handle per raster stayed open for the
                        # whole run.
                        with rasterio.open(dirpath+'/'+fn) as tempRst:
                                tempRst_da = xr.DataArray(tempRst.read(1), dims = ['y', 'x']) # y and x are our 2-d labels
                        
                        # Make xarray dataset holding the polygon IDs and the temperatures
                        ds = xr.Dataset(data_vars = 
                                {'ghs' : (['y', 'x'], polyRst_da),
                                'temp' : (['y', 'x'], tempRst_da),})
                        
                        # UPDATED 2019-08-19 Mask the CHIRTS PIXELS FIRST, THEN GHS
                        # Mask pixels whose temperature equals -9999
                        # NOTE(review): -9999 is presumably the CHIRTS no-data value - confirm
                        ds_mask = ds.where(ds.temp != -9999, drop = False) #<<<<------ need to double check this
                        
                        # Mask pixels for both ghs and chirts where ghs cities are not present
                        ds_mask = ds_mask.where(ds_mask.ghs > 0, drop = False)
                        
                        # Group by polygon ID and average the temperature
                        # NOTE(review): newer xarray releases spell xr.ALL_DIMS as `...` - confirm installed version
                        avg = ds_mask.groupby('ghs').mean(xr.ALL_DIMS)
                        
                        # turn GHS IDS and avg. CHIRTMax values into 1-D numpy arrays of equal length
                        avg_ID = np.array(avg.ghs)
                        avg_temp = np.array(avg.temp)
                        
                        print(len(avg_ID))
                        print(len(avg_temp))
                                
                        # turn chirt max and IDS into a DF
                        df_avg = pd.DataFrame()
                        df_avg[date] = avg_temp
                        df_avg['ID_HDC_G0'] = avg_ID
                        
                        # merge the df: adds one column named after the file date
                        df_merge = df_merge.merge(df_avg, on='ID_HDC_G0', how = 'outer')

        # write files out for each dir        
        # df_merge.to_file(DATA_OUT+fn_out+'_'+dir_year+'.shp') # shp out
        df_merge.to_csv(DATA_OUT+fn_out+'_'+dir_year+'.csv') # csv out

print('DONE ! ! !')
end = time.time()
print(end - start)

In [None]:
# Loop through dirs 
def chirt_ghs (dir_name):
    """Compute per-polygon mean CHIRTS values for every directory under ``dir_name``.

    For each directory visited by ``os.walk(dir_name)`` the .tif files are
    read one by one, averaged per GHS polygon ID and merged onto a copy of
    ``df_ghs`` (one column per file date). One CSV per directory is written
    to ``DATA_OUT``.

    Relies on the notebook-level names ``df_ghs``, ``polyRst_da``,
    ``DATA_OUT`` and ``fn_out``. Returns nothing.
    """
    for dirpath, dirnames, files in os.walk(dir_name):
        # Set dir name for writing files (the part of the path after dir_name)
        dir_year = dirpath.split(dir_name)[1]

        # make a copy of the ghs polys, reset for each dir
        df_merge = df_ghs.copy()

        for fn in files:

            # find all the tif files
            if fn.endswith('.tif'):

                # TODO: build a metadata check into the routine and raise an error on mismatch

                # Get the date of each chirt file
                date = (fn.split('CHIRTSmax.')[1].split('.tif')[0])
                print(dir_year)
                print(date)

                # Open CHIRT data and read band 1 into memory. The context
                # manager closes the file once the array has been read;
                # previously one handle per raster stayed open.
                with rasterio.open(dirpath+'/'+fn) as tempRst:
                    tempRst_da = xr.DataArray(tempRst.read(1), dims = ['y', 'x']) # y and x are our 2-d labels

                # Make xarray dataset holding the polygon IDs and the temperatures
                ds = xr.Dataset(data_vars = 
                        {'ghs' : (['y', 'x'], polyRst_da),
                        'temp' : (['y', 'x'], tempRst_da),})

                # UPDATED 2019-08-19 Mask the CHIRTS PIXELS FIRST, THEN GHS
                # Mask pixels whose temperature equals -9999
                # NOTE(review): -9999 is presumably the CHIRTS no-data value - confirm
                ds_mask = ds.where(ds.temp != -9999, drop = False) #<<<<------ need to double check this

                # Mask pixels for both ghs and chirts where ghs cities are not present
                ds_mask = ds_mask.where(ds_mask.ghs > 0, drop = False)

                # Group by polygon ID and average the temperature
                # NOTE(review): newer xarray releases spell xr.ALL_DIMS as `...` - confirm installed version
                avg = ds_mask.groupby('ghs').mean(xr.ALL_DIMS)

                # turn GHS IDS and avg. CHIRTMax values into 1-D numpy arrays of equal length
                avg_ID = np.array(avg.ghs)
                avg_temp = np.array(avg.temp)

                print(len(avg_ID))
                print(len(avg_temp))

                # turn chirt max and IDS into a DF
                df_avg = pd.DataFrame()
                df_avg[date] = avg_temp
                df_avg['ID_HDC_G0'] = avg_ID

                # merge the df: adds one column named after the file date
                df_merge = df_merge.merge(df_avg, on='ID_HDC_G0', how = 'outer')

        # write files out for each dir        
        # df_merge.to_file(DATA_OUT+fn_out+'_'+dir_year+'.shp') # shp out
        df_merge.to_csv(DATA_OUT+fn_out+'_'+dir_year+'ParPro.csv') # csv out

    print('DONE ! ! !')



In [None]:
def simple (dir_name):
    """Walk ``dir_name`` and, for every .tif file found, print the directory suffix and the file name.

    The suffix is the part of the visited directory's path that follows
    ``dir_name``.
    """
    for current_dir, _subdirs, file_names in os.walk(dir_name):
        # Label for this directory, computed once per directory
        suffix = current_dir.split(dir_name)[1]

        tif_names = [name for name in file_names if name.endswith('.tif')]
        for tif_name in tif_names:
            print(suffix)
            print(tif_name)

In [None]:
start = time.time()

def main():
  """Run ``simple`` over the sub-directories of ``CHIRT_DIR`` using three worker processes."""
  # pool.map needs an iterable of directory paths. Passing the CHIRT_DIR
  # string itself made map iterate over its characters, so the workers
  # walked '/', 'U', 's', ... instead of the data directories.
  sub_dirs = [os.path.join(CHIRT_DIR, name) for name in sorted(os.listdir(CHIRT_DIR))]
  sub_dirs = [path for path in sub_dirs if os.path.isdir(path)]
  if not sub_dirs:
    # No sub-directories: fall back to scanning CHIRT_DIR itself
    sub_dirs = [CHIRT_DIR]

  # The context manager shuts the workers down once the map has finished
  with mp.Pool(3) as pool: # mp.cpu_count() using 3 CPUS
    pool.map(simple, sub_dirs)


if __name__ == "__main__":
  main()

end = time.time()
print(end - start)

In [None]:
import time
import multiprocessing as mp
import os
import rasterio
from multiprocessing import Process, Queue


start = time.time()

CHIRT_DIR = '/Users/cascade/Github/UrbanHeat/data/test_in/' # <<--- path to loop through

def process(fn):
    """Print the date encoded in a CHIRTSmax file name and the raster's metadata.

    fn: path to a ``CHIRTSmax.<date>.tif`` file. The full path is expected so
        the function no longer depends on a ``dirpath`` variable defined
        outside of it.
    """
    date = (os.path.basename(fn).split('CHIRTSmax.')[1].split('.tif')[0])
    print(date)

    # Open CHIRT Data and print its metadata; the handle is closed on exit
    with rasterio.open(fn, 'r') as dst:
        print(dst.meta)


# Not used by the processes below; kept so the name stays defined for other cells
queue = Queue()

# Collect the full path of every .tif file under CHIRT_DIR
tif_paths = [os.path.join(tif_dir, tif_name)
             for tif_dir, _subdirs, tif_names in os.walk(CHIRT_DIR)
             for tif_name in tif_names
             if tif_name.endswith('.tif')]

# One process per file. Each process must be given its file as an argument:
# with args=() the target was called without `fn` and every child failed.
processes = [Process(target=process, args=(tif_path,)) for tif_path in tif_paths]

for p in processes:
    p.start()

for p in processes:
    p.join()

# # p = multiprocessing.Pool()

# pool = mp.Pool(mp.cpu_count())

# # for dirpath, dirnames, files in os.walk(CHIRT_DIR):
# #         # Set dir name for writing files
# #         dir_year = dirpath.split(CHIRT_DIR)[1]

# #         # make a copy of the ghs polys, reset for each dir
# #         df_merge = df_ghs.copy()
        
# for dirpath, dirnames, files in os.walk(CHIRT_DIR):
#     for fn in files:
#         if fn.endswith('.tif'):
#             print(type([fn]))
# #             fn_in = files+fn
# #             pool.map(process, [fn_in])


# # for dirpath, dirnames, files in os.listdir(CHIRT_DIR):
# #         for fn in files:
# #             if fn.endswith('.tif'):
# #                 pool.map(process, [fn])

end = time.time()
print(end - start)

In [None]:
from multiprocessing import Process

def print_func(continent='Asia'):
    """Print a one-line message naming ``continent`` (defaults to 'Asia')."""
    label = 'The name of continent is : '
    print(label, continent)

if __name__ == "__main__":  # only run when executed as the main module
    names = ['America', 'Europe', 'Africa']

    # First child: no arguments, so print_func falls back to its default
    proc = Process(target=print_func)
    proc.start()
    procs = [proc]

    # One further child per continent name
    for name in names:
        proc = Process(target=print_func, args=(name,))
        proc.start()
        procs.append(proc)

    # Wait for all children to finish
    for proc in procs:
        proc.join()

In [None]:
from multiprocessing import Pool

import time

work = (["A", 5], ["B", 2], ["C", 1], ["D", 3])


def work_log(work_data):
    """Announce a job, sleep for its duration, then announce that it finished.

    work_data: indexable pair of (job label, seconds to wait).
    """
    label, delay = work_data[0], work_data[1]
    print(" Process %s waiting %s seconds" % (label, delay))
    time.sleep(int(delay))
    print(" Process %s Finished." % label)


def pool_handler():
    """Run ``work_log`` over every entry of ``work`` using two worker processes."""
    # The context manager shuts the workers down once map has returned;
    # previously the pool was never closed and its processes were leaked.
    with Pool(2) as p:
        p.map(work_log, work)


if __name__ == '__main__':
    pool_handler()

In [None]:
import os
 
from multiprocessing import Process
 
def doubler(number):
    """
    Double ``number`` and print the result together with the id of the
    process that did the work. Intended as a Process target.
    """
    pid = os.getpid()
    doubled = number * 2
    message = '{0} doubled to {1} by process id: {2}'.format(number, doubled, pid)
    print(message)
 
if __name__ == '__main__':
    numbers = [5, 10, 15, 20, 25]
    procs = []

    # One child process per number. The position in the list is not needed,
    # so iterate over the values directly instead of through enumerate().
    for number in numbers:
        proc = Process(target=doubler, args=(number,))
        procs.append(proc)
        proc.start()

    # Wait for every child to finish
    for proc in procs:
        proc.join()

In [10]:
from multiprocessing import Pool
 
def doubler(number):
    """Return ``number`` multiplied by two."""
    doubled = number * 2
    return doubled
 
if __name__ == '__main__':
    numbers = [5, 10, 20]
    # The context manager shuts the three workers down after the results are
    # printed; previously the pool was never closed.
    with Pool(processes=3) as pool:
        print(pool.map(doubler, numbers))

[10, 20, 40]


In [None]:
# Rebinds CHIRT_DIR to a single sub-directory; cells run after this one see the new value
CHIRT_DIR = '/Users/cascade/Github/UrbanHeat/data/test_in/year1/' # <<--- path to loop through

# Print the name of every entry in that directory
for fn in os.listdir(CHIRT_DIR):
    print(fn)

In [None]:
from multiprocessing import Process

def print_func(dir_list): 
    """Print the name of every entry in the directory ``dir_list``."""
    entries = os.listdir(dir_list)
    for entry in entries:
        print(entry)

# args must be a tuple of positional arguments. A bare string is unpacked
# into one argument per character, so the child failed with a TypeError.
proc = Process(target=print_func, args=(CHIRT_DIR,))
proc.start()
# Wait for the child so its output appears before the cell finishes
proc.join()




In [None]:
#### HERE START HERE ! ! ! ####
# https://stackoverflow.com/questions/41992810/python-multiprocessing-across-different-files

CHIRT_DIR = '/Users/cascade/Github/UrbanHeat/data/test_in/year1' # <<--- path to loop through

from multiprocessing import Pool, Queue
from queue import Empty
import time 

start = time.time()

def crunch_file(queue):
    """Drain file names from ``queue`` and print the date encoded in each one.

    Used as the pool initializer, so every worker process runs it once.
    Returns when no item arrives within a short timeout.
    """
    while True:
        try:
            # A short blocking get instead of queue.empty(): items put by the
            # parent are flushed in the background, so empty() can report
            # True while data is still on its way.
            fn = queue.get(timeout=0.1)
        except Empty:
            break
        date = (fn.split('CHIRTSmax.')[1].split('.tif')[0])
        print(date)

# Create the queue here so the cell does not depend on a `queue` left over
# from another cell, and fill it completely before any worker starts.
queue = Queue()
for root, dirnames, filenames in os.walk(CHIRT_DIR):
    for fn in filenames:
        # Only .tif files carry a date; other names would break the split in crunch_file
        if fn.endswith('.tif'):
            queue.put(fn)

# One pool for all files (previously a new pool was created for every file
# and only the last one was joined).
pool = Pool(None, crunch_file, (queue,))
pool.close() # signal that we won't submit any more tasks to pool
pool.join() # wait until all processes are done

end = time.time()
print(end-start)

In [None]:
# Rebinds CHIRT_DIR to the top-level test directory
CHIRT_DIR = '/Users/cascade/Github/UrbanHeat/data/test_in/' # <<--- path to loop through
# Print every directory that os.walk visits under CHIRT_DIR
for root, dirnames, filenames in os.walk(CHIRT_DIR):
    print(root)
#    print(dirnames)
#     print(filenames)

In [None]:
# os.walk returns a lazy generator, so this cell shows the generator object
# itself; iterate over it to see the directories and files.
os.walk(CHIRT_DIR)

In [None]:
from glob import glob
# List the entries directly inside CHIRT_DIR (not recursive); as the last
# expression of the cell the resulting list is displayed.
glob(CHIRT_DIR+'/*')

In [None]:
from multiprocessing.pool import Pool
from multiprocessing import JoinableQueue as Queue
import os

def explore_path(path):
    """List ``path``: record its non-directory entries in a text file and return its sub-directories.

    The text file is created relative to the current working directory. Its
    name is ``path`` with every ``os.sep`` replaced by ``_`` plus ``.txt``,
    and it holds one entry name per line.

    Returns a list with the full path of every sub-directory of ``path``.
    Raises OSError if ``path`` cannot be listed or the file cannot be written.
    """
    directories = []
    nondirectories = []
    for filename in os.listdir(path):
        fullname = os.path.join(path, filename)
        if os.path.isdir(fullname):
            directories.append(fullname)
        else:
            nondirectories.append(filename)
    outputfile = path.replace(os.sep, '_') + '.txt'
    with open(outputfile, 'w') as f:
        for filename in nondirectories:
            # Python 3 spelling of the Python 2 statement `print >> f, filename`,
            # which raises a TypeError under Python 3.
            print(filename, file=f)
    return directories

def parallel_worker():
    """Worker loop: take a path from ``unsearched``, explore it and queue its sub-directories.

    Runs until the process is stopped from outside. Every path taken from the
    queue is marked done, including paths that could not be explored, so that
    ``unsearched.join()`` in the parent can return.
    """
    while True:
        path = unsearched.get()
        try:
            for newdir in explore_path(path):
                unsearched.put(newdir)
        except OSError as err:
            # An unreadable directory must not kill the worker: report it and move on
            print('Skipping %s: %s' % (path, err))
        finally:
            # Without this, a failed path would leave join() waiting forever
            unsearched.task_done()

# acquire the list of paths
# Read the whitespace-separated list of start paths. The file object itself
# has no split(); its contents must be read first.
with open('paths.txt') as f:
    paths = f.read().split()

# Queue of directories still to be explored, shared with the workers
unsearched = Queue()
for path in paths:
    unsearched.put(path)

# Start five workers that pull paths from the queue
pool = Pool(5)
for i in range(5):
    pool.apply_async(parallel_worker)

# Block until every queued path has been marked done
unsearched.join()

# The workers loop forever, so they have to be stopped explicitly once the
# queue has been fully processed.
pool.terminate()
pool.join()
print('Done')

In [None]:
# `data` is overwritten on every iteration, so after the loop it holds the
# file names of the last directory visited by os.walk only.
for root, dirnames, filenames in os.walk(CHIRT_DIR):
    data = filenames

In [None]:
data

# HERE

In [None]:
# Loop through dirs

# helper function 
def temp_ghs(dir_nm):
    """Compute per-polygon mean CHIRTS values for the .tif files directly inside ``dir_nm``.

    Each file is averaged per GHS polygon ID and merged onto a copy of
    ``df_ghs`` (one column per file date). The merged table is written to
    ``DATA_OUT`` as ``<fn_out>_<directory name>ParPro.csv``.

    Relies on the notebook-level names ``df_ghs``, ``polyRst_da``,
    ``DATA_OUT`` and ``fn_out``. Returns nothing.
    """
    # Label for prints and the output file: the directory's own name.
    # Derived from the argument instead of a `dir_year` left over from another cell.
    dir_year = os.path.basename(os.path.normpath(dir_nm))

    # Start from a fresh copy of the polygon table; the merge below needs an
    # initial frame (it used to reference df_merge before assignment).
    df_merge = df_ghs.copy()

    for fn in os.listdir(dir_nm):

        # find all the tif files
        if fn.endswith('.tif'):

            # TODO: build a metadata check into the routine and raise an error on mismatch

            # Get the date of each chirt file
            date = (fn.split('CHIRTSmax.')[1].split('.tif')[0])
            print(dir_year)
            print(date)

            # Open CHIRT data from this directory and read band 1 into memory;
            # the context manager closes the file afterwards.
            with rasterio.open(os.path.join(dir_nm, fn)) as tempRst:
                tempRst_da = xr.DataArray(tempRst.read(1), dims = ['y', 'x']) # y and x are our 2-d labels

            # Make xarray dataset holding the polygon IDs and the temperatures
            ds = xr.Dataset(data_vars = 
                    {'ghs' : (['y', 'x'], polyRst_da),
                    'temp' : (['y', 'x'], tempRst_da),})

            # UPDATED 2019-08-19 Mask the CHIRTS PIXELS FIRST, THEN GHS
            # Mask pixels whose temperature equals -9999
            # NOTE(review): -9999 is presumably the CHIRTS no-data value - confirm
            ds_mask = ds.where(ds.temp != -9999, drop = False) #<<<<------ need to double check this

            # Mask pixels for both ghs and chirts where ghs cities are not present
            ds_mask = ds_mask.where(ds_mask.ghs > 0, drop = False)

            # Group by polygon ID and average the temperature
            # NOTE(review): newer xarray releases spell xr.ALL_DIMS as `...` - confirm installed version
            avg = ds_mask.groupby('ghs').mean(xr.ALL_DIMS)

            # turn GHS IDS and avg. CHIRTMax values into 1-D numpy arrays of equal length
            avg_ID = np.array(avg.ghs)
            avg_temp = np.array(avg.temp)

            print(len(avg_ID))
            print(len(avg_temp))

            # turn chirt max and IDS into a DF
            df_avg = pd.DataFrame()
            df_avg[date] = avg_temp
            df_avg['ID_HDC_G0'] = avg_ID

            # merge the df: adds one column named after the file date
            df_merge = df_merge.merge(df_avg, on='ID_HDC_G0', how = 'outer')

    # Write the result for this directory. This used to sit at module level,
    # where it wrote whatever df_merge an earlier cell had left behind.
    # df_merge.to_file(DATA_OUT+fn_out+'_'+dir_year+'.shp') # shp out
    df_merge.to_csv(DATA_OUT+fn_out+'_'+dir_year+'ParPro.csv') # csv out

    print('DONE ! ! !')

In [8]:
#### HERE START HERE ! ! ! ####
# https://stackoverflow.com/questions/41992810/python-multiprocessing-across-different-files

CHIRT_DIR = '/Users/cascade/Github/UrbanHeat/data/test_in/' # <<--- path to loop through

from multiprocessing import Pool, Queue, Process
import time 
import os
from glob import glob

start = time.time()

# helper function 
def open_file(dir_nm):
    """Print the date embedded in the name of every file in ``dir_nm``.

    The date is the text between 'CHIRTSmax.' and '.tif' in the file name.
    """
    for entry in os.listdir(dir_nm):
        after_prefix = entry.split('CHIRTSmax.')[1]
        date = after_prefix.split('.tif')[0]
        print(date)

# Every entry directly inside CHIRT_DIR
dir_list = glob(CHIRT_DIR+"/*")

# Start all workers first and wait afterwards. Joining inside the start loop
# made each process finish before the next one began, so nothing actually
# ran in parallel.
procs = []
for dir_nm in (dir_list):
    proc = Process(target=open_file, args=(dir_nm,))
    proc.start()
    procs.append(proc)

for proc in procs:
    proc.join()

end = time.time()
print(end-start)

1983.01
1983.03
1983.02
1983.06
1983.05
1983.04
1983.09
1983.08
1983.12
1983.07
1983.11
1983.10
0.07575726509094238


In [9]:
# Serial baseline: call open_file on every directory in the current process
# and print the elapsed wall-clock time in seconds.
start = time.time()
for dir_nm in (dir_list):
    open_file(dir_nm)

end = time.time()
print(end-start)

1983.01
1983.03
1983.02
1983.06
1983.05
1983.04
1983.09
1983.08
1983.12
1983.07
1983.11
1983.10
0.0008687973022460938


In [None]:
import os
 
from multiprocessing import Process
 
def doubler(number):
    """
    Double ``number`` and print the result together with the id of the
    process that did the work. Intended as a Process target.
    """
    pid = os.getpid()
    doubled = number * 2
    message = '{0} doubled to {1} by process id: {2}'.format(number, doubled, pid)
    print(message)
 

numbers = [5, 10, 15, 20, 25]
procs = []

# One child process per number. The position in the list is not needed, so
# iterate over the values directly instead of through enumerate().
for number in numbers:
    proc = Process(target=doubler, args=(number,))
    procs.append(proc)
    proc.start()

# Wait for every child to finish
for proc in procs:
    proc.join()