In [1]:
import datetime
import os
import sys
import glob
import pprint

import numpy as np

from osgeo import gdal
gdal.AllRegister()
import pandas as pd


In [2]:
# Directory searched for the reference raster (*.tif) of the selected year
reference = r"Z:\ancillary\NLCD\Original\h27v09"

# Directory searched for the prediction raster (*.tif) of the selected year
prediction = r"Z:\eval_materials\sites.prep\h27v09_v2017.08.18\CoverMaps\CoverPrim_color"

# Output directory for the confusion-matrix .csv and .xlsx files (created by the driver cell if missing)
workdir = r"C:\Users\dzelenak\Workspace\h27v09\Analysis\data\original_classes"

# Year string matched against raster file names and embedded in the output file/sheet names
year = "1992"



In [3]:
def get_file(path, year):
    """
    Find the ``*.tif`` file in ``path`` whose file name contains ``year``.

    :param path: Directory searched (non-recursively) for ``*.tif`` files
    :type path: str
    :param year: Year to look for in each file's base name
    :type year: str or int
    :return: Full path of the matching file; when several files match, the
             last one in sorted order is returned
    :rtype: str
    :raises SystemExit: With a non-zero status when no file matches, after
                        printing the files that are available
    """

    # Accept an int year as well; the substring test below needs a string
    year = str(year)

    filelist = sorted(glob.glob(os.path.join(path, "*.tif")))

    # Match on the base name only so a year in a directory name is ignored
    templist = [item for item in filelist if year in os.path.basename(item)]

    if not templist:

        print("\nCould not locate a file for year {} in the given path {}\n".format(year, path))

        print("Available files in path are:\n")

        pprint.pprint(filelist)

        # A missing input is an error, so do not report success (status 0)
        sys.exit(1)

    # One match or many: the last sorted entry is the one to use
    return templist[-1]

In [4]:
def readData(refdir, preddir, y):
    """
    Load the reference and prediction rasters for one year.

    :param refdir: Directory holding the reference ``*.tif`` files
    :type refdir: str
    :param preddir: Directory holding the prediction ``*.tif`` files
    :type preddir: str
    :param y: Year used to select the file in each directory
    :type y: str
    :return: ``(refdata, preddata, classes, reffile, predfile)`` where the
             first two are the raster arrays, ``classes`` is the sorted list
             of unique values found in either array, and the last two are the
             paths of the files that were read
    :rtype: tuple
    :raises IOError: If GDAL cannot open one of the selected files
    """

    def _load(raster_path):
        # gdal.Open returns None on failure when GDAL exceptions are not
        # enabled; report that clearly instead of an AttributeError on None
        dataset = gdal.Open(raster_path, gdal.GA_ReadOnly)

        if dataset is None:

            raise IOError("GDAL could not open raster file: {}".format(raster_path))

        return dataset.ReadAsArray()

    reffile = get_file(refdir, y)

    predfile = get_file(preddir, y)

    print("The reference file is:\n\t{}\n".format(reffile))

    print("The prediction file is:\n\t{}\n".format(predfile))

    # Load raster data into arrays
    refdata = _load(reffile)

    preddata = _load(predfile)

    # Sorted union of the unique class values present in either raster
    ref_values = set(np.unique(refdata).tolist())

    pred_values = set(np.unique(preddata).tolist())

    classes = sorted(ref_values | pred_values)

    return refdata, preddata, classes, reffile, predfile

In [5]:
def compute_confusion_matrix(truth, predicted, classes):
    """
    Build a labelled confusion matrix with row and column totals.

    Cell ``[i, j]`` of the count area holds the number of positions where
    ``truth`` equals class ``i`` (rows) and ``predicted`` equals class ``j``
    (columns).

    :param truth: Reference class values
    :type truth: numpy.ndarray
    :param predicted: Predicted class values, same shape as ``truth``
    :type predicted: numpy.ndarray
    :param classes: Class values to tabulate; the list is not modified
    :type classes: list
    :return: Integer array of shape ``(len(classes) + 2, len(classes) + 2)``.
             Row 0 and column 0 hold the class values, the last row and
             column hold totals and are labelled with ``99999999``, and the
             top-left corner is 0.
    :rtype: numpy.ndarray
    :raises ValueError: If ``truth`` and ``predicted`` differ in shape
    """

    # 99999999 instead of 'total' because can't have strings in array of numbers
    total_label = 99999999

    truth = np.asarray(truth)

    predicted = np.asarray(predicted)

    if truth.shape != predicted.shape:

        raise ValueError("truth and predicted must have the same shape, got {} and {}".format(
            truth.shape, predicted.shape))

    # Work on a copy so the caller's class list is left untouched
    labels = list(classes)

    n = len(labels)

    total = float(n ** 2)

    # int64 so that totals over very large rasters cannot overflow
    counts = np.zeros((n, n), np.int64)

    print("generating %s by %s confusion matrix" % (n, n))

    counter = 1.0

    for col, c in enumerate(labels):  # iterate through columns

        # the prediction mask is the same for every row of this column
        predicted_is_c = predicted == c

        for row, r in enumerate(labels):  # iterate through rows

            current = counter / total * 100.0  # as percent

            # diagonal and off-diagonal cells are all plain co-occurrence counts
            counts[row, col] = np.count_nonzero(np.logical_and(truth == r, predicted_is_c))

            # show the percent complete
            sys.stdout.write("\r%s%% Done " % str(current)[:5])

            # needed to display the current percent complete
            sys.stdout.flush()

            counter += 1.0

        sys.stdout.flush()

    # Assemble the output: label row/column first, counts in the middle,
    # totals in the last row/column
    confusion_matrix = np.zeros((n + 2, n + 2), np.int64)

    confusion_matrix[1:n + 1, 1:n + 1] = counts

    confusion_matrix[1:n + 1, n + 1] = counts.sum(axis=1)  # row totals

    confusion_matrix[n + 1, 1:n + 1] = counts.sum(axis=0)  # column totals

    confusion_matrix[n + 1, n + 1] = counts.sum()  # grand total

    header = [0] + labels + [total_label]

    confusion_matrix[0, :] = header

    confusion_matrix[:, 0] = header

    return confusion_matrix

In [6]:
def get_fname(ref, y):
    """
    Build the base name (no extension) for the confusion-matrix output files.

    :param ref: Path of the reference raster; its base name must contain one
                of the known source names
    :type ref: str
    :param y: Year inserted into the output name
    :type y: str
    :return: Name of the form ``<source>_pyccdc_<year>_cnfmatrix``
    :rtype: str
    :raises ValueError: If the base name contains none of the known source names
    """

    names = ["nlcd", "NLCD", "trends", "Trendsblock", "Trends"]

    base = os.path.basename(ref)

    # First known source name that appears in the file name, if any
    name = next((n for n in names if n in base), None)

    if name is None:

        # Previously this fell through and failed on an unbound local variable
        raise ValueError(
            "Cannot derive an output name from '{}': expected one of {} in the file name".format(
                base, names))

    # All Trends spellings share one output prefix
    if name in ("Trendsblock", "Trends"):

        name = "trends"

    # Create a name for the confusion matrix .csv file
    return "{name}_pyccdc_{year}_cnfmatrix".format(name=name, year=y)
    

In [7]:
def write_to_csv(matrix, outdir, basename):
    """
    Write the confusion matrix to ``<outdir>/<basename>.csv``.

    Values are written as integers separated by single spaces, one matrix row
    per line. The ``99999999`` placeholder is written as ``Total``, but only
    where it acts as a label (first row and first column), so a count that
    happens to contain those digits is left intact. An existing output file
    is overwritten.

    :param matrix: 2-D confusion matrix with labels in row 0 and column 0
    :type matrix: numpy.ndarray
    :param outdir: Existing directory to write into
    :type outdir: str
    :param basename: Output file name without the ``.csv`` extension
    :type basename: str
    :return: None
    """

    lookfor = '99999999'

    out_path = os.path.join(outdir, '%s.csv' % basename)

    lines = []

    for row_number, row in enumerate(np.asarray(matrix).tolist()):

        cells = ['%d' % value for value in row]

        if row_number == 0:

            # header row: every cell is a label
            cells = ['Total' if cell == lookfor else cell for cell in cells]

        elif cells and cells[0] == lookfor:

            # data rows: only the first cell is a label
            cells[0] = 'Total'

        lines.append(' '.join(cells))

    # The text is built in memory, so no temporary file is created or cleaned up
    with open(out_path, 'w') as out:

        out.write(''.join(line + '\n' for line in lines))

    return None


In [8]:
def array_to_dataframe(matrix):
    """
    Convert a labelled confusion matrix into a DataFrame without empty classes.

    Row 0 and column 0 of ``matrix`` are used as labels. A row is dropped when
    every value after its label is zero; a column is then dropped when every
    value below its header is zero in the remaining rows. The ``99999999``
    label is replaced with ``"Total"`` in the index and in the columns,
    independently of each other. ``matrix`` itself is not modified.

    :param matrix: 2-D confusion matrix with labels in row 0 and column 0
    :type matrix: numpy.ndarray
    :return: DataFrame of the remaining counts, indexed by row label with one
             column per column label
    :rtype: pandas.DataFrame
    """

    total_label = 99999999

    data = np.asarray(matrix)

    # Remove empty rows (judged on everything after the label column)
    keep_rows = ~np.all(data[:, 1:] == 0, axis=1)

    rows_kept = data[keep_rows]

    # Remove empty columns (judged on everything below the header row,
    # after the empty rows are gone)
    keep_cols = ~np.all(rows_kept[1:, :] == 0, axis=0)

    trimmed = rows_kept[:, keep_cols]

    # Dataframe with empty rows and columns removed
    df = pd.DataFrame(trimmed[1:, 1:], index=trimmed[1:, 0], columns=trimmed[0, 1:])

    # Find and replace 99999999 with "Total"; the index and the columns are
    # handled separately so a missing label on one axis does not skip the other
    row_labels = df.index.tolist()

    if total_label in row_labels:

        df.index = ["Total" if label == total_label else label for label in row_labels]

    col_labels = df.columns.tolist()

    if total_label in col_labels:

        df.columns = ["Total" if label == total_label else label for label in col_labels]

    return df

In [9]:
def write_to_excel(loc, df, basename, y):
    """
    Write the confusion-matrix DataFrame to ``<loc>/<basename>.xlsx``.

    :param loc: Existing directory to write into
    :type loc: str
    :param df: DataFrame to export
    :type df: pandas.DataFrame
    :param basename: Output file name without the ``.xlsx`` extension
    :type basename: str
    :param y: Year, used as the worksheet name
    :type y: str
    :return: None
    """

    out_path = os.path.join(loc, "{name}.xlsx".format(name=basename))

    # Create a Pandas Excel writer using XlsxWriter as the engine. The context
    # manager saves and closes the workbook on exit; ExcelWriter.save() is no
    # longer available in pandas 2.x.
    with pd.ExcelWriter(out_path, engine="xlsxwriter") as writer:

        # Convert the dataframe to an XLsxWriter Excel object
        df.to_excel(writer, sheet_name="{year}".format(year=y))

    return None

In [10]:
# Driver cell: runs the whole comparison for the configured year.

# Make sure the output directory exists before anything is written to it
if not os.path.exists(workdir):
    
    os.makedirs(workdir)

# Read both rasters and collect the class values found in either of them
refData, predData, Classes, ref_file, pred_file = readData(reference, prediction, year)

# Cross-tabulate reference vs. prediction (labels and totals included)
cnf_mat = compute_confusion_matrix(refData, predData, Classes)

# Base name shared by the .csv and .xlsx outputs, derived from the reference file name
fname = get_fname(ref_file, year)

write_to_csv(cnf_mat, workdir, fname)

# Drop all-zero rows/columns and label the totals for display and Excel export
df = array_to_dataframe(cnf_mat)

write_to_excel(workdir, df, fname, year)

# Last expression of the cell: renders the table as the cell output
df

The reference file is:
	Z:\ancillary\NLCD\Original\h27v09\nlcd_1992_30meter_whole.tif

The prediction file is:
	Z:\eval_materials\sites.prep\h27v09_v2017.08.18\CoverMaps\CoverPrim_color\CoverPrim_1992.tif

generating 23 by 23 confusion matrix
100.0% Done 

Unnamed: 0,1,2,3,4,5,6,8,9,Total
11,17857,39023,10,68805,604053,8230,43,1351,739372
21,939550,171878,66,186230,992,276,3,4109,1303104
22,37227,440,0,341,89,1,0,149,38247
23,367299,44347,43,44787,4415,336,8,5035,466270
31,186,61,0,111,8,0,0,0,366
32,39324,12307,51,8597,408,2,5,1817,62511
33,84090,80521,373,133498,877,160,45,10356,309920
41,378156,911260,761,8566509,18143,16943,617,21950,9914339
42,59170,81095,24,1043950,10869,4538,168,7392,1207206
43,157147,381108,51,2483375,7134,5503,271,5144,3039733
